Example #1
def _discover_subdatasets_recursively(
        discovered, top, trace, recursion_limit):
    # this beast walks the directory tree from a given `top` directory
    # and discovers valid repos that are scattered around, regardless
    # of whether they are already subdatasets or not
    # `trace` must be a list that has at least one element (the base
    # dataset)
    if recursion_limit is not None and len(trace) > recursion_limit:
        return
    if not isdir(top):
        return
    if not op.islink(top) and GitRepo.is_valid_repo(top):
        if top in discovered:
            # this was found already, assume everything beneath it too
            return
        discovered[top] = dict(
            path=top,
            # and its content
            process_content=True,
            type='dataset',
            parentds=trace[-1])
        # new node in the trace down
        trace = trace + [top]
    for path in listdir(top):
        path = opj(top, path)
        if not isdir(path):
            continue
        # next level down
        _discover_subdatasets_recursively(
            discovered, path, trace, recursion_limit)
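As a hedged aside, the same repository-discovery pattern can be sketched with just GitRepo.is_valid_repo and the standard library; /tmp/topds below is a hypothetical directory, and DataLad is assumed to be installed:

import os
import os.path as op

from datalad.support.gitrepo import GitRepo


def find_repos(top):
    # yield every directory under `top` that is a valid Git/git-annex repo
    for root, dirs, _files in os.walk(top):
        # never descend into .git directories
        dirs[:] = [d for d in dirs if d != '.git']
        if not op.islink(root) and GitRepo.is_valid_repo(root):
            yield root


for repo_path in find_repos('/tmp/topds'):
    print(repo_path)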
Example #2
def _parse_git_submodules(ds, paths):
    """All known ones with some properties"""
    if not (ds.pathobj / ".gitmodules").exists():
        # easy way out. if there is no .gitmodules file
        # we cannot have (functional) subdatasets
        return

    if paths:
        paths = [
            p.relative_to(ds.pathobj) for p in paths
            if ds.pathobj == p or ds.pathobj in p.parents
        ]
        if not paths:
            # we had path constraints, but none matched this dataset
            return
    for path, props in iteritems(
            ds.repo.get_content_info(paths=paths,
                                     ref=None,
                                     untracked='no',
                                     eval_file_type=False)):
        if props.get('type', None) != 'dataset':
            continue
        if ds.pathobj != ds.repo.pathobj:
            props['path'] = ds.pathobj / path.relative_to(ds.repo.pathobj)
        else:
            props['path'] = path
        if not path.exists() or not GitRepo.is_valid_repo(text_type(path)):
            props['state'] = 'absent'
        # TODO kill this after some time. We used to do custom things here
        # and gitshasum was called revision. Be nice and duplicate for a bit
        # wipe out when patience is gone
        props['revision'] = props['gitshasum']
        yield props
Example #3
def _discover_subdatasets_recursively(discovered, top, trace, recursion_limit):
    # this beast walks the directory tree from a given `top` directory
    # and discovers valid repos that are scattered around, regardless
    # of whether they are already subdatasets or not
    # `trace` must be a list that has at least one element (the base
    # dataset)
    if recursion_limit is not None and len(trace) > recursion_limit:
        return
    if not isdir(top):
        return
    if not op.islink(top) and GitRepo.is_valid_repo(top):
        if top in discovered:
            # this was found already, assume everything beneath it too
            return
        discovered[top] = dict(
            path=top,
            # and its content
            process_content=True,
            type='dataset',
            parentds=trace[-1])
        # new node in the trace down
        trace = trace + [top]
    for path in listdir(top):
        path = opj(top, path)
        if not isdir(path):
            continue
        # next level down
        _discover_subdatasets_recursively(discovered, path, trace,
                                          recursion_limit)
Example #4
def test_submodule_deinit(path):
    from datalad.support.annexrepo import AnnexRepo

    top_repo = AnnexRepo(path, create=False)
    eq_({'subm 1', '2'}, {s.name for s in top_repo.get_submodules()})
    # note: here init=True is ok, since we are using it just for testing
    with swallow_logs(new_level=logging.WARN) as cml:
        top_repo.update_submodule('subm 1', init=True)
        assert_in('Do not use update_submodule with init=True', cml.out)
    top_repo.update_submodule('2', init=True)

    # ok_(all([s.module_exists() for s in top_repo.get_submodules()]))
    # TODO: old assertion above if non-bare? (can't use "direct mode" in test_gitrepo)
    # Alternatively: New testrepo (plain git submodules) and have a dedicated
    # test for annexes in addition
    ok_(all([GitRepo.is_valid_repo(op.join(top_repo.path, s.path))
             for s in top_repo.get_submodules()]))

    # modify submodule:
    with open(op.join(top_repo.path, 'subm 1', 'file_ut.dat'), "w") as f:
        f.write("some content")

    assert_raises(CommandError, top_repo.deinit_submodule, 'sub1')

    # using force should work:
    top_repo.deinit_submodule('subm 1', force=True)

    ok_(not top_repo.repo.submodule('subm 1').module_exists())
Example #5
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()), {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
Example #6
def test_install_dataset_from_just_source(src_repo=None, path=None):

    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True)
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    src_ds.save('test-annex.dat', to_git=False)
    # equivalent repo on github:
    src_url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(src_url)

    for url in sources:

        with chpwd(path, mkdir=True):
            ds = install(source=url)

        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

        # cleanup before next iteration
        rmtree(path)
Example #7
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
Example #8
def _parse_git_submodules(ds, paths):
    """All known ones with some properties"""
    if not (ds.pathobj / ".gitmodules").exists():
        # easy way out. if there is no .gitmodules file
        # we cannot have (functional) subdatasets
        return

    if paths:
        paths = [
            p.relative_to(ds.pathobj)
            for p in paths
            if ds.pathobj == p or ds.pathobj in p.parents]
        if not paths:
            # we had path constraints, but none matched this dataset
            return
    for path, props in iteritems(ds.repo.get_content_info(
            paths=paths,
            ref=None,
            untracked='no',
            eval_file_type=False)):
        if props.get('type', None) != 'dataset':
            continue
        if ds.pathobj != ds.repo.pathobj:
            props['path'] = ds.pathobj / path.relative_to(ds.repo.pathobj)
        else:
            props['path'] = path
        if not path.exists() or not GitRepo.is_valid_repo(text_type(path)):
            props['state'] = 'absent'
        # TODO kill this after some time. We used to do custom things here
        # and gitshasum was called revision. Be nice and duplicate for a bit
        # wipe out when patience is gone
        props['revision'] = props['gitshasum']
        yield props
Example #9
def test_submodule_deinit(path):
    from datalad.support.annexrepo import AnnexRepo

    top_repo = AnnexRepo(path, create=False)
    eq_({'subm 1', '2'}, {s.name for s in top_repo.get_submodules()})
    # note: here init=True is ok, since we are using it just for testing
    with swallow_logs(new_level=logging.WARN) as cml:
        top_repo.update_submodule('subm 1', init=True)
        assert_in('Do not use update_submodule with init=True', cml.out)
    top_repo.update_submodule('2', init=True)

    # ok_(all([s.module_exists() for s in top_repo.get_submodules()]))
    # TODO: old assertion above if non-bare? (can't use "direct mode" in test_gitrepo)
    # Alternatively: New testrepo (plain git submodules) and have a dedicated
    # test for annexes in addition
    ok_(
        all([
            GitRepo.is_valid_repo(op.join(top_repo.path, s.path))
            for s in top_repo.get_submodules()
        ]))

    # modify submodule:
    with open(op.join(top_repo.path, 'subm 1', 'file_ut.dat'), "w") as f:
        f.write("some content")

    assert_raises(CommandError, top_repo.deinit_submodule, 'sub1')

    # using force should work:
    top_repo.deinit_submodule('subm 1', force=True)

    ok_(not top_repo.repo.submodule('subm 1').module_exists())
Example #10
def _parse_git_submodules(ds_pathobj, repo, paths):
    """All known ones with some properties"""
    if not (ds_pathobj / ".gitmodules").exists():
        # easy way out. if there is no .gitmodules file
        # we cannot have (functional) subdatasets
        return

    if paths:
        paths_outside, paths_at_or_in = partition(
            paths, lambda p: ds_pathobj == p or ds_pathobj in p.parents)
        paths = [p.relative_to(ds_pathobj) for p in paths_at_or_in]
        if not paths:
            if any(p for p in paths_outside if p in ds_pathobj.parents):
                # The dataset is directly under some specified path, so include
                # it.
                paths = None
            else:
                # we had path constraints, but none matched this dataset
                return
    for props in repo.get_submodules_(paths=paths):
        path = props["path"]
        if props.get('type', None) != 'dataset':
            continue
        if ds_pathobj != repo.pathobj:
            props['path'] = ds_pathobj / path.relative_to(repo.pathobj)
        else:
            props['path'] = path
        if not path.exists() or not GitRepo.is_valid_repo(str(path)):
            props['state'] = 'absent'
        # TODO kill this after some time. We used to do custom things here
        # and gitshasum was called revision. Be nice and duplicate for a bit
        # wipe out when patience is gone
        props['revision'] = props['gitshasum']
        yield props
Example #11
def _install_necessary_subdatasets(
        ds, path, reckless, refds_path, description=None):
    """Installs subdatasets of `ds`, that are necessary to obtain in order
    to have access to `path`.

    Gets the subdataset containing `path` regardless of whether or not it was
    already installed. While doing so, installs everything necessary in between
    the uppermost installed one and `path`.

    Note: `ds` itself has to be installed.

    Parameters
    ----------
    ds: Dataset
    path: str
    reckless: bool
    """
    # figuring out what dataset to start with, --contains limits --recursive
    # to visit only subdataset on the trajectory to the target path
    subds_trail = ds.subdatasets(contains=path, recursive=True)
    if not subds_trail:
        # there is not a single known subdataset (installed or not)
        # for this path -- job done
        return
    # otherwise we start with the one deepest down
    cur_subds = subds_trail[-1]

    while not GitRepo.is_valid_repo(cur_subds['path']):
        # install using a helper that gives some flexibility regarding where to
        # get the module from
        try:
            sd = _install_subds_from_flexible_source(
                Dataset(cur_subds['parentds']),
                relpath(cur_subds['path'], start=cur_subds['parentds']),
                cur_subds['gitmodule_url'],
                reckless,
                description=description)
        except Exception as e:
            # skip everything below, if we didn't manage to install the subdataset
            yield get_status_dict(
                'install', path=cur_subds['path'], type='dataset',
                status='error', logger=lgr, refds=refds_path,
                message=("Installation of subdatasets %s failed with exception: %s",
                         cur_subds['path'], exc_str(e)))
            return

        # report installation, whether it helped or not
        yield get_status_dict(
            'install', ds=sd, status='ok', logger=lgr, refds=refds_path,
            message=("Installed subdataset in order to get %s", path))

        # now check whether the just installed subds brought us any closer to
        # the target path
        subds_trail = sd.subdatasets(contains=path, recursive=False)
        if not subds_trail:
            # no (newly available) subdataset gets us any closer
            return
        # next round
        cur_subds = subds_trail[-1]
Example #12
def download_dataset(repo=default_testing_repo,
                     remote_path=None,
                     local_folder=None):
    """
    Download a dataset with datalad client.

    By default it downloads the "NeuralEnsemble/ephy_testing_data" repository on
    the GIN platform, which is used for Neo testing.

    Usage:

        download_dataset(
                repo='https://gin.g-node.org/NeuralEnsemble/ephy_testing_data',
                remote_path='blackrock/blackrock_2_1',
                local_folder='/home/myname/Documents/')

    Parameters
    ----------
    repo: str
        The url of the repo.
        If None then 'https://gin.g-node.org/NeuralEnsemble/ephy_testing_data'
        is used
    remote_path: str or Path
        The distant path to retrieve (file or folder)
    local_folder: str or Path or None
        The local folder where to download the data.
        If None, a default project testing folder is used. Default: None

    Returns
    -------
    local_path:
        The local path of the downloaded file or folder
    """
    assert HAVE_DATALAD, 'You need to install datalad'

    if local_folder is None:
        global local_testing_data_folder
        local_folder = local_testing_data_folder
    local_folder = Path(local_folder)

    if local_folder.exists() and GitRepo.is_valid_repo(local_folder):
        dataset = datalad.api.Dataset(path=local_folder)
        # make sure git repo is in clean state
        repo = dataset.repo
        repo.call_git(['checkout', '--force', 'master'])
        dataset.update(merge=True)
    else:
        dataset = datalad.api.install(path=local_folder, source=repo)

    if remote_path is None:
        print('Bad boy: you have to provide "remote_path"')
        return

    dataset.get(remote_path)

    local_path = local_folder / remote_path

    return local_path
Example #13
def test_install_dataset_from_just_source(url, path):
    with chpwd(path, mkdir=True):
        ds = install(source=url)

    ok_startswith(ds.path, path)
    ok_(ds.is_installed())
    ok_(GitRepo.is_valid_repo(ds.path))
    ok_clean_git(ds.path, annex=None)
    assert_in('INFO.txt', ds.repo.get_indexed_files())
Example #14
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` in-place.
    """
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    if basepath in targetpaths:
        # found a targetpath, commit the trace
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            spec[p] = list(set(spec.get(p, []) + [current_trace[i + 1]]))
    if not isdir(basepath):
        # nothing underneath this one -> done
        return
    # this edge is not done, we need to try to reach any downstream
    # dataset
    for p in listdir(basepath):
        if valid_repo and p == '.git':
            # ignore gitdir to speed things up
            continue
        p = opj(basepath, p)
        if all(t != p and not t.startswith(_with_sep(p)) for t in targetpaths):
            # OPT listdir might be large and we could have only a few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            continue
        # we need to call this even for non-directories, to be able to match
        # file target paths
        discover_dataset_trace_to_targets(p, targetpaths, current_trace, spec)
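Assuming discover_dataset_trace_to_targets is importable (its module has moved between DataLad releases) and a hypothetical superdataset /tmp/super containing a subdataset sub with a file data.dat, a sketch of how spec gets populated:

spec = {}
discover_dataset_trace_to_targets(
    basepath='/tmp/super',
    targetpaths=['/tmp/super/sub/data.dat'],
    current_trace=[],
    spec=spec)
# spec now maps each visited superdataset to the next dataset on the edge
# towards the target, e.g. {'/tmp/super': ['/tmp/super/sub']}
print(spec)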
Example #15
def test_clone_dataset_from_just_source(url, path):
    with chpwd(path, mkdir=True):
        ds = clone(url, result_xfm='datasets', return_type='item-or-list')

    ok_startswith(ds.path, path)
    ok_(ds.is_installed())
    ok_(GitRepo.is_valid_repo(ds.path))
    ok_clean_git(ds.path, annex=None)
    assert_in('INFO.txt', ds.repo.get_indexed_files())
Example #16
def test_install_dataset_from_instance(src, dst):
    origin = Dataset(src)
    clone = install(source=origin, path=dst)

    assert_is_instance(clone, Dataset)
    ok_startswith(clone.path, dst)
    ok_(clone.is_installed())
    ok_(GitRepo.is_valid_repo(clone.path))
    ok_clean_git(clone.path, annex=None)
    assert_in('INFO.txt', clone.repo.get_indexed_files())
Example #17
    def __call__(path=None,
                 dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(dataset,
                             check_installed=False,
                             purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = [
                rev_resolve_path(c, dataset) for c in assure_list(contains)
            ]
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = text_type(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
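For context, this __call__ implementation backs the user-facing subdatasets command; a hedged usage sketch (assumes DataLad is installed and /tmp/super is a hypothetical installed dataset):

from datalad.api import Dataset

ds = Dataset('/tmp/super')
# returns a list of result dicts, one per known subdataset
for sub in ds.subdatasets(recursive=True):
    print(sub['path'], sub.get('state'))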
Example #18
def test_install_dataset_from_just_source_via_path(url, path):
    # for remote urls only, the source could be given to `path`
    # to allow for simplistic cmdline calls
    # Q (ben): remote urls only? Sure? => TODO

    with chpwd(path, mkdir=True):
        ds = install(url)

    ok_startswith(ds.path, path)
    ok_(ds.is_installed())
    ok_(GitRepo.is_valid_repo(ds.path))
    ok_clean_git(ds.path, annex=None)
    assert_in('INFO.txt', ds.repo.get_indexed_files())
Example #19
def test_install_dataset_from_instance(src=None, dst=None):
    origin = Dataset(src).create(result_renderer='disabled', force=True)
    origin.save(['INFO.txt', 'test.dat'], to_git=True)
    origin.save('test-annex.dat', to_git=False)

    clone = install(source=origin, path=dst)

    assert_is_instance(clone, Dataset)
    ok_startswith(clone.path, dst)
    ok_(clone.is_installed())
    ok_(GitRepo.is_valid_repo(clone.path))
    assert_repo_status(clone.path, annex=None)
    assert_in('INFO.txt', clone.repo.get_indexed_files())
Example #20
    def __call__(
            path=None,
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = [rev_resolve_path(c, dataset) for c in assure_list(contains)]
        for r in _get_submodules(
                ds, paths, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = text_type(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
Example #21
def test_install_dataset_from_just_source_via_path(path=None):
    # for remote urls only, the source could be given to `path`
    # to allow for simplistic cmdline calls

    url = "https://github.com/datalad/testrepo--basic--r1.git"

    with chpwd(path, mkdir=True):
        ds = install(url)

    ok_startswith(ds.path, path)
    ok_(ds.is_installed())
    ok_(GitRepo.is_valid_repo(ds.path))
    assert_repo_status(ds.path, annex=None)
    assert_in('INFO.txt', ds.repo.get_indexed_files())
Example #22
def func(arg, top, names):
    refpath, ignore, dirs = arg
    legit_names = []
    for n in names:
        path = opj(top, n)
        if not isdir(path) or path in ignore:
            pass
        elif path != refpath and GitRepo.is_valid_repo(path):
            # mount point, keep but don't dive into
            dirs.append(path)
        else:
            legit_names.append(n)
            dirs.append(path)
    names[:] = legit_names
Example #23
def _adj2subtrees(base, adj, subs):
    # given a set of parent-child mapping, compute a mapping of each parent
    # to all its (grand)children of any depth level
    subtrees = dict(adj)
    subs = set(subs)
    # from bottom up
    for ds in sorted(adj, reverse=True):
        subtree = []
        for sub in subtrees[ds]:
            subtree.append(sub)
            subtree.extend(subtrees.get(sub, []))
        subtrees[ds] = subtree
    # give each leaf dataset an entry too
    for sub in subs:
        if sub not in subtrees and GitRepo.is_valid_repo(sub):
            subtrees[sub] = []
    return subtrees
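A tiny illustration of what _adj2subtrees computes, using hypothetical paths and an empty subs list so that the GitRepo.is_valid_repo branch is never reached (assumes the function above is in scope):

adj = {
    '/ds': ['/ds/a', '/ds/b'],
    '/ds/a': ['/ds/a/x'],
}
subtrees = _adj2subtrees('/ds', adj, subs=[])
# every parent is now mapped to all of its (grand)children:
# {'/ds/a': ['/ds/a/x'], '/ds': ['/ds/a', '/ds/a/x', '/ds/b']}
print(subtrees)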
Example #24
def download_dataset(repo=None,
                     remote_path=None,
                     local_folder=None,
                     update_if_exists=False,
                     unlock=False):
    assert HAVE_DATALAD, 'You need to install datalad'

    if repo is None:
        #  print('Use gin NeuralEnsemble/ephy_testing_data')
        repo = 'https://gin.g-node.org/NeuralEnsemble/ephy_testing_data'

    if local_folder is None:
        base_local_folder = get_global_dataset_folder()
        base_local_folder.mkdir(exist_ok=True)
        #  if not is_set_global_dataset_folder():
        #  print(f'Local folder is {base_local_folder}, Use set_global_dataset_folder() to set it globally')
        local_folder = base_local_folder / repo.split('/')[-1]

    if local_folder.exists() and GitRepo.is_valid_repo(local_folder):
        dataset = datalad.api.Dataset(path=local_folder)
        # make sure git repo is in clean state
        repo = dataset.repo
        if update_if_exists:
            repo.call_git(['checkout', '--force', 'master'])
            dataset.update(merge=True)
    else:
        dataset = datalad.api.install(path=local_folder, source=repo)

    if remote_path is None:
        print('Bad boy: you have to provide "remote_path"')
        return

    local_path = local_folder / remote_path

    dataset.get(remote_path)

    # unlocking is necessary for binding volume to containers
    if unlock:
        dataset.unlock(remote_path, recursive=True)

    return local_path
Example #25
def test_clone_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = clone(src,
               path,
               description='mydummy',
               result_xfm='datasets',
               return_type='item-or-list')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()), {'test.dat', 'INFO.txt'})
        assert_repo_status(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        assert_repo_status(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = clone(src, path, result_xfm=None, return_type='list')
    assert_result_values_equal(res, 'source_url', [src])
    assert_status('notneeded', res)
    assert_message("dataset %s was already cloned from '%s'", res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
Example #26
def _discover_subdatasets_recursively(top, trace, spec, recursion_limit):
    # this beast walks the directory tree from a given `top` directory
    # and discovers valid repos that are scattered around, regardless
    # of whether they are already subdatasets or not
    # for all found datasets it puts an entry into the SPEC and also
    # an entry with the path in the SPEC of the parent dataset
    if recursion_limit is not None and len(trace) > recursion_limit:
        return
    if not isdir(top):
        return
    if GitRepo.is_valid_repo(top):
        # found a repo, add the entire thing
        spec[top] = spec.get(top, []) + [top]
        # and to the parent
        if trace:
            spec[trace[-1]] = spec.get(trace[-1], []) + [top]
        trace = trace + [top]
    for path in listdir(top):
        path = opj(top, path)
        if not isdir(path):
            continue
        _discover_subdatasets_recursively(path, trace, spec, recursion_limit)
Example #27
def _discover_trace_to_known(path, trace, spec):
    # this beast walks the directory tree from a given `path` until
    # it discovers a known dataset (i.e. recorded in the spec)
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(path)
    if valid_repo:
        trace = trace + [path]
        if path in spec:
            # found a known repo, commit the trace
            for i, p in enumerate(trace[:-1]):
                spec[p] = list(set(spec.get(p, []) + [trace[i + 1]]))
            # this edge is not done, we need to try to reach any downstream
            # dataset
    for p in listdir(path):
        if valid_repo and p == '.git':
            # ignore gitdir to speed things up
            continue
        p = opj(path, p)
        if not isdir(p):
            continue
        _discover_trace_to_known(p, trace, spec)
Example #28
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        dataset = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must start with a letter)",
                        k)
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(
                dataset.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
Example #29
def _parse_git_submodules(dspath):
    """All known ones with some properties"""
    if not exists(opj(dspath, ".gitmodules")):
        # easy way out. if there is no .gitmodules file
        # we cannot have (functional) subdatasets
        return

    # this will not work in direct mode, need better way #1422
    cmd = ['git', 'ls-files', '--stage', '-z']

    # need to go rogue and cannot use the proper helpers in GitRepo
    # as they also pull in all of GitPython's magic
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(
            cmd,
            log_stderr=True,
            log_stdout=True,
            # not sure why exactly, but log_online has to be false!
            log_online=False,
            expect_stderr=False,
            shell=False,
            # we don't want it to scream on stdout
            expect_fail=True)
    except CommandError as e:
        raise InvalidGitRepositoryError(exc_str(e))

    for line in stdout.split('\0'):
        if not line or not line.startswith('160000'):
            continue
        sm = {}
        props = submodule_full_props.match(line)
        sm['revision'] = props.group(2)
        subpath = _path_(dspath, props.group(4))
        sm['path'] = subpath
        if not exists(subpath) or not GitRepo.is_valid_repo(subpath):
            sm['state'] = 'absent'
        yield sm
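For reference, the plumbing output parsed above can be reproduced directly: git ls-files --stage -z prints "<mode> <sha> <stage>\t<path>" records, and gitlinks (submodules) carry mode 160000. A standalone sketch using subprocess instead of DataLad's GitRunner (/tmp/some_repo is a hypothetical repository):

import subprocess

out = subprocess.run(
    ['git', 'ls-files', '--stage', '-z'],
    cwd='/tmp/some_repo',  # hypothetical existing git repository
    capture_output=True, text=True, check=True).stdout
for line in out.split('\0'):
    if not line.startswith('160000'):
        continue
    mode_sha_stage, subpath = line.split('\t', 1)
    print('gitlink at', subpath, 'recorded at commit', mode_sha_stage.split()[1])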
Example #30
    def is_installed(self):
        """Returns whether a dataset is installed.

        A dataset is installed when a repository for it exists on the filesystem.

        Returns
        -------
        bool
        """
        # do an early check whether the path exists, to avoid asking git at all
        exists_now = exists(self.path)

        was_once_installed = None
        if exists_now:
            was_once_installed = self.path is not None and \
                                 self.repo is not None

        if not exists_now or \
                (was_once_installed and not GitRepo.is_valid_repo(self.path)):
            # repo gone now, reset
            self._repo = None
            return False
        else:
            return was_once_installed
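A minimal usage sketch of the check above (assumes DataLad is installed; /tmp/some_ds is a hypothetical path that may or may not hold a dataset):

from datalad.api import Dataset
from datalad.support.gitrepo import GitRepo

ds = Dataset('/tmp/some_ds')
print(ds.is_installed())                # the high-level check shown above
print(GitRepo.is_valid_repo(ds.path))   # the underlying repository test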
Example #31
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src)
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()), {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid

    # installing it again, shouldn't matter:
    with swallow_logs(new_level=logging.INFO) as cml:
        ds = install(path, source=src)
        cml.assert_logged(msg="{0} was already installed from".format(ds),
                          regex=False,
                          level="INFO")
        ok_(ds.is_installed())
        if isinstance(origin.repo, AnnexRepo):
            eq_(uuid_before, ds.repo.uuid)
Example #32
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src)
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid

    # installing it again, shouldn't matter:
    with swallow_logs(new_level=logging.INFO) as cml:
        ds = install(path, source=src)
        cml.assert_logged(msg="{0} was already installed from".format(ds),
                          regex=False, level="INFO")
        ok_(ds.is_installed())
        if isinstance(origin.repo, AnnexRepo):
            eq_(uuid_before, ds.repo.uuid)
Example #33
    def __call__(
            path=None,
            dataset=None,
            to_git=False,
            save=True,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):

        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path")
        # never recurse here; recursion needs to be handled manually below
        # to be able to discover untracked content
        content_by_ds, unavailable_paths = Interface._prep(
            path=path,
            dataset=dataset,
            recursive=False)
        if unavailable_paths:
            lgr.warning("ignoring non-existent path(s): %s",
                        unavailable_paths)
        if recursive:
            # with --recursive for each input path traverse the directory
            # tree, when we find a dataset, add it to the spec, AND add it as
            # a path to the spec of the parent
            # MIH: wrap in list() to avoid exception, because dict size might
            # change, but we want to loop only over the keys that were present
            # at the start
            for d in list(content_by_ds.keys()):
                for p in content_by_ds[d]:
                    _discover_subdatasets_recursively(
                        p,
                        [d],
                        content_by_ds,
                        recursion_limit)

        if not content_by_ds:
            raise InsufficientArgumentsError(
                "no existing content given to add")

        if dataset:
            # remember the datasets associated with actual inputs
            input_ds = list(content_by_ds.keys())
            # forge chain from base dataset to any leaf dataset
            _discover_trace_to_known(dataset.path, [], content_by_ds)
            if ds2super:
                # now check all dataset entries corresponding to the original
                # input to see if they contain their own paths and remove them
                for inpds in input_ds:
                    content_by_ds[inpds] = [p for p in content_by_ds[inpds]
                                            if not p == inpds]
                # and lastly remove all entries that contain no path to avoid
                # saving any staged content in the final step
                content_by_ds = {d: v for d, v in content_by_ds.items() if v}

        results = []
        # simple loop over datasets -- save happens later
        # start deep down
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            toadd = list(set(content_by_ds[ds_path]))
            # handle anything that looks like a wannabe subdataset
            for subds_path in [d for d in toadd
                               if GitRepo.is_valid_repo(d) and
                               d != ds_path and
                               d not in ds.get_subdatasets(
                                   recursive=False,
                                   absolute=True,
                                   fulfilled=True)]:
                # TODO add check that the subds has a commit, and refuse
                # to operate on it otherwise, or we would get a bastard
                # submodule that cripples git operations
                _install_subds_inplace(
                    ds=ds,
                    path=subds_path,
                    relativepath=relpath(subds_path, ds_path))
                # make sure that .gitmodules is added to the list of files
                toadd.append(opj(ds.path, '.gitmodules'))
                # report added subdatasets -- add below won't do it
                results.append({
                    'success': True,
                    'file': Dataset(subds_path)})
            # make sure any last minute additions make it to the saving stage
            content_by_ds[ds_path] = toadd
            added = ds.repo.add(
                toadd,
                git=to_git if isinstance(ds.repo, AnnexRepo) else True,
                commit=False)
            for a in added:
                a['file'] = opj(ds_path, a['file'])
            results.extend(added)

        if results and save:
            save_dataset_hierarchy(
                content_by_ds,
                base=dataset.path if dataset and dataset.is_installed() else None,
                message='[DATALAD] added content')

        return results
Example #34
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            check=True,
            if_dirty='save-before'):

        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `drop`: requires at least a path or dataset")
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        to_drop = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='drop',
                # justification for status:
                # content need not be dropped where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                ap['process_content'] = True
            if ap.get('registered_subds', False) and ap.get('state', None) == 'absent':
                # nothing to drop in an absent subdataset, don't be annoying
                # and skip silently
                continue
            to_drop.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_drop,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, order doesn't matter
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # ignore submodule entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
            if not content:
                continue
            for r in _drop_files(ds, content, check=check, **res_kwargs):
                yield r
Example #35
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            save=True,
            message=None,
            if_dirty='save-before'):
        res_kwargs = dict(action='remove', logger=lgr)
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `remove`: requires at least a path or dataset")
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs['refds'] = refds_path
        if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
            # nothing here, nothing to remove
            yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs)
            return
        if refds_path and not path:
            # act on the whole dataset if nothing else was specified
            # TODO i think that would happen automatically in annotation?
            path = refds_path

        to_process = []

        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                # we only ever want to discover immediate subdatasets, the rest
                # will happen in `uninstall`
                recursion_limit=1,
                action='remove',
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None) is None:
                # nothing exists at location, and there is no parent to
                # remove from
                ap['status'] = 'notneeded'
                ap['message'] = "path does not exist and is not in a dataset"
                yield ap
                continue
            if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
                # make sure dataset sorting yields a dedicated entry for this one
                ap['process_content'] = True
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if path_is_under([ap['path'] for ap in to_process]):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        to_save = []
        # track which submodules we have removed in the process, to avoid
        # failure in case we revisit them due to a subsequent path argument
        subm_removed = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            to_reporemove = dict()
            # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
            # if dataset itself is in paths, skip any nondataset
            # sort reverse so we get subdatasets first
            for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
                if ap.get('type', None) == 'dataset':
                    # entire dataset needs to go, uninstall if present, pass recursive!
                    uninstall_failed = False
                    if ap['path'] == refds_path or \
                            (refds_path is None and ap.get('raw_input', False)):
                        # top-level handling, cannot use regular uninstall call, as
                        # it will refuse to uninstall a top-level dataset
                        # and rightfully so, it is really a remove in that case
                        # bypass all the safety by using low-level helper
                        for r in _uninstall_dataset(ds, check=check, has_super=False,
                                                    **res_kwargs):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            r['refds'] = refds_path
                            yield r
                    # recheck that it wasn't removed during a previous iteration
                    elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']):
                        # anything that is not the top-level -> regular uninstall
                        # this is for subdatasets of the to-be-removed dataset
                        # we want to simply uninstall them in a regular manner
                        for r in Uninstall.__call__(
                                # use annotate path as input, but pass a copy because
                                # we cannot rely on it being unaltered by reannotation
                                # TODO maybe adjust annotate_path to do that
                                [ap.copy()],
                                dataset=refds_path, recursive=recursive, check=check,
                                if_dirty=if_dirty, result_xfm=None, result_filter=None,
                                on_failure='ignore'):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            yield r
                    if not ap.get('raw_input', False):
                        # we only ever want to actually unregister subdatasets that
                        # were given explicitly
                        continue
                    if not uninstall_failed and \
                            not ap['path'] in subm_removed and \
                            refds_path and \
                            ap.get('parentds', None) and \
                            not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                                 ap['path'] == refds_path) and \
                            ap.get('registered_subds', False):
                        # strip from superdataset, but only if a dataset was given explicitly
                        # as in "remove from this dataset", but not when just a path was given
                        # as in "remove from the filesystem"
                        subds_relpath = relpath(ap['path'], start=ap['parentds'])
                        # remove submodule reference
                        parentds = Dataset(ap['parentds'])
                        # play safe, will fail on dirty
                        parentds.repo.deinit_submodule(ap['path'])
                        # remove now empty submodule link
                        parentds.repo.remove(ap['path'])
                        # make a record that we removed this already, should it be
                        # revisited via another path argument, because we do not reannotate
                        # the paths after every removal
                        subm_removed.append(ap['path'])
                        yield dict(ap, status='ok', **res_kwargs)
                        # need .gitmodules update in parent
                        to_save.append(dict(
                            path=opj(parentds.path, '.gitmodules'),
                            parents=parentds.path,
                            type='file'))
                        # and the removal itself needs to be committed
                        # inform `save` that it is OK that this path
                        # doesn't exist on the filesystem anymore
                        ap['unavailable_path_status'] = ''
                        ap['process_content'] = False
                        to_save.append(ap)
                    if not uninstall_failed and exists(ap['path']):
                        # could be an empty dir in case an already uninstalled subdataset
                        # got removed
                        rmdir(ap['path'])
                else:
                    # anything that is not a dataset can simply be passed on
                    to_reporemove[ap['path']] = ap
            # avoid unnecessary git calls when there is nothing to do
            if to_reporemove:
                if check and hasattr(ds.repo, 'drop'):
                    for r in _drop_files(ds, list(to_reporemove),
                                         check=True):
                        if r['status'] == 'error':
                            # if drop errored on that path, we can't remove it
                            to_reporemove.pop(r['path'], 'avoidKeyError')
                        yield r

                if to_reporemove:
                    for r in ds.repo.remove(list(to_reporemove), r=True):
                        # these were removed, but we still need to save the
                        # removal

                        r_abs = opj(ds.path, r)
                        if r_abs in to_reporemove:
                            ap = to_reporemove[r_abs]
                        else:
                            ap = {'path': r_abs,
                                  'parentds': ds.path,
                                  'refds': refds_path
                                  }
                        ap['unavailable_path_status'] = ''
                        to_save.append(ap)
                        yield get_status_dict(
                            status='ok',
                            path=r,
                            **res_kwargs)

        if not to_save:
            # nothing left to do, potentially all errored before
            return
        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        for res in Save.__call__(
                # TODO compose hand-selected annotated paths
                path=to_save,
                # we might have removed the reference dataset by now, recheck
                dataset=refds_path
                        if (refds_path and GitRepo.is_valid_repo(refds_path))
                        else None,
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
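
The removal loop above iterates datasets in reverse-sorted order so that subdatasets are handled before their superdatasets, which keeps each superdataset around long enough to record the removal. A short sketch of why plain reverse lexicographic sorting gives that order (the paths are hypothetical):

# subdataset paths are prefixed by their parent's path, so they sort after it;
# iterating in reverse therefore visits the deepest datasets first
content_by_ds = {
    '/super': ['/super/.gitmodules'],
    '/super/subA': ['/super/subA/file1'],
    '/super/subA/deep': ['/super/subA/deep/file2'],
}

for ds_path in sorted(content_by_ds, reverse=True):
    print('processing', ds_path)
# processing /super/subA/deep
# processing /super/subA
# processing /super
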
Beispiel #45
0
    def __call__(
            path=None,
            dataset=None,
            get_aggregates=False,
            reporton='all',
            recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(
                refds_path,
                check_installed=True,
                purpose='aggregate metadata query')
            agginfos = load_ds_aggregate_db(
                ds,
                version=str(aggregate_layout_version),
                abspath=True
            )
            if not agginfos:
                # if an aggregation had ever been run, this file would
                # exist; since it does not, we need to tell the user
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message='metadata aggregation has never been performed in this dataset')
                return
            parentds = []
            for dspath in sorted(agginfos):
                info = agginfos[dspath]
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if dspath == ds.path:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(
                    info,
                    **res_kwargs
                )
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = op.curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [ap for ap in content_by_ds[ds_path]
                         # this is an available subdataset, will be processed in another
                         # iteration
                         if ap.get('state', None) == 'absent' or
                         not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return
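
The `get_aggregates` branch above derives parent/child relations by walking dataset paths in sorted order while keeping a stack of candidate parents. A stdlib-only sketch of that technique; `is_subpath` is a hypothetical stand-in for DataLad's `path_is_subpath`:

import os.path as op

def is_subpath(path, prefix):
    # hypothetical stand-in for path_is_subpath:
    # True if `path` lies strictly below `prefix`
    rel = op.relpath(path, prefix)
    return rel != op.curdir and not rel.startswith(op.pardir)

def attach_parents(ds_paths):
    """Yield (dataset path, closest parent or None), walking in sorted order."""
    parents = []  # stack of candidate parent datasets
    for dspath in sorted(ds_paths):
        while parents and not is_subpath(dspath, parents[-1]):
            parents.pop()
        yield dspath, (parents[-1] if parents else None)
        parents.append(dspath)

if __name__ == '__main__':
    for path, parent in attach_parents(['/a', '/a/b', '/a/b/c', '/a/d']):
        print(path, '->', parent)
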
Beispiel #46
0
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (the path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths) # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds else includeds.intersection(
                    downward_targets))
    undiscovered_ds = [t for t in undiscovered_ds
                       if includeds and
                          path_is_subpath(t, current_trace[-1]) and
                          t in includeds]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1], set()).union(
                undiscovered_ds)
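
The `spec` that this function populates maps each dataset path to the set of its immediate subdatasets that lie on a trace towards the targets. A purely illustrative sketch of that structure, assuming a hypothetical /super -> /super/sub -> /super/sub/deep layout:

# keys are dataset paths, values are sets of immediate subdataset paths that
# sit on the trace from `basepath` down to the given target paths
spec = {}

def record_edge(spec, parent, child):
    spec.setdefault(parent, set()).add(child)

# trace from /super down to a target inside /super/sub/deep
for parent, child in [('/super', '/super/sub'), ('/super/sub', '/super/sub/deep')]:
    record_edge(spec, parent, child)

print(spec)
# {'/super': {'/super/sub'}, '/super/sub': {'/super/sub/deep'}}
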
Beispiel #47
0
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=False,
            #git_opts=None,
            #annex_opts=None,
            #annex_get_opts=None,
            jobs='auto',
            verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 4. Shoot the info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap, status='notneeded', logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath, ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(
                    ds, ap['path'], reckless, refds_path, description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record; the recursive pass below might then
                        # want to report a 'notneeded' for it
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible',
                          message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get('raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we do NOT want to recurse into anything that was not
                    # specifically requested, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds'])
                lgr.info(
                    "Installing %s%s recursively",
                    subds,
                    (" underneath %s" % ap['path']
                     if subds.path != ap['path']
                     else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                    yield res
                    if not (res['status'] == 'ok' and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {k: v for k, v in res.items()
                              if not k == 'status'}
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert(not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content, status='notneeded',
                        message="no dataset annex, content already present",
                        action='get', logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(
                    content,
                    options=['--from=%s' % source] if source else [],
                    jobs=jobs):
                res = annexjson2result(res, ds, type='file', logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: in case of some failed commands (e.g. get) there might
                # be no path in the record.  yoh has only a vague idea of the
                # logic here, so we just check for the presence of 'path';
                # but according to results_from_annex_noinfo, it would then be
                # assumed that `content` was acquired successfully, which is
                # not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
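
The tail of this example groups per-file results by success so that anything git-annex never reported on can later be declared 'notneeded'. A stdlib-only sketch of that bookkeeping; the result records and `success_map` are hypothetical stand-ins for the annex results and `success_status_map` used above:

# hypothetical per-file results, as they might come back from an annex call
results = [
    {'path': '/ds/a.dat', 'status': 'ok'},
    {'path': '/ds/b.dat', 'status': 'error'},
]
requested = ['/ds/a.dat', '/ds/b.dat', '/ds/in_git.txt']

success_map = {'ok': True, 'notneeded': True, 'error': False, 'impossible': False}

respath_by_status = {}
for res in results:
    if 'path' in res:  # some failed commands may lack a path
        ok = success_map[res['status']]
        respath_by_status.setdefault(ok, []).append(res['path'])

# anything requested but never reported on is assumed to be handled by git
# already (e.g. a file committed directly to git) -> 'notneeded'
unreported = [p for p in requested
              if p not in respath_by_status.get(True, [])
              and p not in respath_by_status.get(False, [])]
print(respath_by_status)   # {True: ['/ds/a.dat'], False: ['/ds/b.dat']}
print(unreported)          # ['/ds/in_git.txt']
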
Beispiel #48
0
    def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 4. Shoot the info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get(
                    'raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap,
                           status='notneeded',
                           logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath,
                      ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(ds,
                                                      ap['path'],
                                                      reckless,
                                                      refds_path,
                                                      description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset'
                        and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record; the recursive pass below might then
                        # want to report a 'notneeded' for it
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible', message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get(
                        'raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we do NOT want to recurse into anything that was not
                    # specifically requested, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] ==
                                'dataset' else ap['parentds'])
                lgr.info("Installing %s%s recursively", subds,
                         (" underneath %s" %
                          ap['path'] if subds.path != ap['path'] else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset'
                            and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                    yield res
                    if not (res['status'] == 'ok'
                            and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {
                        k: v
                        for k, v in res.items() if not k == 'status'
                    }
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert (not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content,
                        status='notneeded',
                        message="no dataset annex, content already present",
                        action='get',
                        logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(content,
                                   options=['--from=%s' %
                                            source] if source else [],
                                   jobs=jobs):
                res = annexjson2result(res,
                                       ds,
                                       type='file',
                                       logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: in case of some failed commands (e.g. get) there might
                # be no path in the record.  yoh has only a vague idea of the
                # logic here, so we just check for the presence of 'path';
                # but according to results_from_annex_noinfo, it would then be
                # assumed that `content` was acquired successfully, which is
                # not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
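
The "explore the unknown" loop in this example keeps a `containing_ds` list while descending toward an unavailable path; the last entry becomes the parent of the requested path, or the second-to-last if the path itself turns out to be a dataset. A simplified, stdlib-only sketch of that bookkeeping (the dataset layout is hypothetical):

# hypothetical chain of datasets that had to be installed to reach the target
target = '/super/sub/deep'
containing_ds = ['/super']           # start with the closest known dataset
for installed in ['/super/sub', '/super/sub/deep']:
    containing_ds.append(installed)  # record every dataset installed on the way

record = {'path': target, 'parentds': containing_ds[-1]}
if containing_ds[-1] == target:
    # the target itself is a dataset: its parent is the second-to-last entry
    record.update(parentds=containing_ds[-2], type='dataset')
print(record)
# {'path': '/super/sub/deep', 'parentds': '/super/sub', 'type': 'dataset'}
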
Beispiel #49
0
    def __call__(path=None,
                 dataset=None,
                 get_aggregates=False,
                 reporton='all',
                 recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(refds_path,
                                 check_installed=True,
                                 purpose='aggregate metadata query')
            info_fpath = opj(ds.path, agginfo_relpath)
            if not exists(info_fpath):
                # if an aggregation had ever been run, this file would
                # exist; since it does not, we need to tell the user
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message=
                    'metadata aggregation has never been performed in this dataset'
                )
                return
            agginfos = _load_json_object(info_fpath)
            parentds = []
            for sd in sorted(agginfos):
                info = agginfos[sd]
                dspath = normpath(opj(ds.path, sd))
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if sd == curdir:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(info, **res_kwargs)
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(
                    ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [
                ap for ap in content_by_ds[ds_path]
                # this is an available subdataset, will be processed in another
                # iteration
                if ap.get('state', None) == 'absent' or not (ap.get(
                    'type', None) == 'dataset' and ap['path'] != ds_path)
            ]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return
Beispiel #50
0
    def repo(self):
        """Get an instance of the version control system/repo for this dataset,
        or None if there is none yet (or none anymore).

        If testing the validity of an instance of GitRepo is guaranteed to be
        really cheap this could also serve as a test whether a repo is present.

        Note, that this property is evaluated every time it is used. If used
        multiple times within a function it's probably a good idea to store its
        value in a local variable and use this variable instead.

        Returns
        -------
        GitRepo or AnnexRepo
        """

        # If we already got a *Repo instance, check whether it's still valid;
        # Note, that this basically does part of the testing that would
        # (implicitly) be done in the loop below again. So, there's still
        # potential to speed up when we actually need to get a new instance
        # (or none). But it's still faster for the vast majority of cases.
        #
        # TODO: Dig deeper into it and melt with new instance guessing. This
        # should also involve reducing the redundancy of testing such things from
        # within Flyweight.__call__, AnnexRepo.__init__ and GitRepo.__init__!
        #
        # Also note, that this could be forged into a single big condition, but
        # that is hard to read and we should be well aware of the actual
        # criteria here:
        if self._repo is not None and realpath(self.path) == self._repo.path:
            # we got a repo and path references still match
            if isinstance(self._repo, AnnexRepo):
                # it's supposed to be an annex
                if self._repo is AnnexRepo._unique_instances.get(
                        self._repo.path, None) and \
                        AnnexRepo.is_valid_repo(self._repo.path,
                                                allow_noninitialized=True):
                    # it's still the object registered as flyweight and it's a
                    # valid annex repo
                    return self._repo
            elif isinstance(self._repo, GitRepo):
                # it's supposed to be a plain git
                if self._repo is GitRepo._unique_instances.get(
                        self._repo.path, None) and \
                        GitRepo.is_valid_repo(self._repo.path) and not \
                        self._repo.is_with_annex():
                    # it's still the object registered as flyweight, it's a
                    # valid git repo and it hasn't turned into an annex
                    return self._repo

        # Note: Although it looks like the "self._repo = None" assignments
        # could be used instead of variable "valid", that's a big difference!
        # The *Repo instances are flyweights, not singletons. self._repo might
        # be the last reference, which would lead to those objects being
        # destroyed and therefore the constructor call would result in an
        # actually new instance. This is unnecessarily costly.
        valid = False
        for cls, ckw, kw in (
                # TODO: Do we really want to allow_noninitialized=True here?
                # And if so, leave a proper comment!
                (AnnexRepo, {'allow_noninitialized': True}, {'init': False}),
                (GitRepo, {}, {})
        ):
            if cls.is_valid_repo(self._path, **ckw):
                try:
                    lgr.log(5, "Detected %s at %s", cls, self._path)
                    self._repo = cls(self._path, create=False, **kw)
                    valid = True
                    break
                except (InvalidGitRepositoryError, NoSuchPathError,
                        InvalidAnnexRepositoryError) as exc:
                    lgr.log(5,
                            "Oops -- guess on repo type was wrong?: %s",
                            exc_str(exc))

        if not valid:
            self._repo = None

        if self._repo is None:
            # Often .repo is requested to 'sense' whether anything is installed
            # underneath, and if so -- to proceed forward. Thus log here only
            # at DEBUG level and, if necessary, "complain upstairs"
            lgr.log(5, "Failed to detect a valid repo at %s", self.path)

        return self._repo
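
The detection loop in this property tries the most specific repository flavour first (annex, then plain git) and stops at the first validator that accepts the path. A stripped-down, stdlib-only sketch of the same probe-in-order pattern; the validator functions are hypothetical, loosely analogous to `AnnexRepo.is_valid_repo` and `GitRepo.is_valid_repo`:

import os.path as op

def looks_like_annex(path):
    # hypothetical check: annex metadata lives under .git/annex
    return op.isdir(op.join(path, '.git', 'annex'))

def looks_like_git(path):
    # hypothetical check: a plain git repo has a .git directory
    return op.isdir(op.join(path, '.git'))

def detect_repo_flavour(path):
    """Return the most specific flavour that validates, or None."""
    for flavour, probe in (('annex', looks_like_annex), ('git', looks_like_git)):
        if probe(path):
            return flavour
    return None

print(detect_repo_flavour('.'))  # 'git' in a plain clone, 'annex' if annexed, else None
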
Beispiel #51
0
def _get_submodules(ds, paths, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    dspath = ds.path
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(ds)
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(ds, paths):
        if contains and not any(
                sm['path'] == c or sm['path'] in c.parents for c in contains):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        # do we just need this to recurse into subdatasets, or is this a
        # real result?
        to_report = paths is None \
            or any(p == sm['path'] or p in sm['path'].parents
                   for p in paths)
        sm.update(modinfo.get(sm['path'], {}))
        if to_report and (set_property or delete_property):
            # first deletions
            for dprop in assure_list(delete_property):
                try:
                    out, err = ds.repo._git_custom_command(
                        '', ['git', 'config', '--file', '.gitmodules',
                             '--unset-all',
                             'submodule.{}.{}'.format(sm['gitmodule_name'], dprop),
                            ]
                    )
                except CommandError:
                    yield get_status_dict(
                        'subdataset',
                        status='impossible',
                        message=(
                            "Deleting subdataset property '%s' failed for "
                            "subdataset '%s', possibly did "
                            "not exist",
                            dprop, sm['gitmodule_name']),
                        logger=lgr,
                        **sm)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=sm['path'].relative_to(refds_path),
                            refds_relname=text_type(
                                sm['path'].relative_to(refds_path)
                            ).replace(os.sep, '-')))
                try:
                    out, err = ds.repo._git_custom_command(
                        '', ['git', 'config', '--file', '.gitmodules',
                             '--replace-all',
                             'submodule.{}.{}'.format(sm['gitmodule_name'], prop),
                             text_type(val),
                            ]
                    )
                except CommandError as e:  # pragma: no cover
                    # this conditional may not be possible to reach, as
                    # variable name validity is checked before and Git
                    # replaces the file completely, resolving any permission
                    # issues, if the file could be read (already done above)
                    yield get_status_dict(
                        'subdataset',
                        status='error',
                        message=(
                            "Failed to set property '%s': %s",
                            prop, exc_str(e)),
                        type='dataset',
                        logger=lgr,
                        **sm)
                    # it is up to parent code to decide whether we would continue
                    # after this

                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).save(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if to_report and (not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    Dataset(sm['path']),
                    paths,
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if to_report and (bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres
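
Values passed via `set_property` can be template strings wrapped in angle brackets; any `{...}` placeholders are filled from the submodule record. A small sketch of that expansion, with a hypothetical record and template value:

# hypothetical submodule record, as assembled in the loop above
sm = {
    'gitmodule_name': 'subA',
    'refds_relpath': 'code/subA',
    'refds_relname': 'code-subA',
}

val = '<ria+ssh://store/{refds_relname}>'  # hypothetical template value
if val.startswith('<') and val.endswith('>') and '{' in val:
    # strip the angle brackets and expand placeholders from the record
    val = val[1:-1].format(**sm)
print(val)  # ria+ssh://store/code-subA
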
Beispiel #52
0
def _get_submodules(dspath, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    # TODO bring back in more global scope from below once segfaults are
    # figured out
    #if set_property or delete_property:
    #    gitmodule_path = opj(dspath, ".gitmodules")
    #    parser = GitConfigParser(
    #        gitmodule_path, read_only=False, merge_includes=False)
    #    parser.read()
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath):
        if contains and not path_startswith(contains, sm['path']):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            gitmodule_path = opj(dspath, ".gitmodules")
            parser = GitConfigParser(
                gitmodule_path, read_only=False, merge_includes=False)
            parser.read()
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=relpath(sm['path'], refds_path),
                            refds_relname=relpath(sm['path'], refds_path).replace(os.sep, '-')))
                parser.set_value(
                    submodule_section,
                    prop,
                    val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).add(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')
            # let go of resources, locks, ...
            parser.release()

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'],
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
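
As a side note, here is a small, self-contained sketch of the '<...>' template expansion used for set_property in the example above. The sample submodule record, reference dataset path, and template value are hypothetical.

import os
from os.path import relpath

sm = {'path': '/tmp/super/sub/deep', 'gitmodule_name': 'sub/deep'}
refds_path = '/tmp/super'

val = '<{refds_relname}>'
if val.startswith('<') and val.endswith('>') and '{' in val:
    # strip the angle brackets and format against the submodule record,
    # augmented with the two derived properties
    val = val[1:-1].format(
        **dict(
            sm,
            refds_relpath=relpath(sm['path'], refds_path),
            refds_relname=relpath(sm['path'], refds_path).replace(os.sep, '-')))
print(val)  # -> 'sub-deep' (with POSIX path separators)
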
Beispiel #53
0
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            message_file=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path")
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion,
                    ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing, unless it was somehow already
            # discovered via the recursion above
            if ap['path'] not in ds_to_annotate_from_recursion:
                to_add.append(ap)
        if got_nothing:
            # path annotation yielded nothing; the most likely cause is that
            # nothing was found modified, so we need to say something about
            # the reference dataset
            yield get_status_dict(
                'add',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go
        # first, because we know most about them, and the subsequent annotation
        # call will skip the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(
                        status='notneeded',
                        message="already known subdataset",
                        **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                subds_relpath = relpath(ap['path'], ds_path)
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath, url=None, name=None)
                except (CommandError, InvalidGitRepositoryError) as e:
                    yield get_status_dict(
                        ds=subds, status='error', message=e.stderr,
                        **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules and was not
                # staged, whereas another submodule on the same level in the
                # same superdataset, which also had one file in it, was staged;
                # disabled to keep things correct, at the cost of a little
                # slowdown
                #ap['staged'] = True
                to_save.append(ap)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(
                    ds=subds,
                    status='ok',
                    message='added new subdataset',
                    **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(dict(
                    path=gitmodules_path,
                    parentds=ds_path,
                    type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add_(
                list(torepoadd.keys()),
                git=to_git if is_annex else True,
                **add_kw
            )
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to
                    # save anything that is marked state='untracked')
                    to_save.append({k: v for k, v in res.items()
                                    if k not in ('status', 'state')})
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds, torepoadd, respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already; it doesn't
                    # need to be added, but it should be saved/committed if so
                    # desired
                    to_save.append({k: v for k, v in r.items()
                                    if k not in ('status', 'state')})

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append({k: v for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])):
                # TODO XXX we have an issue here when, with `add('.')`, annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
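
The add implementation above keeps per-dataset bookkeeping of which paths succeeded or failed. Below is a minimal, hypothetical sketch of that pattern; the status mapping and sample records are illustrative and not DataLad API.

# map raw result statuses onto a coarse success/failure bucket
success_status_map = {'ok': 'success', 'notneeded': 'success',
                      'impossible': 'failure', 'error': 'failure'}

def group_by_success(results):
    respath_by_status = {}
    for res in results:
        success = success_status_map[res['status']]
        # accumulate paths per bucket, mirroring respath_by_status above
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
    return respath_by_status

demo = [{'path': '/ds/a.dat', 'status': 'ok'},
        {'path': '/ds/b.dat', 'status': 'error'}]
print(group_by_success(demo))
# {'success': ['/ds/a.dat'], 'failure': ['/ds/b.dat']}
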
Beispiel #54
0
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path"
            )
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type',
                                                           None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion, ap['path'],
                    [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing, unless it was somehow already
            # discovered via the recursion above
            if ap['path'] not in ds_to_annotate_from_recursion:
                to_add.append(ap)
            # TODO check if next isn't covered by discover_dataset_trace_to_targets already??
            if dataset and ap.get('type', None) == 'dataset':
                # duplicates not possible, annotated_paths returns unique paths
                subds_to_add[ap['path']] = ap
        if got_nothing:
            # path annotation yielded nothing; the most likely cause is that
            # nothing was found modified, so we need to say something about
            # the reference dataset
            yield get_status_dict('add',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go
        # first, because we know most about them, and the subsequent annotation
        # call will skip the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(status='notneeded',
                                          message="already known subdataset",
                                          **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                # check that the subds has a commit, and refuse
                # to operate on it otherwise, or we would get a bastard
                # submodule that cripples git operations
                if not subds.repo.get_hexsha():
                    yield get_status_dict(
                        ds=subds,
                        status='impossible',
                        message='cannot add subdataset with no commits',
                        **dict(common_report, **ap))
                    continue
                subds_relpath = relpath(ap['path'], ds_path)
                # make an attempt to configure a submodule source URL based on the
                # discovered remote configuration
                remote, branch = subds.repo.get_tracking_branch()
                subds_url = subds.repo.get_remote_url(
                    remote) if remote else None
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath,
                                          url=subds_url,
                                          name=None)
                except CommandError as e:
                    yield get_status_dict(ds=subds,
                                          status='error',
                                          message=e.stderr,
                                          **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules and was not
                # staged, whereas another submodule on the same level in the
                # same superdataset, which also had one file in it, was staged;
                # disabled to keep things correct, at the cost of a little
                # slowdown
                #ap['staged'] = True
                to_save.append(ap)
                _fixup_submodule_dotgit_setup(ds, subds_relpath)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(ds=subds,
                                      status='ok',
                                      message='added new subdataset',
                                      **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(
                    dict(path=gitmodules_path, parentds=ds_path, type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add(list(torepoadd.keys()),
                                git=to_git if is_annex else True,
                                **add_kw)
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to
                    # save anything that is marked state='untracked')
                    to_save.append({
                        k: v
                        for k, v in res.items() if k not in ('status', 'state')
                    })
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    torepoadd,
                    respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already; it doesn't
                    # need to be added, but it should be saved/committed if so
                    # desired
                    to_save.append({
                        k: v
                        for k, v in r.items() if k not in ('status', 'state')
                    })

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append(
                        {k: v
                         for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(
                    respath_by_status.get('success', [])):
                # TODO XXX we have an issue here when, with `add('.')`, annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
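
One detail worth isolating from the example above is how a submodule URL is guessed: if the subdataset's active branch tracks a remote, that remote's URL is used, otherwise no URL is recorded. A hedged sketch, assuming `subds` is a Dataset whose repo provides the two methods already used above.

def guess_submodule_url(subds):
    # use the URL of the remote tracked by the active branch, if any
    remote, branch = subds.repo.get_tracking_branch()
    return subds.repo.get_remote_url(remote) if remote else None
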
Beispiel #55
0
    def repo(self):
        """Get an instance of the version control system/repo for this dataset,
        or None if there is none yet (or none anymore).

        If testing the validity of a GitRepo instance is guaranteed to be
        really cheap, this could also serve as a test of whether a repo is
        present.

        Note that this property is evaluated every time it is used. If used
        multiple times within a function, it is probably a good idea to store
        its value in a local variable and use that variable instead.

        Returns
        -------
        GitRepo or AnnexRepo
        """

        # If we already got a *Repo instance, check whether it's still valid;
        # Note, that this basically does part of the testing that would
        # (implicitly) be done in the loop below again. So, there's still
        # potential to speed up when we actually need to get a new instance
        # (or none). But it's still faster for the vast majority of cases.
        #
        # TODO: Dig deeper into it and melt with new instance guessing. This
        # should also involve to reduce redundancy of testing such things from
        # within Flyweight.__call__, AnnexRepo.__init__ and GitRepo.__init__!
        #
        # Also note, that this could be forged into a single big condition, but
        # that is hard to read and we should be well aware of the actual
        # criteria here:
        if self._repo is not None and realpath(self.path) == self._repo.path:
            # we got a repo and path references still match
            if isinstance(self._repo, AnnexRepo):
                # it's supposed to be an annex
                if self._repo is AnnexRepo._unique_instances.get(
                        self._repo.path, None) and \
                        AnnexRepo.is_valid_repo(self._repo.path,
                                                allow_noninitialized=True):
                    # it's still the object registered as flyweight and it's a
                    # valid annex repo
                    return self._repo
            elif isinstance(self._repo, GitRepo):
                # it's supposed to be a plain git
                if self._repo is GitRepo._unique_instances.get(
                        self._repo.path, None) and \
                        GitRepo.is_valid_repo(self._repo.path) and not \
                        self._repo.is_with_annex():
                    # it's still the object registered as flyweight, it's a
                    # valid git repo and it hasn't turned into an annex
                    return self._repo

        # Note: Although it looks like the "self._repo = None" assignments
        # could be used instead of variable "valid", that's a big difference!
        # The *Repo instances are flyweights, not singletons. self._repo might
        # be the last reference, which would lead to those objects being
        # destroyed and therefore the constructor call would result in an
        # actually new instance. This is unnecessarily costly.
        valid = False
        for cls, ckw, kw in (
                # TODO: Do we really want to allow_noninitialized=True here?
                # And if so, leave a proper comment!
                (AnnexRepo, {'allow_noninitialized': True}, {'init': False}),
                (GitRepo, {}, {})
        ):
            if cls.is_valid_repo(self._path, **ckw):
                try:
                    lgr.log(5, "Detected %s at %s", cls, self._path)
                    self._repo = cls(self._path, create=False, **kw)
                    valid = True
                    break
                except (InvalidGitRepositoryError, NoSuchPathError,
                        InvalidAnnexRepositoryError) as exc:
                    lgr.log(5,
                            "Oops -- guess on repo type was wrong?: %s",
                            exc_str(exc))

        if not valid:
            self._repo = None

        if self._repo is None:
            # Often .repo is requested to 'sense' if anything is installed
            # underneath, and if so -- to proceed forward. Thus log here only
            # at DEBUG level and, if necessary, "complain upstairs"
            lgr.log(5, "Failed to detect a valid repo at %s", self.path)
        elif due.active:
            # Makes sense only on installed dataset - @never_fail'ed
            duecredit_dataset(self)

        return self._repo
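
The `repo` property above revalidates a cached flyweight instance before reusing it. Below is a simplified, generic sketch of that pattern; the class is illustrative and not DataLad API.

import os.path

class CachedThing:
    _unique_instances = {}

    def __init__(self, path):
        self.path = path
        # register this instance as the one object associated with `path`
        CachedThing._unique_instances[path] = self

    @classmethod
    def is_valid(cls, path):
        # stand-in for a validity test such as GitRepo.is_valid_repo()
        return os.path.isdir(path)


def get_cached(path, cached=None):
    if cached is not None and \
            cached is CachedThing._unique_instances.get(cached.path, None) and \
            CachedThing.is_valid(cached.path):
        # still the registered object and still valid -> reuse it
        return cached
    # otherwise create a fresh instance, or None if nothing valid is there
    return CachedThing(path) if CachedThing.is_valid(path) else None
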
Beispiel #56
0
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset "
                "(none was given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[p] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                preserved_paths = [
                    r for r in requested_paths
                    if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is even
                # a dataset (without this info) -> record whether this is a
                # known subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
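
Finally, a hypothetical sketch of the annotation and deduplication flow in the loop above: raw path strings become annotated-path dicts, and a `reported_paths` mapping ensures each path is reported only once. The helper below mimics, but is not, the real rawpath2ap helper, and the sample inputs are made up.

import os.path as op

def to_annotated_path(spec, refds_path):
    # dicts are already annotated paths, strings are resolved against the
    # reference dataset
    if isinstance(spec, dict):
        return dict(spec)
    p = spec if op.isabs(spec) else op.normpath(op.join(refds_path, spec))
    return {'path': p}

def annotate(requested, refds_path):
    reported_paths = {}
    for spec in requested:
        ap = to_annotated_path(spec, refds_path)
        if ap['path'] in reported_paths:
            # already reported, skip duplicates
            continue
        reported_paths[ap['path']] = ap
        yield ap

print(list(annotate(['a', 'a', {'path': '/ds/b'}], '/ds')))
# -> [{'path': '/ds/a'}, {'path': '/ds/b'}] (POSIX paths)
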