Example #1
0
def test_path_startswith():
    ok_(path_startswith('/a/b', '/a'))
    ok_(path_startswith('/a/b', '/a/b'))
    ok_(path_startswith('/a/b', '/a/b/'))
    ok_(path_startswith('/a/b/', '/a/b'))
    ok_(path_startswith('/a/b', '/'))
    ok_(path_startswith('/aaa/b/c', '/aaa'))
    nok_(path_startswith('/aaa/b/c', '/aa'))
    nok_(path_startswith('/a/b', '/a/c'))
    nok_(path_startswith('/a/b/c', '/a/c'))
    # must not mix relative and abs
    assert_raises(ValueError, path_startswith, 'a/b', '/a')
    assert_raises(ValueError, path_startswith, '/a/b', 'a')
Example #2
0
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = [
        'git',
        '--work-tree=.',
        'status',
        '--porcelain',
        # file names NULL terminated
        '-z',
        # we never want to touch submodules, they cannot be untracked
        '--ignore-submodules=all',
        # fully untracked dirs as such, the rest as files
        '--untracked={}'.format(report_untracked)
    ]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(cmd,
                                                   log_stderr=True,
                                                   log_stdout=True,
                                                   log_online=False,
                                                   expect_stderr=False,
                                                   shell=False,
                                                   expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    from datalad.utils import assure_unicode

    for line in stdout.split('\0'):
        if not line:
            continue
        line = assure_unicode(line)
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(norm_apath == p or path_startswith(apath, p)
                             for p in paths):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(path=norm_apath,
                  parentds=dspath,
                  state='untracked',
                  type='directory' if isdir(apath) else 'file')
        yield ap
Example #3
0
def check_datasets_order(res, order='bottom-up'):
    """Check that all type=dataset records not violating the expected order

    it is somewhat weak test, i.e. records could be produced so we
    do not detect that order is violated, e.g. a/b c/d would satisfy
    either although they might be neither depth nor breadth wise.  But
    this test would allow to catch obvious violations like a, a/b, a
    """
    prev = None
    for r in res:
        if r.get('type') != 'dataset':
            continue
        if prev and r['path'] != prev:
            if order == 'bottom-up':
                assert_false(path_startswith(r['path'], prev))
            elif order == 'top-down':
                assert_false(path_startswith(prev, r['path']))
            else:
                raise ValueError(order)
        prev = r['path']
Example #4
0
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = ['git', '--work-tree=.', 'status', '--porcelain',
           # file names NULL terminated
           '-z',
           # we never want to touch submodules, they cannot be untracked
           '--ignore-submodules=all',
           # fully untracked dirs as such, the rest as files
           '--untracked={}'.format(report_untracked)]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(
            cmd,
            log_stderr=True,
            log_stdout=True,
            log_online=False,
            expect_stderr=False,
            shell=False,
            expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    from datalad.utils import assure_unicode

    for line in stdout.split('\0'):
        if not line:
            continue
        line = assure_unicode(line)
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(norm_apath == p or path_startswith(apath, p)
                             for p in paths):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(
            path=norm_apath,
            parentds=dspath,
            state='untracked',
            type='directory' if isdir(apath) else 'file')
        yield ap
Example #5
0
def discover_dataset_trace_to_targets(basepath,
                                      targetpaths,
                                      current_trace,
                                      spec,
                                      includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accummulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = ensure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(t for t in targetpaths
                                   if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p,
                downward_targets,
                current_trace,
                spec,
                includeds=includeds
                if not includeds else includeds.intersection(downward_targets))
    undiscovered_ds = [
        t for t in undiscovered_ds if includeds
        and path_is_subpath(t, current_trace[-1]) and t in includeds
    ]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1],
                                               set()).union(undiscovered_ds)
Example #6
0
def annotated2content_by_ds(annotated, refds_path):
    """Helper to convert annotated paths into an old-style content_by_ds dict

    Only items with an `status` property value not equal to 'ok', 'notneeded',
    'impossible', or 'error' are sorted. All others are considered as
    already processed and are returned in a separate list.

    Parameters
    ----------
    annotated : list or generator
      Dicts with annotated path information.
    refds_path : str
      Path to the reference dataset the original path annotation was based on.

    Returns
    -------
    dict, dict, list, list
      Dict keys are dataset paths, values are full info dicts.
      The keys in the second dict are paths to dataset, values are
      dicts with all known properties about those datasets.
      The first list contains all already "processed" results, which
      typically need to be re-yielded. The second list contains items (same
      type as dict values) for all annotated paths that have no associated
      parent dataset (i.e. nondataset paths) -- this list will be empty by
      default, unless `nondataset_path_status` was set to ''."""
    content_by_ds = OrderedDict()
    ds_props = {}
    nondataset_paths = []
    completed = []
    for r in annotated:
        r_path = r['path']
        if r.get('type', None) == 'dataset':
            # collect all properties of all known datasets from the annotated
            # paths
            dp = ds_props.get(r_path, {})
            dp.update(r)
            ds_props[r_path] = dp
        if r.get('status', None) in ('ok', 'notneeded', 'impossible', 'error'):
            completed.append(r)
            continue
        parentds = r.get('parentds', None)
        appendto = []  # what entries, if any, to append r to
        if r.get('type', None) == 'dataset':
            # do dataset handling first, it is the more complex beast
            orig_request = r.get('orig_request', None)
            if parentds is None or refds_path is None or \
                    r.get('process_content', False) or (orig_request and (
                    orig_request == curdir or
                    orig_request.endswith(dirsep) or
                    orig_request.endswith('{}{}'.format(dirsep, curdir)))):
                # a dataset that floats by on its own OR
                # behave similar to rsync, a trailing '/' indicates the
                # content rather then the dataset itself
                # in both cases we want to process this part as part
                # of the same dataset, and not any potential parent
                appendto += [r_path]
            if parentds and refds_path and \
                    path_startswith(parentds, refds_path):
                # put also in parentds record if there is any, and the parent
                # is underneath or identical to the reference dataset
                appendto += [parentds]
        else:
            # files and dirs
            # common case, something with a parentds
            appendto += [parentds]

        for e in appendto:
            if e not in content_by_ds:
                content_by_ds[e] = []
            content_by_ds[e] += [r]

    return content_by_ds, ds_props, completed, nondataset_paths
Example #7
0
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able too yield as fast a possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                [preserved_paths.append(r)
                 for r in requested_paths
                 if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, or is labeled a directory, or a dataset even
                # a dataset (without this info) -> record whether this is a known subdataset
                # to its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
Example #8
0
def _query_aggregated_metadata_singlepath(
        ds, agginfos, agg_base_path, qap, reporton, cache, dsmeta,
        contentinfo_objloc):
    """This is the workhorse of query_aggregated_metadata() for querying for a
    single path"""
    rpath = qap['rpath']
    containing_ds = qap['metaprovider']
    qtype = qap.get('type', None)
    if (rpath == op.curdir or rpath == containing_ds) and \
            ((reporton is None and qtype == 'dataset') or \
             reporton in ('datasets', 'all')):
        # this is a direct match for a dataset (we only have agginfos for
        # datasets) -> prep result
        res = get_status_dict(
            status='ok',
            metadata=dsmeta,
            # normpath to avoid trailing dot
            path=op.normpath(op.join(ds.path, rpath)),
            type='dataset')
        # all info on the dataset is gathered -> eject
        yield res

    if (reporton is None and qtype != 'file') or reporton not in (None, 'files', 'all'):
        return

    #
    # everything that follows is about content metadata
    #
    # content info dicts have metadata stored under paths that are relative
    # to the dataset they were aggregated from
    rparentpath = op.relpath(rpath, start=containing_ds)

    # so we have some files to query, and we also have some content metadata
    contentmeta = _load_xz_json_stream(
        op.join(agg_base_path, contentinfo_objloc),
        cache=cache['objcache']) if contentinfo_objloc else {}

    for fpath in [f for f in contentmeta.keys()
                  if rparentpath == op.curdir or
                  path_startswith(f, rparentpath)]:
        # we might be onto something here, prepare result
        metadata = contentmeta.get(fpath, {})

        # we have to pull out the context for each extractor from the dataset
        # metadata
        for tlk in metadata:
            if tlk.startswith('@'):
                continue
            context = dsmeta.get(tlk, {}).get('@context', None)
            if context is None:
                continue
            metadata[tlk]['@context'] = context
        if '@context' in dsmeta:
            metadata['@context'] = dsmeta['@context']

        res = get_status_dict(
            status='ok',
            # the specific match within the containing dataset
            # normpath() because containing_ds could be `op.curdir`
            path=op.normpath(op.join(ds.path, containing_ds, fpath)),
            # we can only match files
            type='file',
            metadata=metadata)
        yield res
Example #9
0
def _get_submodules(dspath, fulfilled, recursive, recursion_limit, contains,
                    bottomup, set_property, delete_property, refds_path):
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    # TODO bring back in more global scope from below once segfaults are
    # figured out
    #if set_property or delete_property:
    #    gitmodule_path = opj(dspath, ".gitmodules")
    #    parser = GitConfigParser(
    #        gitmodule_path, read_only=False, merge_includes=False)
    #    parser.read()
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath):
        if contains and not path_startswith(contains, sm['path']):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            gitmodule_path = opj(dspath, ".gitmodules")
            parser = GitConfigParser(gitmodule_path,
                                     read_only=False,
                                     merge_includes=False)
            parser.read()
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(sm,
                               refds_relpath=relpath(sm['path'], refds_path),
                               refds_relname=relpath(sm['path'], refds_path).
                               replace(os.sep, '-')))
                parser.set_value(submodule_section, prop, val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).add(
                '.gitmodules',
                to_git=True,
                message='[DATALAD] modified subdataset properties')
            # let go of resources, locks, ...
            parser.release()

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict('subdataset',
                                   status='ok',
                                   type='dataset',
                                   logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'], fulfilled, recursive,
                (recursion_limit -
                 1) if isinstance(recursion_limit, int) else recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
                yield r
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
def annotated2content_by_ds(annotated, refds_path):
    """Helper to convert annotated paths into an old-style content_by_ds dict

    Only items with an `status` property value not equal to 'ok', 'notneeded',
    'impossible', or 'error' are sorted. All others are considered as
    already processed and are returned in a separate list.

    Parameters
    ----------
    annotated : list or generator
      Dicts with annotated path information.
    refds_path : str
      Path to the reference dataset the original path annotation was based on.

    Returns
    -------
    dict, dict, list, list
      Dict keys are dataset paths, values are full info dicts.
      The keys in the second dict are paths to dataset, values are
      dicts with all known properties about those datasets.
      The first list contains all already "processed" results, which
      typically need to be re-yielded. The second list contains items (same
      type as dict values) for all annotated paths that have no associated
      parent dataset (i.e. nondataset paths) -- this list will be empty by
      default, unless `nondataset_path_status` was set to ''."""
    content_by_ds = OrderedDict()
    ds_props = {}
    nondataset_paths = []
    completed = []
    for r in annotated:
        r_path = r['path']
        if r.get('type', None) == 'dataset':
            # collect all properties of all known datasets from the annotated
            # paths
            dp = ds_props.get(r_path, {})
            dp.update(r)
            ds_props[r_path] = dp
        if r.get('status', None) in ('ok', 'notneeded', 'impossible', 'error'):
            completed.append(r)
            continue
        parentds = r.get('parentds', None)
        appendto = []  # what entries, if any, to append r to
        if r.get('type', None) == 'dataset':
            # do dataset handling first, it is the more complex beast
            orig_request = r.get('orig_request', None)
            if parentds is None or refds_path is None or \
                    r.get('process_content', False) or (orig_request and (
                    orig_request == curdir or
                    orig_request.endswith(dirsep) or
                    orig_request.endswith('{}{}'.format(dirsep, curdir)))):
                # a dataset that floats by on its own OR
                # behave similar to rsync, a trailing '/' indicates the
                # content rather then the dataset itself
                # in both cases we want to process this part as part
                # of the same dataset, and not any potential parent
                appendto += [r_path]
            if parentds and refds_path and \
                    path_startswith(parentds, refds_path):
                # put also in parentds record if there is any, and the parent
                # is underneath or identical to the reference dataset
                appendto += [parentds]
        else:
            # files and dirs
            # common case, something with a parentds
            appendto += [parentds]

        for e in appendto:
            if e not in content_by_ds:
                content_by_ds[e] = []
            content_by_ds[e] += [r]

    return content_by_ds, ds_props, completed, nondataset_paths
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 action=None,
                 unavailable_path_status='',
                 unavailable_path_msg=None,
                 nondataset_path_status='error',
                 force_parentds_discovery=True,
                 force_subds_discovery=True,
                 force_no_revision_change_discovery=True,
                 force_untracked_discovery=True,
                 modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able too yield as fast a possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None
                                     or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)"
            )

        # prep common result props
        res_kwargs = dict(action=action if action else 'annotate_path',
                          refds=refds_path,
                          logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(refds, refds_path, action,
                                         recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = ensure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = _resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(**dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                [
                    preserved_paths.append(r) for r in requested_paths
                    if not lexists(r['path'] if isinstance(r, dict) else r)
                ]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # Possibly to be used "cache" of known subdatasets per each parent
        # to avoid re-querying subdatasets per each path.  The assumption here
        # is that the list of sub-datasets for a given parent should not change
        # through the execution of this loop, which (hypothetically) could be
        # incorrect while annotating paths for some commands.
        # TODO: verify this assumption and possibly add an argument to turn
        #  caching off if/when needed, or provide some other way to invalidate
        #  it
        subdss_cache = {}

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(
                    opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or
                                           (refds_path
                                            and _with_sep(oneupdir).startswith(
                                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(
                            normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                (path_type == 'dataset'
                 and 'registered_subds' not in path_props)
                    or path_type == 'directory' or not lexists(path)):
                # if the path doesn't exist, or is labeled a directory, or a dataset even
                # a dataset (without this info) -> record whether this is a known subdataset
                # to its parent
                containing_ds = Dataset(parent)
                # Possibly "cache" the list of known subdss for parents we
                # have encountered so far
                if parent in subdss_cache:
                    subdss = subdss_cache[parent]
                else:
                    subdss = containing_ds.subdatasets(fulfilled=None,
                                                       recursive=False,
                                                       result_xfm=None,
                                                       result_filter=None,
                                                       return_type='list')
                    subdss_cache[parent] = subdss
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get('status',
                                               unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(
                        parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action,
                                         recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=
                        force_no_revision_change_discovery,
                        report_untracked='all'
                        if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
Example #12
0
def _get_submodules(dspath, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    # TODO bring back in more global scope from below once segfaults are
    # figured out
    #if set_property or delete_property:
    #    gitmodule_path = opj(dspath, ".gitmodules")
    #    parser = GitConfigParser(
    #        gitmodule_path, read_only=False, merge_includes=False)
    #    parser.read()
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath):
        if contains and not path_startswith(contains, sm['path']):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            gitmodule_path = opj(dspath, ".gitmodules")
            parser = GitConfigParser(
                gitmodule_path, read_only=False, merge_includes=False)
            parser.read()
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=relpath(sm['path'], refds_path),
                            refds_relname=relpath(sm['path'], refds_path).replace(os.sep, '-')))
                parser.set_value(
                    submodule_section,
                    prop,
                    val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).add(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')
            # let go of resources, locks, ...
            parser.release()

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'],
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
Example #13
0
    def __call__(
            path=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            save=True,
            annex_version=None,
            annex_backend='MD5E',
            native_metadata_type=None,
            shared_access=None,
            git_opts=None,
            annex_opts=None,
            annex_init_opts=None,
            text_no_annex=None
    ):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # Given a path and a dataset (path) not pointing to installed
            # dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError("force should be bool, got %r.  Did you mean to provide a 'path'?" % force)
        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else getpwd() if dataset is None else None,
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error' \
                if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError("`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            try:
                subds = next(
                    sds
                    for sds in Subdatasets.__call__(
                        dataset=path['parentds'],
                        # any known
                        fulfilled=None,
                        recursive=False,
                        result_xfm='paths')
                    if path_startswith(path['path'], sds)
                )
                path.update({
                    'status': 'error',
                    'message': ('collision with known subdataset %s/ in dataset %s',
                                relpath(subds, path['parentds']), path['parentds'])})
                yield path
                return
            except StopIteration:
                # all good
                pass

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield path
            return

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(
                tbds.path,
                url=None,
                create=True,
                git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                backend=annex_backend,
                version=annex_version,
                description=description,
                git_opts=git_opts,
                annex_opts=annex_opts,
                annex_init_opts=annex_init_opts
            )

            if text_no_annex:
                git_attributes_file = opj(tbds.path, '.gitattributes')
                with open(git_attributes_file, 'a') as f:
                    f.write('* annex.largefiles=(not(mimetype=text/*))\n')
                tbrepo.add([git_attributes_file], git=True)
                tbrepo.commit(
                    "Instructed annex to add text files to git",
                    _datalad_msg=True,
                    files=[git_attributes_file]
                )

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(
            id_var,
            tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
            where='dataset')

        # make sure that v6 annex repos never commit content under .datalad
        with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
            # TODO this will need adjusting, when annex'ed aggregate meta data
            # comes around
            gitattr.write('# Text files (according to file --mime-type) are added directly to git.\n')
            gitattr.write('# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n')
            gitattr.write('** annex.largefiles=nothing\n')

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add('.datalad', to_git=True, save=save,
                 message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(
                    tbds.path,
                    save=True,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path
Example #14
0
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be in item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accummulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths) # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds else includeds.intersection(
                    downward_targets))
    undiscovered_ds = [t for t in undiscovered_ds
                       if includeds and
                          path_is_subpath(t, current_trace[-1]) and
                          t in includeds]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1], set()).union(
                undiscovered_ds)
Example #15
0
def _get_metadatarelevant_paths(ds, subds_relpaths):
    return (f for f in ds.repo.get_files() if not any(
        path_startswith(f, ex)
        for ex in list(exclude_from_metadata) + subds_relpaths))
Example #16
0
def _query_aggregated_metadata_singlepath(ds, agginfos, agg_base_path, qap,
                                          reporton, cache, dsmeta,
                                          contentinfo_objloc):
    """This is the workhorse of query_aggregated_metadata() for querying for a
    single path"""
    rpath = qap['rpath']
    containing_ds = qap['metaprovider']
    qtype = qap.get('type', None)
    if (rpath == curdir or rpath == containing_ds) and \
            ((reporton is None and qtype == 'dataset') or \
             reporton in ('datasets', 'all')):
        # this is a direct match for a dataset (we only have agginfos for
        # datasets) -> prep result
        res = get_status_dict(
            status='ok',
            metadata=dsmeta,
            # normpath to avoid trailing dot
            path=normpath(opj(ds.path, rpath)),
            type='dataset')
        # all info on the dataset is gathered -> eject
        yield res

    if (reporton is None
            and qtype != 'file') or reporton not in (None, 'files', 'all'):
        return

    #
    # everything that follows is about content metadata
    #
    # content info dicts have metadata stored under paths that are relative
    # to the dataset they were aggregated from
    rparentpath = relpath(rpath, start=containing_ds)

    # so we have some files to query, and we also have some content metadata
    contentmeta = _load_xz_json_stream(
        opj(agg_base_path, contentinfo_objloc),
        cache=cache['objcache']) if contentinfo_objloc else {}

    for fpath in [
            f for f in contentmeta.keys()
            if rparentpath == curdir or path_startswith(f, rparentpath)
    ]:
        # we might be onto something here, prepare result
        metadata = MetadataDict(contentmeta.get(fpath, {}))

        # we have to pull out the context for each extractor from the dataset
        # metadata
        for tlk in metadata:
            if tlk.startswith('@'):
                continue
            context = dsmeta.get(tlk, {}).get('@context', None)
            if context is None:
                continue
            metadata[tlk]['@context'] = context
        if '@context' in dsmeta:
            metadata['@context'] = dsmeta['@context']

        res = get_status_dict(
            status='ok',
            # the specific match within the containing dataset
            # normpath() because containing_ds could be `curdir`
            path=normpath(opj(ds.path, containing_ds, fpath)),
            # we can only match files
            type='file',
            metadata=metadata)
        yield res
Example #17
0
def _get_metadatarelevant_paths(ds, subds_relpaths):
    return (f for f in ds.repo.get_files()
            if not any(path_startswith(f, ex)
                       for ex in list(exclude_from_metadata) + subds_relpaths))