def test_path_startswith():
    ok_(path_startswith('/a/b', '/a'))
    ok_(path_startswith('/a/b', '/a/b'))
    ok_(path_startswith('/a/b', '/a/b/'))
    ok_(path_startswith('/a/b/', '/a/b'))
    ok_(path_startswith('/a/b', '/'))
    ok_(path_startswith('/aaa/b/c', '/aaa'))
    nok_(path_startswith('/aaa/b/c', '/aa'))
    nok_(path_startswith('/a/b', '/a/c'))
    nok_(path_startswith('/a/b/c', '/a/c'))
    # must not mix relative and abs
    assert_raises(ValueError, path_startswith, 'a/b', '/a')
    assert_raises(ValueError, path_startswith, '/a/b', 'a')
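
# Illustrative sketch (not DataLad's implementation): the test above pins down
# the semantics of `path_startswith` -- matching happens on whole path
# components, trailing separators are ignored, and mixing relative with
# absolute paths is an error.  A hypothetical `_path_startswith` with those
# properties could look like this:
import os.path as op


def _path_startswith(path, prefix):
    if op.isabs(path) != op.isabs(prefix):
        # mirror the behaviour exercised in the test above
        raise ValueError("must not mix relative and absolute paths")
    path, prefix = op.normpath(path), op.normpath(prefix)
    if path == prefix:
        return True
    # only match at component boundaries: '/aa' must not match '/aaa/b/c'
    return path.startswith(prefix.rstrip(op.sep) + op.sep)


assert _path_startswith('/a/b', '/a')
assert not _path_startswith('/aaa/b/c', '/aa')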
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = [
        'git', '--work-tree=.', 'status', '--porcelain',
        # file names NULL terminated
        '-z',
        # we never want to touch submodules, they cannot be untracked
        '--ignore-submodules=all',
        # fully untracked dirs as such, the rest as files
        '--untracked={}'.format(report_untracked)
    ]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(
            cmd,
            log_stderr=True,
            log_stdout=True,
            log_online=False,
            expect_stderr=False,
            shell=False,
            expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    from datalad.utils import assure_unicode

    for line in stdout.split('\0'):
        if not line:
            continue
        line = assure_unicode(line)
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(norm_apath == p or path_startswith(apath, p)
                             for p in paths):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(
            path=norm_apath,
            parentds=dspath,
            state='untracked',
            type='directory' if isdir(apath) else 'file')
        yield ap
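
# Stand-alone sketch of the parsing done above, using plain subprocess instead
# of DataLad's GitRunner.  `untracked_paths` is a made-up name, and the long
# form `--untracked-files` is used here instead of the abbreviated
# `--untracked={mode}`; whitelist filtering as in the generator above would be
# layered on top of this.
import os.path as op
import subprocess


def untracked_paths(dspath):
    """Yield normalized paths of untracked content reported by `git status`."""
    out = subprocess.run(
        ['git', 'status', '--porcelain', '-z',
         '--ignore-submodules=all', '--untracked-files=normal'],
        cwd=dspath, check=True, capture_output=True, text=True).stdout
    for entry in out.split('\0'):
        # untracked entries carry the '?? ' state marker
        if entry.startswith('?? '):
            yield op.normpath(op.join(dspath, entry[3:]))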
def check_datasets_order(res, order='bottom-up'):
    """Check that type=dataset records do not violate the expected order

    This is a somewhat weak test: records could be produced such that a
    violated order goes undetected, e.g. a/b, c/d would satisfy either order
    although they might be ordered neither depth- nor breadth-wise.  But it
    allows catching obvious violations like a, a/b, a
    """
    prev = None
    for r in res:
        if r.get('type') != 'dataset':
            continue
        if prev and r['path'] != prev:
            if order == 'bottom-up':
                assert_false(path_startswith(r['path'], prev))
            elif order == 'top-down':
                assert_false(path_startswith(prev, r['path']))
            else:
                raise ValueError(order)
        prev = r['path']
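
# Made-up example of the ordering contract checked above (plain asserts and
# str.startswith stand in for the test helpers and path_startswith): in
# 'bottom-up' order a dataset record must never lie underneath the dataset
# reported just before it.
results = [
    {'type': 'dataset', 'path': '/ds/sub/nested'},
    {'type': 'file', 'path': '/ds/sub/nested/data.dat'},
    {'type': 'dataset', 'path': '/ds/sub'},
    {'type': 'dataset', 'path': '/ds'},
]
ds_paths = [r['path'] for r in results if r.get('type') == 'dataset']
for prev, cur in zip(ds_paths, ds_paths[1:]):
    assert not cur.startswith(prev + '/')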
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = ensure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds
                else includeds.intersection(downward_targets))
    undiscovered_ds = [
        t for t in undiscovered_ds
        if includeds and
        path_is_subpath(t, current_trace[-1]) and
        t in includeds
    ]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(
                current_trace[-1], set()).union(undiscovered_ds)
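
# Simplified, self-contained sketch of the edge accumulation performed above:
# given which directories are datasets and one target path, record for each
# superdataset the next dataset on the way down.  `trace_edges` and the paths
# are made up; the real function walks the filesystem and git repos instead of
# receiving a precomputed set of dataset directories.
def trace_edges(dataset_dirs, target):
    spec = {}
    # datasets that lie on the way to the target, ordered top-down
    trace = sorted(
        d for d in dataset_dirs
        if target == d or target.startswith(d.rstrip('/') + '/'))
    for parent, child in zip(trace, trace[1:]):
        spec.setdefault(parent, set()).add(child)
    return spec


assert trace_edges(
    {'/ds', '/ds/sub', '/ds/sub/nested', '/other'},
    '/ds/sub/nested/file.dat'
) == {'/ds': {'/ds/sub'}, '/ds/sub': {'/ds/sub/nested'}}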
def annotated2content_by_ds(annotated, refds_path):
    """Helper to convert annotated paths into an old-style content_by_ds dict

    Only items with a `status` property value not equal to 'ok', 'notneeded',
    'impossible', or 'error' are sorted. All others are considered as already
    processed and are returned in a separate list.

    Parameters
    ----------
    annotated : list or generator
      Dicts with annotated path information.
    refds_path : str
      Path to the reference dataset the original path annotation was based
      on.

    Returns
    -------
    dict, dict, list, list
      Dict keys are dataset paths, values are full info dicts.
      The keys in the second dict are paths to dataset, values are
      dicts with all known properties about those datasets.
      The first list contains all already "processed" results, which
      typically need to be re-yielded. The second list contains items (same
      type as dict values) for all annotated paths that have no associated
      parent dataset (i.e. nondataset paths) -- this list will be empty by
      default, unless `nondataset_path_status` was set to ''."""
    content_by_ds = OrderedDict()
    ds_props = {}
    nondataset_paths = []
    completed = []
    for r in annotated:
        r_path = r['path']
        if r.get('type', None) == 'dataset':
            # collect all properties of all known datasets from the annotated
            # paths
            dp = ds_props.get(r_path, {})
            dp.update(r)
            ds_props[r_path] = dp

        if r.get('status', None) in ('ok', 'notneeded', 'impossible', 'error'):
            completed.append(r)
            continue

        parentds = r.get('parentds', None)
        appendto = []  # what entries, if any, to append r to
        if r.get('type', None) == 'dataset':
            # do dataset handling first, it is the more complex beast
            orig_request = r.get('orig_request', None)
            if parentds is None or refds_path is None or \
                    r.get('process_content', False) or (orig_request and (
                        orig_request == curdir or
                        orig_request.endswith(dirsep) or
                        orig_request.endswith(
                            '{}{}'.format(dirsep, curdir)))):
                # a dataset that floats by on its own OR
                # behave similar to rsync, a trailing '/' indicates the
                # content rather than the dataset itself
                # in both cases we want to process this part as part
                # of the same dataset, and not any potential parent
                appendto += [r_path]
            if parentds and refds_path and \
                    path_startswith(parentds, refds_path):
                # put also in parentds record if there is any, and the parent
                # is underneath or identical to the reference dataset
                appendto += [parentds]
        else:
            # files and dirs
            # common case, something with a parentds
            appendto += [parentds]

        for e in appendto:
            if e not in content_by_ds:
                content_by_ds[e] = []
            content_by_ds[e] += [r]

    return content_by_ds, ds_props, completed, nondataset_paths
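
# Self-contained sketch of the grouping rule above, with made-up records:
# non-completed items are binned under their parent dataset, and a dataset
# requested with a trailing separator (rsync-style "content, not container")
# is additionally binned under itself.  This is a simplified illustration,
# not the function's full logic.
from collections import OrderedDict

annotated = [
    {'path': '/ds/file.txt', 'type': 'file', 'parentds': '/ds'},
    {'path': '/ds/sub', 'type': 'dataset', 'parentds': '/ds',
     'orig_request': 'sub/'},
    {'path': '/ds/done.txt', 'type': 'file', 'parentds': '/ds',
     'status': 'ok'},
]
content_by_ds = OrderedDict()
completed = []
for r in annotated:
    if r.get('status') in ('ok', 'notneeded', 'impossible', 'error'):
        completed.append(r)
        continue
    targets = [r['parentds']]
    if r['type'] == 'dataset' and r.get('orig_request', '').endswith('/'):
        targets.insert(0, r['path'])
    for t in targets:
        content_by_ds.setdefault(t, []).append(r)

assert list(content_by_ds) == ['/ds', '/ds/sub']
assert len(completed) == 1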
def _query_aggregated_metadata_singlepath(
        ds, agginfos, agg_base_path, qap, reporton, cache, dsmeta,
        contentinfo_objloc):
    """This is the workhorse of query_aggregated_metadata() for querying for a
    single path"""
    rpath = qap['rpath']
    containing_ds = qap['metaprovider']
    qtype = qap.get('type', None)
    if (rpath == op.curdir or rpath == containing_ds) and \
            ((reporton is None and qtype == 'dataset') or
             reporton in ('datasets', 'all')):
        # this is a direct match for a dataset (we only have agginfos for
        # datasets) -> prep result
        res = get_status_dict(
            status='ok',
            metadata=dsmeta,
            # normpath to avoid trailing dot
            path=op.normpath(op.join(ds.path, rpath)),
            type='dataset')
        # all info on the dataset is gathered -> eject
        yield res

    if (reporton is None and qtype != 'file') \
            or reporton not in (None, 'files', 'all'):
        return

    #
    # everything that follows is about content metadata
    #

    # content info dicts have metadata stored under paths that are relative
    # to the dataset they were aggregated from
    rparentpath = op.relpath(rpath, start=containing_ds)

    # so we have some files to query, and we also have some content metadata
    contentmeta = _load_xz_json_stream(
        op.join(agg_base_path, contentinfo_objloc),
        cache=cache['objcache']) if contentinfo_objloc else {}

    for fpath in [f for f in contentmeta.keys()
                  if rparentpath == op.curdir or
                  path_startswith(f, rparentpath)]:
        # we might be onto something here, prepare result
        metadata = contentmeta.get(fpath, {})
        # we have to pull out the context for each extractor from the dataset
        # metadata
        for tlk in metadata:
            if tlk.startswith('@'):
                continue
            context = dsmeta.get(tlk, {}).get('@context', None)
            if context is None:
                continue
            metadata[tlk]['@context'] = context
        if '@context' in dsmeta:
            metadata['@context'] = dsmeta['@context']

        res = get_status_dict(
            status='ok',
            # the specific match within the containing dataset
            # normpath() because containing_ds could be `op.curdir`
            path=op.normpath(op.join(ds.path, containing_ds, fpath)),
            # we can only match files
            type='file',
            metadata=metadata)
        yield res
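
# Self-contained sketch of the '@context' propagation done above: each
# extractor's dataset-level JSON-LD context is copied onto the per-file record
# of the same extractor.  Extractor names and metadata values are made up.
dsmeta = {
    '@context': {'@vocab': 'http://schema.org/'},
    'extractor_a': {'@context': {'unit': 'http://example.com/unit'}},
}
filemeta = {'extractor_a': {'size': 42}, 'extractor_b': {'label': 'x'}}
for extractor, record in filemeta.items():
    context = dsmeta.get(extractor, {}).get('@context')
    if context is not None:
        record['@context'] = context
if '@context' in dsmeta:
    filemeta['@context'] = dsmeta['@context']

assert filemeta['extractor_a']['@context'] == {'unit': 'http://example.com/unit'}
assert '@context' not in filemeta['extractor_b']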
def _get_submodules(dspath, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    # TODO bring back in more global scope from below once segfaults are
    # figured out
    #if set_property or delete_property:
    #    gitmodule_path = opj(dspath, ".gitmodules")
    #    parser = GitConfigParser(
    #        gitmodule_path, read_only=False, merge_includes=False)
    #    parser.read()
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath):
        if contains and not path_startswith(contains, sm['path']):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            gitmodule_path = opj(dspath, ".gitmodules")
            parser = GitConfigParser(
                gitmodule_path, read_only=False, merge_includes=False)
            parser.read()
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=relpath(sm['path'], refds_path),
                            refds_relname=relpath(
                                sm['path'], refds_path).replace(os.sep, '-')))
                parser.set_value(
                    submodule_section,
                    prop,
                    val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).add(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')
            # let go of resources, locks, ...
            parser.release()

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'],
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
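
# Sketch of the '<...>' template expansion used for `set_property` above: a
# value wrapped in angle brackets is treated as a format string and filled
# with the submodule record plus two derived fields.  The submodule record,
# reference path, and URL are made up for illustration.
import os
from os.path import relpath

sm = {'path': '/super/code/module', 'gitmodule_name': 'module'}
refds_path = '/super'
val = '<ria+ssh://store.example.org/{refds_relname}>'
if val.startswith('<') and val.endswith('>') and '{' in val:
    val = val[1:-1].format(
        **dict(sm,
               refds_relpath=relpath(sm['path'], refds_path),
               refds_relname=relpath(
                   sm['path'], refds_path).replace(os.sep, '-')))

assert val == 'ria+ssh://store.example.org/code-module'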
def __call__(path=None,
             dataset=None,
             recursive=False,
             recursion_limit=None,
             action=None,
             unavailable_path_status='',
             unavailable_path_msg=None,
             nondataset_path_status='error',
             force_parentds_discovery=True,
             force_subds_discovery=True,
             force_no_revision_change_discovery=True,
             force_untracked_discovery=True,
             modified=None):
    # upfront check for the fastest possible response
    if not path and dataset is None:
        # nothing given, try "here", but do not use `require_dataset`, as
        # it will determine the root dataset of `curdir` and further down
        # lead to path annotation of upstairs directories
        dataset = curdir

    if force_subds_discovery and not force_parentds_discovery:
        raise ValueError(
            'subdataset discovery requires parent dataset discovery')

    # CONCEPT: yield with no status to indicate further processing

    # everything in one big loop to be able to yield as fast as possible
    # without any precomputing for all paths
    refds_path = Interface.get_refds_path(dataset)
    if modified is not None and (refds_path is None or
                                 not GitRepo.is_valid_repo(refds_path)):
        raise ValueError(
            "modification detection only works with a base dataset (non-given or found)")

    # prep common result props
    res_kwargs = dict(action=action if action else 'annotate_path',
                      refds=refds_path,
                      logger=lgr)

    # handle the case of recursion into a single dataset without any
    # extra fancy processing first -- full recursion can be done
    # faster than manual recursion, hence we gain quite some speed
    # from these few lines of extra code
    if not modified and not path and refds_path:
        if not GitRepo.is_valid_repo(refds_path):
            yield get_status_dict(
                # doesn't matter if the path is in another dataset
                # it was given as reference dataset
                status=nondataset_path_status,
                message='given reference dataset is not a dataset',
                path=refds_path,
                **res_kwargs)
            return

        refds = Dataset(refds_path)
        path = []
        # yield the dataset itself
        r = get_status_dict(ds=refds, status='', **res_kwargs)
        yield r

        if recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path itself
            for r in yield_recursive(refds, refds_path, action,
                                     recursion_limit):
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                yield r
        return

    # goal: structure in a way that makes most information on any path
    # available in a single pass, at the cheapest possible cost
    reported_paths = {}
    requested_paths = ensure_list(path)

    if modified is not None:
        # modification detection would silently kill all nondataset paths
        # but we have to complain about them, hence doing it here
        if requested_paths and refds_path:
            for r in requested_paths:
                p = r['path'] if isinstance(r, dict) else r
                p = _resolve_path(p, ds=refds_path)
                if path_startswith(p, refds_path):
                    # all good
                    continue
                # not the refds
                path_props = r if isinstance(r, dict) else {}
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with reference dataset'
                reported_paths[r] = res
                yield res

        # preserve non-existing paths to be silently killed by modification
        # detection and append them to requested_paths again after detection.
        # TODO: This might be melted in with treatment of non dataset paths
        # above. Re-appending those paths seems to be better than yielding
        # directly to avoid code duplication, since both cases later on are
        # dealt with again.
        preserved_paths = []
        if requested_paths:
            [preserved_paths.append(r)
             for r in requested_paths
             if not lexists(r['path'] if isinstance(r, dict) else r)]

        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # Possibly to be used "cache" of known subdatasets per each parent
    # to avoid re-querying subdatasets per each path. The assumption here
    # is that the list of sub-datasets for a given parent should not change
    # through the execution of this loop, which (hypothetically) could be
    # incorrect while annotating paths for some commands.
    # TODO: verify this assumption and possibly add an argument to turn
    # caching off if/when needed, or provide some other way to invalidate
    # it
    subdss_cache = {}

    # do not loop over unique(), this could be a list of dicts
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue

        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled to a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if not islink(path) and
                    GitRepo.is_valid_repo(path) else 'directory')
            # this could contain all types of additional content
            containing_dir = path if not islink(path) else normpath(
                opj(path, pardir))
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything else
                # it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or
                                       (refds_path and
                                        _with_sep(oneupdir).startswith(
                                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset, and
                    # only if we stay within this refds when searching for the
                    # parent
                    parent = get_dataset_root(
                        normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will determine
                    # whether a top-level dataset that was discovered gets the
                    # parent property or not, it won't get it without a common
                    # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset
                        # of the parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether there is
        # ANY dataset, not which one is the true parent, logic below relies on
        # the fact that we end here, if there is no dataset at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not path_startswith(dspath, refds_path):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and
                 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, or is labeled a directory, or a dataset even
            # a dataset (without this info) -> record whether this is a known subdataset
            # to its parent
            containing_ds = Dataset(parent)
            # Possibly "cache" the list of known subdss for parents we
            # have encountered so far
            if parent in subdss_cache:
                subdss = subdss_cache[parent]
            else:
                subdss = containing_ds.subdatasets(fulfilled=None,
                                                   recursive=False,
                                                   result_xfm=None,
                                                   result_filter=None,
                                                   return_type='list')
                subdss_cache[parent] = subdss
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a status
            # already
            res['status'] = path_props.get('status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself, if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(
                    parent) if containing_ds is None else containing_ds
            for r in yield_recursive(containing_ds, path, action,
                                     recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r

        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery
                    else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return
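
# Minimal sketch of the per-parent caching pattern used above: query the
# (potentially expensive) subdataset listing at most once per parent dataset.
# `list_subdatasets` is a made-up stand-in for the real `subdatasets()` call.
def list_subdatasets(parent):
    print('expensive query for', parent)
    return [{'path': parent + '/sub'}]


subdss_cache = {}


def known_subdatasets(parent):
    if parent not in subdss_cache:
        subdss_cache[parent] = list_subdatasets(parent)
    return subdss_cache[parent]


known_subdatasets('/ds')   # triggers the query
known_subdatasets('/ds')   # served from the cache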
def __call__(
        path=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=False,
        save=True,
        annex_version=None,
        annex_backend='MD5E',
        native_metadata_type=None,
        shared_access=None,
        git_opts=None,
        annex_opts=None,
        annex_init_opts=None,
        text_no_annex=None):

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD
    if path and dataset:
        # Given a path and a dataset (path) not pointing to installed
        # dataset
        if not dataset.is_installed():
            msg = "No installed dataset at %s found." % dataset.path
            dsroot = get_dataset_root(dataset.path)
            if dsroot:
                msg += " If you meant to add to the %s dataset, use that path " \
                       "instead but remember that if dataset is provided, " \
                       "relative paths are relative to the top of the " \
                       "dataset." % dsroot
            raise ValueError(msg)

    # sanity check first
    if git_opts:
        lgr.warning(
            "`git_opts` argument is presently ignored, please complain!")
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")
        if annex_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex and declaring no "
                             "annex repo.")
        if annex_init_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex init and declaring no "
                             "annex repo.")

    if not isinstance(force, bool):
        raise ValueError(
            "force should be bool, got %r. Did you mean to provide a 'path'?"
            % force)
    annotated_paths = AnnotatePaths.__call__(
        # nothing given explicitly, assume create fresh right here
        path=path if path else getpwd() if dataset is None else None,
        dataset=dataset,
        recursive=False,
        action='create',
        # we need to know whether we have to check for potential
        # subdataset collision
        force_parentds_discovery=True,
        # it is absolutely OK to have something that does not exist
        unavailable_path_status='',
        unavailable_path_msg=None,
        # if we have a dataset given that actually exists, we want to
        # fail if the requested path is not in it
        nondataset_path_status='error'
        if isinstance(dataset, Dataset) and dataset.is_installed() else '',
        on_failure='ignore')
    path = None
    for r in annotated_paths:
        if r['status']:
            # this is dealt with already
            yield r
            continue
        if path is not None:
            raise ValueError(
                "`create` can only handle single target path or dataset")
        path = r

    if len(annotated_paths) and path is None:
        # we got something, we complained already, done
        return

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # prep for yield
    path.update({'logger': lgr, 'type': 'dataset'})
    # just discard, we have a new story to tell
    path.pop('message', None)
    if 'parentds' in path:
        try:
            subds = next(
                sds for sds in Subdatasets.__call__(
                    dataset=path['parentds'],
                    # any known
                    fulfilled=None,
                    recursive=False,
                    result_xfm='paths')
                if path_startswith(path['path'], sds))
            path.update({
                'status': 'error',
                'message': (
                    'collision with known subdataset %s/ in dataset %s',
                    relpath(subds, path['parentds']),
                    path['parentds'])})
            yield path
            return
        except StopIteration:
            # all good
            pass

    if git_opts is None:
        git_opts = {}
    if shared_access:
        # configure `git --shared` value
        git_opts['shared'] = shared_access

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and \
        dataset.path == path['path'] else Dataset(path['path'])

    # don't create in non-empty directory without `force`:
    if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        path.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield path
        return

    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        GitRepo(
            tbds.path,
            url=None,
            create=True,
            git_opts=git_opts)
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            backend=annex_backend,
            version=annex_version,
            description=description,
            git_opts=git_opts,
            annex_opts=annex_opts,
            annex_init_opts=annex_init_opts)
        if text_no_annex:
            git_attributes_file = opj(tbds.path, '.gitattributes')
            with open(git_attributes_file, 'a') as f:
                f.write('* annex.largefiles=(not(mimetype=text/*))\n')
            tbrepo.add([git_attributes_file], git=True)
            tbrepo.commit(
                "Instructed annex to add text files to git",
                _datalad_msg=True,
                files=[git_attributes_file])

    if native_metadata_type is not None:
        if not isinstance(native_metadata_type, list):
            native_metadata_type = [native_metadata_type]
        for nt in native_metadata_type:
            tbds.config.add('datalad.metadata.nativetype', nt)

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a re-create
        tbds.config.unset(id_var, where='dataset')
    tbds.config.add(
        id_var,
        tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
        where='dataset')

    # make sure that v6 annex repos never commit content under .datalad
    with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
        # TODO this will need adjusting, when annex'ed aggregate meta data
        # comes around
        gitattr.write('# Text files (according to file --mime-type) are added directly to git.\n')
        gitattr.write('# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n')
        gitattr.write('** annex.largefiles=nothing\n')

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.add('.datalad', to_git=True, save=save,
             message='[DATALAD] new dataset')

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.add(
                tbds.path,
                save=True,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    path.update({'status': 'ok'})
    yield path
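
# Sketch of the two .gitattributes policies written above, reduced to the
# bare file operations (standard git-annex largefiles expressions; the
# temporary directory merely stands in for a dataset).
import os
import tempfile

dspath = tempfile.mkdtemp()
os.makedirs(os.path.join(dspath, '.datalad'))

# top level: keep text files (by MIME type) in git rather than in the annex
with open(os.path.join(dspath, '.gitattributes'), 'a') as f:
    f.write('* annex.largefiles=(not(mimetype=text/*))\n')

# .datalad/: never annex anything underneath it
with open(os.path.join(dspath, '.datalad', '.gitattributes'), 'a') as f:
    f.write('** annex.largefiles=nothing\n')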
def _get_metadatarelevant_paths(ds, subds_relpaths):
    return (f for f in ds.repo.get_files()
            if not any(path_startswith(f, ex)
                       for ex in list(exclude_from_metadata) + subds_relpaths))
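
# Self-contained illustration of the filtering above: drop anything under the
# excluded prefixes or under a subdataset.  The file list and prefixes are
# made up, and str.startswith stands in for path_startswith.
files = ['code/run.py', '.datalad/config', 'sub/inside.txt', 'README.md']
exclude_from_metadata = ['.datalad', '.git']
subds_relpaths = ['sub']

relevant = [
    f for f in files
    if not any(f == ex or f.startswith(ex + '/')
               for ex in exclude_from_metadata + subds_relpaths)
]
assert relevant == ['code/run.py', 'README.md']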