def _discover_subdatasets_recursively(
        discovered, top, trace, recursion_limit):
    # this beast walks the directory tree from a given `top` directory
    # and discovers valid repos that are scattered around, regardless
    # of whether they are already subdatasets or not
    # `trace` must be a list that has at least one element (the base
    # dataset)
    if recursion_limit is not None and len(trace) > recursion_limit:
        return
    if not isdir(top):
        return
    if not op.islink(top) and GitRepo.is_valid_repo(top):
        if top in discovered:
            # this was found already, assume everything beneath it too
            return
        discovered[top] = dict(
            path=top,
            # and its content
            process_content=True,
            type='dataset',
            parentds=trace[-1])
        # new node in the trace down
        trace = trace + [top]
    for path in listdir(top):
        path = opj(top, path)
        if not isdir(path):
            continue
        # next level down
        _discover_subdatasets_recursively(
            discovered, path, trace, recursion_limit)

def _parse_git_submodules(ds, paths):
    """All known ones with some properties"""
    if not (ds.pathobj / ".gitmodules").exists():
        # easy way out. if there is no .gitmodules file
        # we cannot have (functional) subdatasets
        return

    if paths:
        paths = [
            p.relative_to(ds.pathobj)
            for p in paths
            if ds.pathobj == p or ds.pathobj in p.parents
        ]
        if not paths:
            # we had path constraints, but none matched this dataset
            return
    for path, props in iteritems(
            ds.repo.get_content_info(
                paths=paths,
                ref=None,
                untracked='no',
                eval_file_type=False)):
        if props.get('type', None) != 'dataset':
            continue
        if ds.pathobj != ds.repo.pathobj:
            props['path'] = ds.pathobj / path.relative_to(ds.repo.pathobj)
        else:
            props['path'] = path
        if not path.exists() or not GitRepo.is_valid_repo(text_type(path)):
            props['state'] = 'absent'
        # TODO kill this after some time. We used to do custom things here
        # and gitshasum was called revision. Be nice and duplicate for a bit
        # wipe out when patience is gone
        props['revision'] = props['gitshasum']
        yield props

def test_submodule_deinit(path):
    from datalad.support.annexrepo import AnnexRepo

    top_repo = AnnexRepo(path, create=False)
    eq_({'subm 1', '2'},
        {s.name for s in top_repo.get_submodules()})
    # note: here init=True is ok, since we are using it just for testing
    with swallow_logs(new_level=logging.WARN) as cml:
        top_repo.update_submodule('subm 1', init=True)
        assert_in('Do not use update_submodule with init=True', cml.out)
    top_repo.update_submodule('2', init=True)

    # ok_(all([s.module_exists() for s in top_repo.get_submodules()]))
    # TODO: old assertion above if non-bare? (can't use "direct mode" in test_gitrepo)
    # Alternatively: New testrepo (plain git submodules) and have a dedicated
    # test for annexes in addition
    ok_(all([GitRepo.is_valid_repo(op.join(top_repo.path, s.path))
             for s in top_repo.get_submodules()]))

    # modify submodule:
    with open(op.join(top_repo.path, 'subm 1', 'file_ut.dat'), "w") as f:
        f.write("some content")

    assert_raises(CommandError, top_repo.deinit_submodule, 'sub1')

    # using force should work:
    top_repo.deinit_submodule('subm 1', force=True)

    ok_(not top_repo.repo.submodule('subm 1').module_exists())

def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)

def test_install_dataset_from_just_source(src_repo=None, path=None):
    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True)
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    src_ds.save('test-annex.dat', to_git=False)
    # equivalent repo on github:
    src_url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(src_url)

    for url in sources:
        with chpwd(path, mkdir=True):
            ds = install(source=url)

        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

        # cleanup before next iteration
        rmtree(path)

def _parse_git_submodules(ds_pathobj, repo, paths):
    """All known ones with some properties"""
    if not (ds_pathobj / ".gitmodules").exists():
        # easy way out. if there is no .gitmodules file
        # we cannot have (functional) subdatasets
        return

    if paths:
        paths_outside, paths_at_or_in = partition(
            paths,
            lambda p: ds_pathobj == p or ds_pathobj in p.parents)
        paths = [p.relative_to(ds_pathobj) for p in paths_at_or_in]
        if not paths:
            if any(p for p in paths_outside if p in ds_pathobj.parents):
                # The dataset is directly under some specified path, so include
                # it.
                paths = None
            else:
                # we had path constraints, but none matched this dataset
                return
    for props in repo.get_submodules_(paths=paths):
        path = props["path"]
        if props.get('type', None) != 'dataset':
            continue
        if ds_pathobj != repo.pathobj:
            props['path'] = ds_pathobj / path.relative_to(repo.pathobj)
        else:
            props['path'] = path
        if not path.exists() or not GitRepo.is_valid_repo(str(path)):
            props['state'] = 'absent'
        # TODO kill this after some time. We used to do custom things here
        # and gitshasum was called revision. Be nice and duplicate for a bit
        # wipe out when patience is gone
        props['revision'] = props['gitshasum']
        yield props

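# For context, a minimal usage sketch: the records assembled by the internal
# helper above can also be obtained through the public `subdatasets` command.
# The dataset path below is hypothetical; the keys 'path' and 'state'
# (e.g. 'absent' for an uninstalled subdataset) are assumed to mirror the
# properties set above.
import datalad.api as dl

ds = dl.Dataset('/tmp/myds')
for rec in ds.subdatasets():
    print(rec['path'], rec.get('state'))
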
def _install_necessary_subdatasets(
        ds, path, reckless, refds_path, description=None):
    """Installs subdatasets of `ds`, that are necessary to obtain in order
    to have access to `path`.

    Gets the subdataset containing `path` regardless of whether or not it was
    already installed. While doing so, installs everything necessary in between
    the uppermost installed one and `path`.

    Note: `ds` itself has to be installed.

    Parameters
    ----------
    ds: Dataset
    path: str
    reckless: bool
    """
    # figuring out what dataset to start with, --contains limits --recursive
    # to visit only subdatasets on the trajectory to the target path
    subds_trail = ds.subdatasets(contains=path, recursive=True)
    if not subds_trail:
        # there is not a single known subdataset (installed or not)
        # for this path -- job done
        return
    # otherwise we start with the one deepest down
    cur_subds = subds_trail[-1]

    while not GitRepo.is_valid_repo(cur_subds['path']):
        # install using a helper that gives some flexibility regarding where
        # to get the module from
        try:
            sd = _install_subds_from_flexible_source(
                Dataset(cur_subds['parentds']),
                relpath(cur_subds['path'], start=cur_subds['parentds']),
                cur_subds['gitmodule_url'],
                reckless,
                description=description)
        except Exception as e:
            # skip all of downstairs, if we didn't manage to install subdataset
            yield get_status_dict(
                'install', path=cur_subds['path'], type='dataset',
                status='error', logger=lgr, refds=refds_path,
                message=("Installation of subdatasets %s failed with exception: %s",
                         cur_subds['path'], exc_str(e)))
            return

        # report installation, whether it helped or not
        yield get_status_dict(
            'install', ds=sd, status='ok', logger=lgr, refds=refds_path,
            message=("Installed subdataset in order to get %s", path))

        # now check whether the just installed subds brought us any closer to
        # the target path
        subds_trail = sd.subdatasets(contains=path, recursive=False)
        if not subds_trail:
            # no (newly available) subdataset gets us any closer
            return
        # next round
        cur_subds = subds_trail[-1]

def download_dataset(repo=default_testing_repo, remote_path=None,
                     local_folder=None):
    """
    Download a dataset with the datalad client.

    By default it downloads the "NeuralEnsemble/ephy_testing_data" dataset
    on the gin platform, which is used for neo testing.

    Usage:

        download_dataset(
            repo='https://gin.g-node.org/NeuralEnsemble/ephy_testing_data',
            remote_path='blackrock/blackrock_2_1',
            local_folder='/home/myname/Documents/')

    Parameters
    ----------
    repo: str
        The url of the repo.
        If None then 'https://gin.g-node.org/NeuralEnsemble/ephy_testing_data'
        is used
    remote_path: str or Path
        The distant path to retrieve (file or folder)
    local_folder: str or Path or None
        The local folder where to download the data.
        If None, a default project testing folder is used.
        Default: None

    Returns
    -------
    local_path:
        The local path of the downloaded file or folder
    """
    assert HAVE_DATALAD, 'You need to install datalad'
    if local_folder is None:
        global local_testing_data_folder
        local_folder = local_testing_data_folder
    local_folder = Path(local_folder)

    if local_folder.exists() and GitRepo.is_valid_repo(local_folder):
        dataset = datalad.api.Dataset(path=local_folder)
        # make sure git repo is in clean state
        repo = dataset.repo
        repo.call_git(['checkout', '--force', 'master'])
        dataset.update(merge=True)
    else:
        dataset = datalad.api.install(path=local_folder, source=repo)

    if remote_path is None:
        print('Bad boy: you have to provide "remote_path"')
        return

    dataset.get(remote_path)

    local_path = local_folder / remote_path
    return local_path

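# A condensed sketch of the install-or-update pattern used above. The helper
# name `ensure_dataset` and the example URL/folder are hypothetical; the sketch
# only relies on `GitRepo.is_valid_repo`, `datalad.api.install`,
# `datalad.api.Dataset` and `Dataset.update`.
from pathlib import Path

import datalad.api
from datalad.support.gitrepo import GitRepo


def ensure_dataset(local_folder, source_url):
    local_folder = Path(local_folder)
    if local_folder.exists() and GitRepo.is_valid_repo(str(local_folder)):
        # already cloned before: just pull in remote changes
        dataset = datalad.api.Dataset(path=str(local_folder))
        dataset.update(merge=True)
    else:
        # first use: clone the repository
        dataset = datalad.api.install(path=str(local_folder), source=source_url)
    return dataset


# usage (hypothetical URL and target folder):
# ds = ensure_dataset('/tmp/ephy_testing_data',
#                     'https://gin.g-node.org/NeuralEnsemble/ephy_testing_data')
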
def test_install_dataset_from_just_source(url, path):
    with chpwd(path, mkdir=True):
        ds = install(source=url)
        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        ok_clean_git(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` in-place.
    """
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    if basepath in targetpaths:
        # found a targetpath, commit the trace
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            spec[p] = list(set(spec.get(p, []) + [current_trace[i + 1]]))
    if not isdir(basepath):
        # nothing underneath this one -> done
        return
    # this edge is not done, we need to try to reach any downstream
    # dataset
    for p in listdir(basepath):
        if valid_repo and p == '.git':
            # ignore gitdir to speed things up
            continue
        p = opj(basepath, p)
        if all(t != p and not t.startswith(_with_sep(p)) for t in targetpaths):
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            continue
        # we need to call this even for non-directories, to be able to match
        # file target paths
        discover_dataset_trace_to_targets(p, targetpaths, current_trace, spec)

def test_clone_dataset_from_just_source(url, path):
    with chpwd(path, mkdir=True):
        ds = clone(url, result_xfm='datasets', return_type='item-or-list')
        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        ok_clean_git(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

def test_install_dataset_from_instance(src, dst):
    origin = Dataset(src)
    clone = install(source=origin, path=dst)

    assert_is_instance(clone, Dataset)
    ok_startswith(clone.path, dst)
    ok_(clone.is_installed())
    ok_(GitRepo.is_valid_repo(clone.path))
    ok_clean_git(clone.path, annex=None)
    assert_in('INFO.txt', clone.repo.get_indexed_files())

def __call__(
        path=None,
        dataset=None,
        fulfilled=None,
        recursive=False,
        recursion_limit=None,
        contains=None,
        bottomup=False,
        set_property=None,
        delete_property=None):
    # no constraints given -> query subdatasets under curdir
    if not path and dataset is None:
        path = os.curdir
    paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
        if path else None

    ds = require_dataset(
        dataset, check_installed=False,
        purpose='subdataset reporting/modification')
    lgr.debug('Query subdatasets of %s', dataset)
    if paths is not None:
        lgr.debug('Query subdatasets underneath paths: %s', paths)
    refds_path = ds.path

    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return

    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return

    if set_property:
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = [rev_resolve_path(c, dataset) for c in assure_list(contains)]
    for r in _get_submodules(
            ds, paths, fulfilled, recursive, recursion_limit, contains,
            bottomup, set_property, delete_property, refds_path):
        # a boat-load of ancient code consumes this and is ignorant of
        # Path objects
        r['path'] = text_type(r['path'])
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r

def test_install_dataset_from_just_source_via_path(url, path):
    # for remote urls only, the source could be given to `path`
    # to allow for simplistic cmdline calls
    # Q (ben): remote urls only? Sure? => TODO
    with chpwd(path, mkdir=True):
        ds = install(url)
        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        ok_clean_git(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

def test_install_dataset_from_instance(src=None, dst=None):
    origin = Dataset(src).create(result_renderer='disabled', force=True)
    origin.save(['INFO.txt', 'test.dat'], to_git=True)
    origin.save('test-annex.dat', to_git=False)

    clone = install(source=origin, path=dst)

    assert_is_instance(clone, Dataset)
    ok_startswith(clone.path, dst)
    ok_(clone.is_installed())
    ok_(GitRepo.is_valid_repo(clone.path))
    assert_repo_status(clone.path, annex=None)
    assert_in('INFO.txt', clone.repo.get_indexed_files())

def test_install_dataset_from_just_source_via_path(path=None):
    # for remote urls only, the source could be given to `path`
    # to allow for simplistic cmdline calls
    url = "https://github.com/datalad/testrepo--basic--r1.git"

    with chpwd(path, mkdir=True):
        ds = install(url)
        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

def func(arg, top, names):
    refpath, ignore, dirs = arg
    legit_names = []
    for n in names:
        path = opj(top, n)
        if not isdir(path) or path in ignore:
            pass
        elif path != refpath and GitRepo.is_valid_repo(path):
            # mount point, keep but don't dive into
            dirs.append(path)
        else:
            legit_names.append(n)
            dirs.append(path)
    names[:] = legit_names

def _adj2subtrees(base, adj, subs):
    # given a set of parent-child mapping, compute a mapping of each parent
    # to all its (grand)children of any depth level
    subtrees = dict(adj)
    subs = set(subs)
    # from bottom up
    for ds in sorted(adj, reverse=True):
        subtree = []
        for sub in subtrees[ds]:
            subtree.append(sub)
            subtree.extend(subtrees.get(sub, []))
        subtrees[ds] = subtree
    # give each leaf dataset an entry too
    for sub in subs:
        if sub not in subtrees and GitRepo.is_valid_repo(sub):
            subtrees[sub] = []
    return subtrees

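# A minimal, self-contained sketch of the bottom-up expansion performed above,
# with the `GitRepo.is_valid_repo` leaf check left out so it runs without any
# repositories on disk; the paths are hypothetical.
def _adj2subtrees_sketch(adj, subs):
    subtrees = dict(adj)
    for ds in sorted(adj, reverse=True):
        subtree = []
        for sub in subtrees[ds]:
            subtree.append(sub)
            subtree.extend(subtrees.get(sub, []))
        subtrees[ds] = subtree
    for sub in set(subs):
        # the real helper additionally requires GitRepo.is_valid_repo(sub)
        subtrees.setdefault(sub, [])
    return subtrees


# prints {'/ds': ['/ds/a', '/ds/a/b'], '/ds/a': ['/ds/a/b'], '/ds/a/b': []}
print(_adj2subtrees_sketch(
    {'/ds': ['/ds/a'], '/ds/a': ['/ds/a/b']},
    ['/ds/a', '/ds/a/b']))
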
def download_dataset(repo=None, remote_path=None, local_folder=None,
                     update_if_exists=False, unlock=False):
    assert HAVE_DATALAD, 'You need to install datalad'

    if repo is None:
        # print('Use gin NeuralEnsemble/ephy_testing_data')
        repo = 'https://gin.g-node.org/NeuralEnsemble/ephy_testing_data'

    if local_folder is None:
        base_local_folder = get_global_dataset_folder()
        base_local_folder.mkdir(exist_ok=True)
        # if not is_set_global_dataset_folder():
        #     print(f'Local folder is {base_local_folder}, Use set_global_dataset_folder() to set it globally')
        local_folder = base_local_folder / repo.split('/')[-1]

    if local_folder.exists() and GitRepo.is_valid_repo(local_folder):
        dataset = datalad.api.Dataset(path=local_folder)
        # make sure git repo is in clean state
        repo = dataset.repo
        if update_if_exists:
            repo.call_git(['checkout', '--force', 'master'])
            dataset.update(merge=True)
    else:
        dataset = datalad.api.install(path=local_folder, source=repo)

    if remote_path is None:
        print('Bad boy: you have to provide "remote_path"')
        return

    local_path = local_folder / remote_path

    dataset.get(remote_path)

    # unlocking is necessary for binding volume to containers
    if unlock:
        dataset.unlock(remote_path, recursive=True)

    return local_path

def test_clone_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = clone(src, path, description='mydummy',
               result_xfm='datasets', return_type='item-or-list')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        assert_repo_status(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        assert_repo_status(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = clone(src, path, result_xfm=None, return_type='list')
    assert_result_values_equal(res, 'source_url', [src])
    assert_status('notneeded', res)
    assert_message("dataset %s was already cloned from '%s'", res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)

def _discover_subdatasets_recursively(top, trace, spec, recursion_limit):
    # this beast walks the directory tree from a given `top` directory
    # and discovers valid repos that are scattered around, regardless
    # of whether they are already subdatasets or not
    # for all found datasets it puts an entry into the SPEC and also
    # an entry with the path in the SPEC of the parent dataset
    if recursion_limit is not None and len(trace) > recursion_limit:
        return
    if not isdir(top):
        return
    if GitRepo.is_valid_repo(top):
        # found a repo, add the entire thing
        spec[top] = spec.get(top, []) + [top]
        # and to the parent
        if trace:
            spec[trace[-1]] = spec.get(trace[-1], []) + [top]
        trace = trace + [top]
    for path in listdir(top):
        path = opj(top, path)
        if not isdir(path):
            continue
        _discover_subdatasets_recursively(path, trace, spec, recursion_limit)

def _discover_trace_to_known(path, trace, spec):
    # this beast walks the directory tree from a given `path` until
    # it discovers a known dataset (i.e. recorded in the spec)
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(path)
    if valid_repo:
        trace = trace + [path]
        if path in spec:
            # found a known repo, commit the trace
            for i, p in enumerate(trace[:-1]):
                spec[p] = list(set(spec.get(p, []) + [trace[i + 1]]))
            # this edge is not done, we need to try to reach any downstream
            # dataset
    for p in listdir(path):
        if valid_repo and p == '.git':
            # ignore gitdir to speed things up
            continue
        p = opj(path, p)
        if not isdir(p):
            continue
        _discover_trace_to_known(p, trace, spec)

def __call__(
        dataset=None,
        fulfilled=None,
        recursive=False,
        recursion_limit=None,
        contains=None,
        bottomup=False,
        set_property=None,
        delete_property=None):
    dataset = require_dataset(
        dataset, check_installed=False,
        purpose='subdataset reporting/modification')
    refds_path = dataset.path

    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return

    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return

    if set_property:
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, "
                    "must start with a letter)", k)

    if contains:
        contains = resolve_path(contains, dataset)
    for r in _get_submodules(
            dataset.path, fulfilled, recursive, recursion_limit,
            contains, bottomup, set_property, delete_property,
            refds_path):
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r

def _parse_git_submodules(dspath):
    """All known ones with some properties"""
    if not exists(opj(dspath, ".gitmodules")):
        # easy way out. if there is no .gitmodules file
        # we cannot have (functional) subdatasets
        return

    # this will not work in direct mode, need better way #1422
    cmd = ['git', 'ls-files', '--stage', '-z']

    # need to go rogue and cannot use proper helper in GitRepo
    # as they also pull in all of GitPython's magic
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(
            cmd,
            log_stderr=True,
            log_stdout=True,
            # not sure why exactly, but log_online has to be false!
            log_online=False,
            expect_stderr=False,
            shell=False,
            # we don't want it to scream on stdout
            expect_fail=True)
    except CommandError as e:
        raise InvalidGitRepositoryError(exc_str(e))

    for line in stdout.split('\0'):
        if not line or not line.startswith('160000'):
            continue
        sm = {}
        props = submodule_full_props.match(line)
        sm['revision'] = props.group(2)
        subpath = _path_(dspath, props.group(4))
        sm['path'] = subpath
        if not exists(subpath) or not GitRepo.is_valid_repo(subpath):
            sm['state'] = 'absent'
        yield sm

def is_installed(self):
    """Returns whether a dataset is installed.

    A dataset is installed when a repository for it exists on the filesystem.

    Returns
    -------
    bool
    """
    # do early check manually if path exists to not even ask git at all
    exists_now = exists(self.path)

    was_once_installed = None
    if exists_now:
        was_once_installed = self.path is not None and \
            self.repo is not None

    if not exists_now or \
            (was_once_installed and not GitRepo.is_valid_repo(self.path)):
        # repo gone now, reset
        self._repo = None
        return False
    else:
        return was_once_installed

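# A brief usage sketch for the check above (the path is hypothetical):
# `is_installed` is the Dataset-level wrapper around the on-disk
# `GitRepo.is_valid_repo` test, so callers can guard operations with it.
from datalad.api import Dataset

ds = Dataset('/tmp/some_dataset')
if not ds.is_installed():
    # nothing on disk (or no valid repo) at this path yet
    print('dataset not installed at', ds.path)
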
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src)
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid

    # installing it again, shouldn't matter:
    with swallow_logs(new_level=logging.INFO) as cml:
        ds = install(path, source=src)
        cml.assert_logged(msg="{0} was already installed from".format(ds),
                          regex=False, level="INFO")
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)

def __call__(
        path=None,
        dataset=None,
        to_git=False,
        save=True,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    # never recursion, need to handle manually below to be able to
    # discover untracked content
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=False)
    if unavailable_paths:
        lgr.warning("ignoring non-existent path(s): %s",
                    unavailable_paths)
    if recursive:
        # with --recursive for each input path traverse the directory
        # tree, when we find a dataset, add it to the spec, AND add it as
        # a path to the spec of the parent
        # MIH: wrap in list() to avoid exception, because dict size might
        # change, but we want to loop over all that are in at the start
        # only
        for d in list(content_by_ds.keys()):
            for p in content_by_ds[d]:
                _discover_subdatasets_recursively(
                    p, [d], content_by_ds, recursion_limit)
    if not content_by_ds:
        raise InsufficientArgumentsError(
            "no existing content given to add")

    if dataset:
        # remember the datasets associated with actual inputs
        input_ds = list(content_by_ds.keys())
        # forge chain from base dataset to any leaf dataset
        _discover_trace_to_known(dataset.path, [], content_by_ds)
        if ds2super:
            # now check all dataset entries corresponding to the original
            # input to see if they contain their own paths and remove them
            for inpds in input_ds:
                content_by_ds[inpds] = [p for p in content_by_ds[inpds]
                                        if not p == inpds]
            # and lastly remove all entries that contain no path to avoid
            # saving any staged content in the final step
            content_by_ds = {d: v for d, v in content_by_ds.items() if v}

    results = []
    # simple loop over datasets -- save happens later
    # start deep down
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        toadd = list(set(content_by_ds[ds_path]))
        # handle anything that looks like a wannabe subdataset
        for subds_path in [d for d in toadd
                           if GitRepo.is_valid_repo(d) and
                           d != ds_path and
                           d not in ds.get_subdatasets(
                               recursive=False,
                               absolute=True,
                               fulfilled=True)]:
            # TODO add check that the subds has a commit, and refuse
            # to operate on it otherwise, or we would get a bastard
            # submodule that cripples git operations
            _install_subds_inplace(
                ds=ds,
                path=subds_path,
                relativepath=relpath(subds_path, ds_path))
            # make sure that .gitmodules is added to the list of files
            toadd.append(opj(ds.path, '.gitmodules'))
            # report added subdatasets -- add below won't do it
            results.append({
                'success': True,
                'file': Dataset(subds_path)})
        # make sure any last minute additions make it to the saving stage
        content_by_ds[ds_path] = toadd
        added = ds.repo.add(
            toadd,
            git=to_git if isinstance(ds.repo, AnnexRepo) else True,
            commit=False)
        for a in added:
            a['file'] = opj(ds_path, a['file'])
        results.extend(added)

    if results and save:
        save_dataset_hierarchy(
            content_by_ds,
            base=dataset.path if dataset and dataset.is_installed() else None,
            message='[DATALAD] added content')

    return results

def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        check=True,
        if_dirty='save-before'):

    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `drop`: requires at least a path or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    to_drop = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='drop',
            # justification for status:
            # content need not be dropped where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            ap['process_content'] = True
        if ap.get('registered_subds', False) and ap.get('state', None) == 'absent':
            # nothing to drop in an absent subdataset, don't be annoying
            # and skip silently
            continue
        to_drop.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_drop,
            refds_path=refds_path)
    assert(not completed)

    # iterate over all datasets, order doesn't matter
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # ignore submodule entries
        content = [ap['path'] for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
        if not content:
            continue
        for r in _drop_files(ds, content, check=check, **res_kwargs):
            yield r

def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        save=True,
        message=None,
        if_dirty='save-before'):
    res_kwargs = dict(action='remove', logger=lgr)
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `remove`: requires at least a path or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs['refds'] = refds_path
    if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
        # nothing here, nothing to remove
        yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs)
        return
    if refds_path and not path:
        # act on the whole dataset if nothing else was specified
        # TODO i think that would happen automatically in annotation?
        path = refds_path

    to_process = []

    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            # we only ever want to discover immediate subdatasets, the rest
            # will happen in `uninstall`
            recursion_limit=1,
            action='remove',
            unavailable_path_status='',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('state', None) == 'absent' and \
                ap.get('parentds', None) is None:
            # nothing exists at location, and there is no parent to
            # remove from
            ap['status'] = 'notneeded'
            ap['message'] = "path does not exist and is not in a dataset"
            yield ap
            continue
        if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
            # make sure dataset sorting yields a dedicated entry for this one
            ap['process_content'] = True
        to_process.append(ap)

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if path_is_under([ap['path'] for ap in to_process]):
        # behave like `rm` and refuse to remove where we are
        raise ValueError(
            "refusing to uninstall current or parent directory")

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path)
    assert(not completed)

    # iterate over all datasets, starting at the bottom
    # to make the removal of dataset content known upstairs
    to_save = []
    # track which submodules we have removed in the process, to avoid
    # failure in case we revisit them due to a subsequent path argument
    subm_removed = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        to_reporemove = dict()
        # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
        # if dataset itself is in paths, skip any nondataset
        # sort reverse so we get subdatasets first
        for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
            if ap.get('type', None) == 'dataset':
                # entire dataset needs to go, uninstall if present, pass recursive!
                uninstall_failed = False
                if ap['path'] == refds_path or \
                        (refds_path is None and ap.get('raw_input', False)):
                    # top-level handling, cannot use regular uninstall call, as
                    # it will refuse to uninstall a top-level dataset
                    # and rightfully so, it is really a remove in that case
                    # bypass all the safety by using low-level helper
                    for r in _uninstall_dataset(ds, check=check, has_super=False,
                                                **res_kwargs):
                        if r['status'] in ('impossible', 'error'):
                            # we need to inspect if something went wrong, in order
                            # to prevent failure from removing a non-empty dir below,
                            # but at the same time allow for continued processing
                            uninstall_failed = True
                        r['refds'] = refds_path
                        yield r
                # recheck that it wasn't removed during a previous iteration
                elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']):
                    # anything that is not the top-level -> regular uninstall
                    # this is for subdatasets of the to-be-removed dataset
                    # we want to simply uninstall them in a regular manner
                    for r in Uninstall.__call__(
                            # use annotate path as input, but pass a copy because
                            # we cannot rely on it being unaltered by reannotation
                            # TODO maybe adjust annotate_path to do that
                            [ap.copy()],
                            dataset=refds_path,
                            recursive=recursive,
                            check=check,
                            if_dirty=if_dirty,
                            result_xfm=None,
                            result_filter=None,
                            on_failure='ignore'):
                        if r['status'] in ('impossible', 'error'):
                            # we need to inspect if something went wrong, in order
                            # to prevent failure from removing a non-empty dir below,
                            # but at the same time allow for continued processing
                            uninstall_failed = True
                        yield r
                if not ap.get('raw_input', False):
                    # we only ever want to actually unregister subdatasets that
                    # were given explicitly
                    continue
                if not uninstall_failed and \
                        not ap['path'] in subm_removed and \
                        refds_path and \
                        ap.get('parentds', None) and \
                        not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                             ap['path'] == refds_path) and \
                        ap.get('registered_subds', False):
                    # strip from superdataset, but only if a dataset was given
                    # explicitly as in "remove from this dataset", but not when
                    # just a path was given as in "remove from the filesystem"
                    subds_relpath = relpath(ap['path'], start=ap['parentds'])
                    # remove submodule reference
                    parentds = Dataset(ap['parentds'])
                    # play safe, will fail on dirty
                    parentds.repo.deinit_submodule(ap['path'])
                    # remove now empty submodule link
                    parentds.repo.remove(ap['path'])
                    # make a record that we removed this already, should it be
                    # revisited via another path argument, because do not reannotate
                    # the paths after every removal
                    subm_removed.append(ap['path'])
                    yield dict(ap, status='ok', **res_kwargs)
                    # need .gitmodules update in parent
                    to_save.append(dict(
                        path=opj(parentds.path, '.gitmodules'),
                        parents=parentds.path,
                        type='file'))
                    # and the removal itself needs to be committed
                    # inform `save` that it is OK that this path
                    # doesn't exist on the filesystem anymore
                    ap['unavailable_path_status'] = ''
                    ap['process_content'] = False
                    to_save.append(ap)
                if not uninstall_failed and exists(ap['path']):
                    # could be an empty dir in case an already uninstalled subdataset
                    # got removed
                    rmdir(ap['path'])
            else:
                # anything that is not a dataset can simply be passed on
                to_reporemove[ap['path']] = ap
        # avoid unnecessary git calls when there is nothing to do
        if to_reporemove:
            if check and hasattr(ds.repo, 'drop'):
                for r in _drop_files(ds, list(to_reporemove), check=True):
                    if r['status'] == 'error':
                        # if drop errored on that path, we can't remove it
                        to_reporemove.pop(r['path'], 'avoidKeyError')
                    yield r
            if to_reporemove:
                for r in ds.repo.remove(list(to_reporemove), r=True):
                    # these were removed, but we still need to save the
                    # removal
                    r_abs = opj(ds.path, r)
                    if r_abs in to_reporemove:
                        ap = to_reporemove[r_abs]
                    else:
                        ap = {'path': r_abs,
                              'parentds': ds.path,
                              'refds': refds_path}
                    ap['unavailable_path_status'] = ''
                    to_save.append(ap)
                    yield get_status_dict(
                        status='ok',
                        path=r,
                        **res_kwargs)

    if not to_save:
        # nothing left to do, potentially all errored before
        return
    if not save:
        lgr.debug('Not calling `save` as instructed')
        return
    for res in Save.__call__(
            # TODO compose hand-selected annotated paths
            path=to_save,
            # we might have removed the reference dataset by now, recheck
            dataset=refds_path
            if (refds_path and GitRepo.is_valid_repo(refds_path))
            else None,
            message=message if message else '[DATALAD] removed content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res

def __call__(
        path=None,
        dataset=None,
        get_aggregates=False,
        reporton='all',
        recursive=False):
    # prep results
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr)
    if refds_path:
        res_kwargs['refds'] = refds_path

    if get_aggregates:
        # yield all datasets for which we have aggregated metadata as results,
        # then get actual dataset results, so we can turn them into dataset
        # instances using generic top-level code if desired
        ds = require_dataset(
            refds_path,
            check_installed=True,
            purpose='aggregate metadata query')
        agginfos = load_ds_aggregate_db(
            ds,
            version=str(aggregate_layout_version),
            abspath=True
        )
        if not agginfos:
            # if there has ever been an aggregation run, this file would
            # exist, hence there has not been and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                action='metadata',
                logger=lgr,
                message='metadata aggregation has never been performed in this dataset')
            return
        parentds = []
        for dspath in sorted(agginfos):
            info = agginfos[dspath]
            if parentds and not path_is_subpath(dspath, parentds[-1]):
                parentds.pop()
            info.update(
                path=dspath,
                type='dataset',
                status='ok',
            )
            if dspath == ds.path:
                info['layout_version'] = aggregate_layout_version
            if parentds:
                info['parentds'] = parentds[-1]
            yield dict(
                info,
                **res_kwargs
            )
            parentds.append(dspath)
        return

    if not dataset and not path:
        # makes no sense to have no dataset, go with "here"
        # error generation happens during annotation
        path = op.curdir

    content_by_ds = OrderedDict()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            # MIH: we are querying the aggregated metadata anyways, and that
            # mechanism has its own, faster way to go down the hierarchy
            #recursive=recursive,
            #recursion_limit=recursion_limit,
            action='metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            # we need to know when to look into aggregated data
            force_subds_discovery=True,
            force_parentds_discovery=True,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
            ap['process_content'] = True
        to_query = None
        if ap.get('state', None) == 'absent' or \
                ap.get('type', 'dataset') != 'dataset':
            # this is a lonely absent dataset/file or content in a present dataset
            # -> query through parent
            # there must be a parent, otherwise this would be a non-dataset path
            # and would have errored during annotation
            to_query = ap['parentds']
        else:
            to_query = ap['path']
        if to_query:
            pcontent = content_by_ds.get(to_query, [])
            pcontent.append(ap)
            content_by_ds[to_query] = pcontent

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        query_agg = [ap for ap in content_by_ds[ds_path]
                     # this is an available subdataset, will be processed in another
                     # iteration
                     if ap.get('state', None) == 'absent' or
                     not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
        if not query_agg:
            continue
        # report from aggregated metadata
        for r in query_aggregated_metadata(
                reporton,
                # by default query the reference dataset, only if there is none
                # try our luck in the dataset that contains the queried path
                # this is consistent with e.g. `get_aggregates` reporting the
                # situation in the reference dataset only
                Dataset(refds_path) if refds_path else ds,
                query_agg,
                # recursion above could only recurse into datasets
                # on the filesystem, but there might be any number of
                # uninstalled datasets underneath the last installed one
                # for which we might have metadata
                recursive=recursive,
                **res_kwargs):
            yield r
    return

def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds else includeds.intersection(
                    downward_targets))

    undiscovered_ds = [t for t in undiscovered_ds
                       if includeds and
                       path_is_subpath(t, current_trace[-1]) and
                       t in includeds]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1], set()).union(
                undiscovered_ds)

def __call__( path=None, source=None, dataset=None, recursive=False, recursion_limit=None, get_data=True, description=None, reckless=False, #git_opts=None, #annex_opts=None, #annex_get_opts=None, jobs='auto', verbose=False, ): # IMPLEMENTATION CONCEPT: # # 1. Sort the world into existing handles and the rest # 2. Try locate missing handles (obtain subdatasets along the way) # 3. Expand into subdatasets with recursion enables (potentially # obtain even more subdatasets # 4. Shoot info of which handles to get in each subdataset to, # git-annex, once at the very end refds_path = Interface.get_refds_path(dataset) if not (dataset or path): raise InsufficientArgumentsError( "Neither dataset nor target path(s) provided") if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path # remember which results we already reported, to avoid duplicates yielded_ds = [] to_get = [] unavailable_paths = [] for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='get', # NOTE: Do not act upon unavailable paths yet! Done below after # testing which ones could be obtained unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('state', None) == 'absent' and ap.get('raw_input', False): # if this wasn't found, but directly requested, queue for further # exploration unavailable_paths.append(ap) continue if ap.get('type', None) == 'dataset' and \ GitRepo.is_valid_repo(ap['path']) and \ not ap['path'] == refds_path: # do not report what hasn't arived yet # also do not report the base dataset that is already # present -- no surprise yield dict(ap, status='notneeded', logger=lgr, message='already installed') yielded_ds.append(ap['path']) ap['process_content'] = get_data to_get.append(ap) # explore the unknown for ap in sorted(unavailable_paths, key=lambda x: x['path']): lgr.debug("Investigate yet unavailable path %s", ap) # how close can we get? dspath = ap.get('parentds', get_dataset_root(ap['path'])) if dspath is None: # nothing we can do for this path continue lgr.debug("Found containing dataset %s for path %s", dspath, ap['path']) ds = Dataset(dspath) # now actually obtain whatever is necessary to get to this path containing_ds = [dspath] for res in _install_necessary_subdatasets( ds, ap['path'], reckless, refds_path, description=description): # yield immediately so errors could be acted upon outside, before # we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record, recursive below might now want to report # a 'notneeded' yielded_ds.append(res['path']) yield res # update to the current innermost dataset containing_ds.append(res['path']) if len(containing_ds) < 2: # no subdataset was installed, hence if the path was unavailable # before it still is, no need to bother git annex ap.update(status='impossible', message='path does not exist') yield ap continue # important to only do the next for the innermost subdataset # as the `recursive` logic below relies on that! 
# set the correct parent, for a dataset this would be the second-last # reported subdataset ap.update(parentds=containing_ds[-1]) if containing_ds[-1] == ap['path']: # the path actually refers to the last installed dataset ap.update(parentds=containing_ds[-2], process_content=get_data, type='dataset') to_get.append(ap) # results of recursive installation of yet undiscovered datasets rec_get = [] if recursive and not recursion_limit == 'existing': # obtain any subdatasets underneath the paths given inside the # subdatasets that we know already exist # unless we do not want recursion into not-yet-installed datasets for ap in sorted(to_get, key=lambda x: x['path']): if ap['type'] not in ('dataset', 'directory') or not ap.get('raw_input', False): # a non-directory cannot have content underneath # also we do NOT want to recurse into anything that was specifically # requested, to avoid duplication continue subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds']) lgr.info( "Installing %s%s recursively", subds, (" underneath %s" % ap['path'] if subds.path != ap['path'] else "")) for res in _recursive_install_subds_underneath( subds, # `ap['path']` was explicitly given as input # we count recursions from the input, hence we # can start with the full number recursion_limit, reckless, start=ap['path'], refds_path=refds_path, description=description): # yield immediately so errors could be acted upon # outside, before we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record yielded_ds.append(res['path']) yield res if not (res['status'] == 'ok' and res['type'] == 'dataset'): # not a dataset that was just installed, we just reported it # upstairs, and can ignore it from now on continue # paranoia, so popular these days... assert GitRepo.is_valid_repo(res['path']) # keep a copy of the install record for `get` later on get_ap = {k: v for k, v in res.items() if not k == 'status'} get_ap['process_content'] = get_data rec_get.append(get_ap) if not get_data: # done already return # merge the two AP lists to_get.extend(rec_get) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_get, refds_path=refds_path) assert(not completed) # hand over to git-annex, get files content, # report files in git as 'notneeded' to get for ds_path in sorted(content_by_ds.keys()): ds = Dataset(ds_path) # grab content, ignore subdataset entries content = [ap['path'] for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds.path] if not content: # cut this short should there be nothing continue # needs to be an annex to get content if not isinstance(ds.repo, AnnexRepo): for r in results_from_paths( content, status='notneeded', message="no dataset annex, content already present", action='get', logger=lgr, refds=refds_path): yield r continue respath_by_status = {} for res in ds.repo.get( content, options=['--from=%s' % source] if source else [], jobs=jobs): res = annexjson2result(res, ds, type='file', logger=lgr, refds=refds_path) success = success_status_map[res['status']] # TODO: in case of some failed commands (e.g. get) there might # be no path in the record. 
yoh has only vague idea of logic # here so just checks for having 'path', but according to # results_from_annex_noinfo, then it would be assumed that # `content` was acquired successfully, which is not the case if 'path' in res: respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] yield res for r in results_from_annex_noinfo( ds, content, respath_by_status, dir_fail_msg='could not get some content in %s %s', noinfo_dir_msg='nothing to get from %s', noinfo_file_msg='already present', action='get', logger=lgr, refds=refds_path): yield r
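# Standalone sketch of the `respath_by_status` bookkeeping used above (the
# status-to-success mapping below is illustrative, not DataLad's
# success_status_map): each annex result is filed under True/False so that
# paths git-annex never reported on can be summarized afterwards by
# results_from_annex_noinfo-style helpers.
_success_sketch = {'ok': True, 'notneeded': True, 'error': False}

def bucket_by_success(results):
    respath_by_status = {}
    for res in results:
        success = _success_sketch[res['status']]
        # some failed commands may lack a 'path' in the record (see comment above)
        if 'path' in res:
            respath_by_status.setdefault(success, []).append(res['path'])
    return respath_by_status

# bucket_by_success([{'path': 'a', 'status': 'ok'},
#                    {'path': 'b', 'status': 'error'},
#                    {'status': 'error'}])
# -> {True: ['a'], False: ['b']}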
def __call__( path=None, source=None, dataset=None, recursive=False, recursion_limit=None, get_data=True, description=None, reckless=False, #git_opts=None, #annex_opts=None, #annex_get_opts=None, jobs='auto', verbose=False, ): # IMPLEMENTATION CONCEPT: # # 1. Sort the world into existing handles and the rest # 2. Try locate missing handles (obtain subdatasets along the way) # 3. Expand into subdatasets with recursion enables (potentially # obtain even more subdatasets # 4. Shoot info of which handles to get in each subdataset to, # git-annex, once at the very end refds_path = Interface.get_refds_path(dataset) if not (dataset or path): raise InsufficientArgumentsError( "Neither dataset nor target path(s) provided") if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path # remember which results we already reported, to avoid duplicates yielded_ds = [] to_get = [] unavailable_paths = [] for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='get', # NOTE: Do not act upon unavailable paths yet! Done below after # testing which ones could be obtained unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('state', None) == 'absent' and ap.get( 'raw_input', False): # if this wasn't found, but directly requested, queue for further # exploration unavailable_paths.append(ap) continue if ap.get('type', None) == 'dataset' and \ GitRepo.is_valid_repo(ap['path']) and \ not ap['path'] == refds_path: # do not report what hasn't arived yet # also do not report the base dataset that is already # present -- no surprise yield dict(ap, status='notneeded', logger=lgr, message='already installed') yielded_ds.append(ap['path']) ap['process_content'] = get_data to_get.append(ap) # explore the unknown for ap in sorted(unavailable_paths, key=lambda x: x['path']): lgr.debug("Investigate yet unavailable path %s", ap) # how close can we get? dspath = ap.get('parentds', get_dataset_root(ap['path'])) if dspath is None: # nothing we can do for this path continue lgr.debug("Found containing dataset %s for path %s", dspath, ap['path']) ds = Dataset(dspath) # now actually obtain whatever is necessary to get to this path containing_ds = [dspath] for res in _install_necessary_subdatasets(ds, ap['path'], reckless, refds_path, description=description): # yield immediately so errors could be acted upon outside, before # we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record, recursive below might now want to report # a 'notneeded' yielded_ds.append(res['path']) yield res # update to the current innermost dataset containing_ds.append(res['path']) if len(containing_ds) < 2: # no subdataset was installed, hence if the path was unavailable # before it still is, no need to bother git annex ap.update(status='impossible', message='path does not exist') yield ap continue # important to only do the next for the innermost subdataset # as the `recursive` logic below relies on that! 
# set the correct parent, for a dataset this would be the second-last # reported subdataset ap.update(parentds=containing_ds[-1]) if containing_ds[-1] == ap['path']: # the path actually refers to the last installed dataset ap.update(parentds=containing_ds[-2], process_content=get_data, type='dataset') to_get.append(ap) # results of recursive installation of yet undiscovered datasets rec_get = [] if recursive and not recursion_limit == 'existing': # obtain any subdatasets underneath the paths given inside the # subdatasets that we know already exist # unless we do not want recursion into not-yet-installed datasets for ap in sorted(to_get, key=lambda x: x['path']): if ap['type'] not in ('dataset', 'directory') or not ap.get( 'raw_input', False): # a non-directory cannot have content underneath # also we do NOT want to recurse into anything that was specifically # requested, to avoid duplication continue subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds']) lgr.info("Installing %s%s recursively", subds, (" underneath %s" % ap['path'] if subds.path != ap['path'] else "")) for res in _recursive_install_subds_underneath( subds, # `ap['path']` was explicitly given as input # we count recursions from the input, hence we # can start with the full number recursion_limit, reckless, start=ap['path'], refds_path=refds_path, description=description): # yield immediately so errors could be acted upon # outside, before we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record yielded_ds.append(res['path']) yield res if not (res['status'] == 'ok' and res['type'] == 'dataset'): # not a dataset that was just installed, we just reported it # upstairs, and can ignore it from now on continue # paranoia, so popular these days... assert GitRepo.is_valid_repo(res['path']) # keep a copy of the install record for `get` later on get_ap = { k: v for k, v in res.items() if not k == 'status' } get_ap['process_content'] = get_data rec_get.append(get_ap) if not get_data: # done already return # merge the two AP lists to_get.extend(rec_get) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_get, refds_path=refds_path) assert (not completed) # hand over to git-annex, get files content, # report files in git as 'notneeded' to get for ds_path in sorted(content_by_ds.keys()): ds = Dataset(ds_path) # grab content, ignore subdataset entries content = [ ap['path'] for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds.path ] if not content: # cut this short should there be nothing continue # needs to be an annex to get content if not isinstance(ds.repo, AnnexRepo): for r in results_from_paths( content, status='notneeded', message="no dataset annex, content already present", action='get', logger=lgr, refds=refds_path): yield r continue respath_by_status = {} for res in ds.repo.get(content, options=['--from=%s' % source] if source else [], jobs=jobs): res = annexjson2result(res, ds, type='file', logger=lgr, refds=refds_path) success = success_status_map[res['status']] # TODO: in case of some failed commands (e.g. get) there might # be no path in the record. 
yoh has only vague idea of logic # here so just checks for having 'path', but according to # results_from_annex_noinfo, then it would be assumed that # `content` was acquired successfully, which is not the case if 'path' in res: respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] yield res for r in results_from_annex_noinfo( ds, content, respath_by_status, dir_fail_msg='could not get some content in %s %s', noinfo_dir_msg='nothing to get from %s', noinfo_file_msg='already present', action='get', logger=lgr, refds=refds_path): yield r
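# Sketch of the content_by_ds grouping idea (a stand-in for
# annotated2content_by_ds, with made-up annotated-path dicts): every annotated
# path is filed under the dataset that should act on it, preserving the order
# in which datasets are first encountered.
from collections import OrderedDict

def group_by_dataset(annotated_paths):
    content_by_ds = OrderedDict()
    for ap in annotated_paths:
        # a dataset acts on itself; anything else goes to its parent dataset
        ds_key = ap['path'] if ap.get('type') == 'dataset' \
            else ap.get('parentds', ap['path'])
        content_by_ds.setdefault(ds_key, []).append(ap)
    return content_by_ds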
def __call__(path=None, dataset=None, get_aggregates=False, reporton='all', recursive=False): # prep results refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='metadata', logger=lgr) if refds_path: res_kwargs['refds'] = refds_path if get_aggregates: # yield all datasets for which we have aggregated metadata as results # the get actual dataset results, so we can turn them into dataset # instances using generic top-level code if desired ds = require_dataset(refds_path, check_installed=True, purpose='aggregate metadata query') info_fpath = opj(ds.path, agginfo_relpath) if not exists(info_fpath): # if there has ever been an aggregation run, this file would # exist, hence there has not been and we need to tell this # to people yield get_status_dict( ds=ds, status='impossible', action='metadata', logger=lgr, message= 'metadata aggregation has never been performed in this dataset' ) return agginfos = _load_json_object(info_fpath) parentds = [] for sd in sorted(agginfos): info = agginfos[sd] dspath = normpath(opj(ds.path, sd)) if parentds and not path_is_subpath(dspath, parentds[-1]): parentds.pop() info.update( path=dspath, type='dataset', status='ok', ) if sd == curdir: info['layout_version'] = aggregate_layout_version if parentds: info['parentds'] = parentds[-1] yield dict(info, **res_kwargs) parentds.append(dspath) return if not dataset and not path: # makes no sense to have no dataset, go with "here" # error generation happens during annotation path = curdir content_by_ds = OrderedDict() for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, # MIH: we are querying the aggregated metadata anyways, and that # mechanism has its own, faster way to go down the hierarchy #recursive=recursive, #recursion_limit=recursion_limit, action='metadata', # uninstalled subdatasets could be queried via aggregated metadata # -> no 'error' unavailable_path_status='', nondataset_path_status='error', # we need to know when to look into aggregated data force_subds_discovery=True, force_parentds_discovery=True, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo( ap['path']): ap['process_content'] = True to_query = None if ap.get('state', None) == 'absent' or \ ap.get('type', 'dataset') != 'dataset': # this is a lonely absent dataset/file or content in a present dataset # -> query through parent # there must be a parent, otherwise this would be a non-dataset path # and would have errored during annotation to_query = ap['parentds'] else: to_query = ap['path'] if to_query: pcontent = content_by_ds.get(to_query, []) pcontent.append(ap) content_by_ds[to_query] = pcontent for ds_path in content_by_ds: ds = Dataset(ds_path) query_agg = [ ap for ap in content_by_ds[ds_path] # this is an available subdataset, will be processed in another # iteration if ap.get('state', None) == 'absent' or not (ap.get( 'type', None) == 'dataset' and ap['path'] != ds_path) ] if not query_agg: continue # report from aggregated metadata for r in query_aggregated_metadata( reporton, # by default query the reference dataset, only if there is none # try our luck in the dataset that contains the queried path # this is consistent with e.g. 
`get_aggregates` reporting the # situation in the reference dataset only Dataset(refds_path) if refds_path else ds, query_agg, # recursion above could only recurse into datasets # on the filesystem, but there might be any number of # uninstalled datasets underneath the last installed one # for which we might have metadata recursive=recursive, **res_kwargs): yield r return
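# Sketch of the "query through the parent" routing used in both metadata
# variants above (annotated-path dicts below are hypothetical): absent
# datasets and non-dataset paths are queried via their parent dataset,
# present datasets are queried directly.
def pick_query_target(ap):
    if ap.get('state') == 'absent' or ap.get('type', 'dataset') != 'dataset':
        return ap['parentds']
    return ap['path']

# pick_query_target({'path': '/ds/sub', 'type': 'dataset',
#                    'state': 'absent', 'parentds': '/ds'})   -> '/ds'
# pick_query_target({'path': '/ds/sub', 'type': 'dataset'})   -> '/ds/sub'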
def repo(self):
    """Get an instance of the version control system/repo for this dataset,
    or None if there is none yet (or none anymore).

    If testing the validity of an instance of GitRepo is guaranteed to be
    really cheap this could also serve as a test whether a repo is present.

    Note, that this property is evaluated every time it is used. If used
    multiple times within a function it's probably a good idea to store its
    value in a local variable and use this variable instead.

    Returns
    -------
    GitRepo or AnnexRepo
    """
    # If we already got a *Repo instance, check whether it's still valid;
    # Note, that this basically does part of the testing that would
    # (implicitly) be done in the loop below again. So, there's still
    # potential to speed up when we actually need to get a new instance
    # (or none). But it's still faster for the vast majority of cases.
    #
    # TODO: Dig deeper into it and melt with new instance guessing. This
    # should also involve to reduce redundancy of testing such things from
    # within Flyweight.__call__, AnnexRepo.__init__ and GitRepo.__init__!
    #
    # Also note, that this could be forged into a single big condition, but
    # that is hard to read and we should be well aware of the actual
    # criteria here:
    if self._repo is not None and realpath(self.path) == self._repo.path:
        # we got a repo and path references still match
        if isinstance(self._repo, AnnexRepo):
            # it's supposed to be an annex
            if self._repo is AnnexRepo._unique_instances.get(
                    self._repo.path, None) and \
                    AnnexRepo.is_valid_repo(self._repo.path,
                                            allow_noninitialized=True):
                # it's still the object registered as flyweight and it's a
                # valid annex repo
                return self._repo
        elif isinstance(self._repo, GitRepo):
            # it's supposed to be a plain git
            if self._repo is GitRepo._unique_instances.get(
                    self._repo.path, None) and \
                    GitRepo.is_valid_repo(self._repo.path) and not \
                    self._repo.is_with_annex():
                # it's still the object registered as flyweight, it's a
                # valid git repo and it hasn't turned into an annex
                return self._repo

    # Note: Although it looks like the "self._repo = None" assignments
    # could be used instead of variable "valid", that's a big difference!
    # The *Repo instances are flyweights, not singletons. self._repo might
    # be the last reference, which would lead to those objects being
    # destroyed and therefore the constructor call would result in an
    # actually new instance. This is unnecessarily costly.
    valid = False
    for cls, ckw, kw in (
            # TODO: Do we really want to allow_noninitialized=True here?
            # And if so, leave a proper comment!
            (AnnexRepo, {'allow_noninitialized': True}, {'init': False}),
            (GitRepo, {}, {})
    ):
        if cls.is_valid_repo(self._path, **ckw):
            try:
                lgr.log(5, "Detected %s at %s", cls, self._path)
                self._repo = cls(self._path, create=False, **kw)
                valid = True
                break
            except (InvalidGitRepositoryError, NoSuchPathError,
                    InvalidAnnexRepositoryError) as exc:
                lgr.log(5, "Oops -- guess on repo type was wrong?: %s",
                        exc_str(exc))

    if not valid:
        self._repo = None

    if self._repo is None:
        # Often .repo is requested to 'sense' if anything is installed
        # under, and if so -- to proceed forward. Thus log here only
        # at DEBUG level and if necessary "complaint upstairs"
        lgr.log(5, "Failed to detect a valid repo at %s", self.path)

    return self._repo
def _get_submodules(ds, paths, fulfilled, recursive, recursion_limit, contains, bottomup, set_property, delete_property, refds_path): dspath = ds.path if not GitRepo.is_valid_repo(dspath): return modinfo = _parse_gitmodules(ds) # put in giant for-loop to be able to yield results before completion for sm in _parse_git_submodules(ds, paths): if contains and not any( sm['path'] == c or sm['path'] in c.parents for c in contains): # we are not looking for this subds, because it doesn't # match the target path continue # do we just need this to recurse into subdatasets, or is this a # real results? to_report = paths is None \ or any(p == sm['path'] or p in sm['path'].parents for p in paths) sm.update(modinfo.get(sm['path'], {})) if to_report and (set_property or delete_property): # first deletions for dprop in assure_list(delete_property): try: out, err = ds.repo._git_custom_command( '', ['git', 'config', '--file', '.gitmodules', '--unset-all', 'submodule.{}.{}'.format(sm['gitmodule_name'], dprop), ] ) except CommandError: yield get_status_dict( 'subdataset', status='impossible', message=( "Deleting subdataset property '%s' failed for " "subdataset '%s', possibly did " "not exist", dprop, sm['gitmodule_name']), logger=lgr, **sm) # also kick from the info we just read above sm.pop('gitmodule_{}'.format(dprop), None) # and now setting values for sprop in assure_list(set_property): prop, val = sprop if val.startswith('<') and val.endswith('>') and '{' in val: # expand template string val = val[1:-1].format( **dict( sm, refds_relpath=sm['path'].relative_to(refds_path), refds_relname=text_type( sm['path'].relative_to(refds_path) ).replace(os.sep, '-'))) try: out, err = ds.repo._git_custom_command( '', ['git', 'config', '--file', '.gitmodules', '--replace-all', 'submodule.{}.{}'.format(sm['gitmodule_name'], prop), text_type(val), ] ) except CommandError as e: # pragma: no cover # this conditional may not be possible to reach, as # variable name validity is checked before and Git # replaces the file completely, resolving any permission # issues, if the file could be read (already done above) yield get_status_dict( 'subdataset', status='error', message=( "Failed to set property '%s': %s", prop, exc_str(e)), type='dataset', logger=lgr, **sm) # it is up to parent code to decide whether we would continue # after this # also add to the info we just read above sm['gitmodule_{}'.format(prop)] = val Dataset(dspath).save( '.gitmodules', to_git=True, message='[DATALAD] modified subdataset properties') #common = commonprefix((with_pathsep(subds), with_pathsep(path))) #if common.endswith(sep) and common == with_pathsep(subds): # candidates.append(common) subdsres = get_status_dict( 'subdataset', status='ok', type='dataset', logger=lgr) subdsres.update(sm) subdsres['parentds'] = dspath if to_report and (not bottomup and \ (fulfilled is None or GitRepo.is_valid_repo(sm['path']) == fulfilled)): yield subdsres # expand list with child submodules. keep all paths relative to parent # and convert jointly at the end if recursive and \ (recursion_limit in (None, 'existing') or (isinstance(recursion_limit, int) and recursion_limit > 1)): for r in _get_submodules( Dataset(sm['path']), paths, fulfilled, recursive, (recursion_limit - 1) if isinstance(recursion_limit, int) else recursion_limit, contains, bottomup, set_property, delete_property, refds_path): yield r if to_report and (bottomup and \ (fulfilled is None or GitRepo.is_valid_repo(sm['path']) == fulfilled)): yield subdsres
def _get_submodules(dspath, fulfilled, recursive, recursion_limit, contains, bottomup, set_property, delete_property, refds_path): if not GitRepo.is_valid_repo(dspath): return modinfo = _parse_gitmodules(dspath) # write access parser parser = None # TODO bring back in more global scope from below once segfaults are # figured out #if set_property or delete_property: # gitmodule_path = opj(dspath, ".gitmodules") # parser = GitConfigParser( # gitmodule_path, read_only=False, merge_includes=False) # parser.read() # put in giant for-loop to be able to yield results before completion for sm in _parse_git_submodules(dspath): if contains and not path_startswith(contains, sm['path']): # we are not looking for this subds, because it doesn't # match the target path continue sm.update(modinfo.get(sm['path'], {})) if set_property or delete_property: gitmodule_path = opj(dspath, ".gitmodules") parser = GitConfigParser( gitmodule_path, read_only=False, merge_includes=False) parser.read() # do modifications now before we read the info out for reporting # use 'submodule "NAME"' section ID style as this seems to be the default submodule_section = 'submodule "{}"'.format(sm['gitmodule_name']) # first deletions for dprop in assure_list(delete_property): parser.remove_option(submodule_section, dprop) # also kick from the info we just read above sm.pop('gitmodule_{}'.format(dprop), None) # and now setting values for sprop in assure_list(set_property): prop, val = sprop if val.startswith('<') and val.endswith('>') and '{' in val: # expand template string val = val[1:-1].format( **dict( sm, refds_relpath=relpath(sm['path'], refds_path), refds_relname=relpath(sm['path'], refds_path).replace(os.sep, '-'))) parser.set_value( submodule_section, prop, val) # also add to the info we just read above sm['gitmodule_{}'.format(prop)] = val Dataset(dspath).add( '.gitmodules', to_git=True, message='[DATALAD] modified subdataset properties') # let go of resources, locks, ... parser.release() #common = commonprefix((with_pathsep(subds), with_pathsep(path))) #if common.endswith(sep) and common == with_pathsep(subds): # candidates.append(common) subdsres = get_status_dict( 'subdataset', status='ok', type='dataset', logger=lgr) subdsres.update(sm) subdsres['parentds'] = dspath if not bottomup and \ (fulfilled is None or GitRepo.is_valid_repo(sm['path']) == fulfilled): yield subdsres # expand list with child submodules. keep all paths relative to parent # and convert jointly at the end if recursive and \ (recursion_limit in (None, 'existing') or (isinstance(recursion_limit, int) and recursion_limit > 1)): for r in _get_submodules( sm['path'], fulfilled, recursive, (recursion_limit - 1) if isinstance(recursion_limit, int) else recursion_limit, contains, bottomup, set_property, delete_property, refds_path): yield r if bottomup and \ (fulfilled is None or GitRepo.is_valid_repo(sm['path']) == fulfilled): yield subdsres if parser is not None: # release parser lock manually, auto-cleanup is not reliable in PY3 parser.release()
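# Sketch of the '<...>' template expansion that set_property performs in both
# _get_submodules variants above: a value like '<data-{gitmodule_name}>' is
# stripped of the angle brackets and expanded via str.format() against the
# submodule record (the record keys below are illustrative).
def expand_property_value(val, sm_record):
    if val.startswith('<') and val.endswith('>') and '{' in val:
        return val[1:-1].format(**sm_record)
    return val

# expand_property_value('<data-{gitmodule_name}>', {'gitmodule_name': 'sub1'})
# -> 'data-sub1'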
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, message_file=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path") refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], 
discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert(not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict( status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) subds_relpath = relpath(ap['path'], ds_path) # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=None, name=None) except (CommandError, InvalidGitRepositoryError) as e: yield get_status_dict( ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) # report added subdatasets -- `annex add` below won't do it yield get_status_dict( ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append(dict( path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute 
additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add_( list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw ) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({k: v for k, v in res.items() if k not in ('status', 'state')}) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/commited if so desired to_save.append({k: v for k, v in r.items() if k not in ('status', 'state')}) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append({k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])): # TODO XXX we have an issue here when with `add('.')` and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. 
there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this is costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
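# Why the per-dataset loop above iterates sorted(content_by_ds, reverse=True):
# with plain lexicographic sorting a subdataset path always sorts after its
# superdataset, so the reversed order processes the deepest datasets first and
# newly registered submodules can then be saved upwards in their parents.
ds_paths = ['/super', '/super/sub', '/super/sub/subsub']
assert sorted(ds_paths, reverse=True) == \
    ['/super/sub/subsub', '/super/sub', '/super']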
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path" ) refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) # TODO check if next isn't covered by discover_dataset_trace_to_targets already?? 
if dataset and ap.get('type', None) == 'dataset': # duplicates not possible, annotated_paths returns unique paths subds_to_add[ap['path']] = ap if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert (not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict(status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) # check that the subds has a commit, and refuse # to operate on it otherwise, or we would get a bastard # submodule that cripples git operations if not subds.repo.get_hexsha(): yield get_status_dict( ds=subds, status='impossible', message='cannot add subdataset with no commits', **dict(common_report, **ap)) continue subds_relpath = relpath(ap['path'], ds_path) # make an attempt to configure a submodule source URL based on the # discovered remote configuration remote, branch = subds.repo.get_tracking_branch() subds_url = 
subds.repo.get_remote_url( remote) if remote else None # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=subds_url, name=None) except CommandError as e: yield get_status_dict(ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) _fixup_submodule_dotgit_setup(ds, subds_relpath) # report added subdatasets -- `annex add` below won't do it yield get_status_dict(ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append( dict(path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add(list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({ k: v for k, v in res.items() if k not in ('status', 'state') }) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... 
res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/commited if so desired to_save.append({ k: v for k, v in r.items() if k not in ('status', 'state') }) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append( {k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len( respath_by_status.get('success', [])): # TODO XXX we have an issue here when with `add('.')` and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this is costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
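# The result-annotation merge used above (res = dict(torepoadd[...], **res)):
# the previously annotated path dict provides defaults, while keys from the
# fresh annex result win. A tiny self-contained illustration:
annotation = {'path': 'file.dat', 'type': 'file', 'parentds': '/ds'}
annex_result = {'path': 'file.dat', 'status': 'ok'}
merged = dict(annotation, **annex_result)
# -> {'path': 'file.dat', 'type': 'file', 'parentds': '/ds', 'status': 'ok'}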
def repo(self): """Get an instance of the version control system/repo for this dataset, or None if there is none yet (or none anymore). If testing the validity of an instance of GitRepo is guaranteed to be really cheap this could also serve as a test whether a repo is present. Note, that this property is evaluated every time it is used. If used multiple times within a function it's probably a good idea to store its value in a local variable and use this variable instead. Returns ------- GitRepo or AnnexRepo """ # If we already got a *Repo instance, check whether it's still valid; # Note, that this basically does part of the testing that would # (implicitly) be done in the loop below again. So, there's still # potential to speed up when we actually need to get a new instance # (or none). But it's still faster for the vast majority of cases. # # TODO: Dig deeper into it and melt with new instance guessing. This # should also involve to reduce redundancy of testing such things from # within Flyweight.__call__, AnnexRepo.__init__ and GitRepo.__init__! # # Also note, that this could be forged into a single big condition, but # that is hard to read and we should be well aware of the actual # criteria here: if self._repo is not None and realpath(self.path) == self._repo.path: # we got a repo and path references still match if isinstance(self._repo, AnnexRepo): # it's supposed to be an annex if self._repo is AnnexRepo._unique_instances.get( self._repo.path, None) and \ AnnexRepo.is_valid_repo(self._repo.path, allow_noninitialized=True): # it's still the object registered as flyweight and it's a # valid annex repo return self._repo elif isinstance(self._repo, GitRepo): # it's supposed to be a plain git if self._repo is GitRepo._unique_instances.get( self._repo.path, None) and \ GitRepo.is_valid_repo(self._repo.path) and not \ self._repo.is_with_annex(): # it's still the object registered as flyweight, it's a # valid git repo and it hasn't turned into an annex return self._repo # Note: Although it looks like the "self._repo = None" assignments # could be used instead of variable "valid", that's a big difference! # The *Repo instances are flyweights, not singletons. self._repo might # be the last reference, which would lead to those objects being # destroyed and therefore the constructor call would result in an # actually new instance. This is unnecessarily costly. valid = False for cls, ckw, kw in ( # TODO: Do we really want to allow_noninitialized=True here? # And if so, leave a proper comment! (AnnexRepo, {'allow_noninitialized': True}, {'init': False}), (GitRepo, {}, {}) ): if cls.is_valid_repo(self._path, **ckw): try: lgr.log(5, "Detected %s at %s", cls, self._path) self._repo = cls(self._path, create=False, **kw) valid = True break except (InvalidGitRepositoryError, NoSuchPathError, InvalidAnnexRepositoryError) as exc: lgr.log(5, "Oops -- guess on repo type was wrong?: %s", exc_str(exc)) if not valid: self._repo = None if self._repo is None: # Often .repo is requested to 'sense' if anything is installed # under, and if so -- to proceed forward. Thus log here only # at DEBUG level and if necessary "complaint upstairs" lgr.log(5, "Failed to detect a valid repo at %s", self.path) elif due.active: # Makes sense only on installed dataset - @never_fail'ed duecredit_dataset(self) return self._repo
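# Toy illustration (not DataLad's Flyweight machinery) of the reuse test in
# the .repo property above: a cached object is reused only if it is still the
# very instance registered for its path and the path still validates as a
# repository; otherwise a fresh instance is created and registered.
class ToyRepo:
    def __init__(self, path):
        self.path = path

_registry = {}

def get_repo(path, cached=None, is_valid=lambda p: True):
    if cached is not None and cached is _registry.get(cached.path) \
            and is_valid(cached.path):
        # identity (not equality) check: reuse avoids rebuilding the instance
        return cached
    repo = ToyRepo(path)
    _registry[path] = repo
    return repo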
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able too yield as fast a possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)") # prep common result props res_kwargs = dict( action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive( refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = assure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = resolve_path(p, ds=refds_path) if path_startswith(p, refds_path): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[r] = res yield res # preserve non-existing paths to be silently killed by modification # detection and append them to requested_paths again after detection. # TODO: This might be melted in with treatment of non dataset paths # above. Re-appending those paths seems to be better than yielding # directly to avoid code duplication, since both cases later on are # dealt with again. 
        preserved_paths = []
        if requested_paths:
            preserved_paths = [
                r for r in requested_paths
                if not lexists(r['path'] if isinstance(r, dict) else r)]
        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # do not loop over unique(), this could be a list of dicts
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue
        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled to a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if not islink(path) and GitRepo.is_valid_repo(path)
                    else 'directory')
            # this could contain all types of additional content
            containing_dir = path if not islink(path) \
                else normpath(opj(path, pardir))
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything else
                # it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or (
                        refds_path and _with_sep(oneupdir).startswith(
                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset,
                    # and only if we stay within this refds when searching for
                    # the parent
                    parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will determine
                    # whether a top-level dataset that was discovered gets the
                    # parent property or not, it won't get it without a common
                    # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset
                        # of the parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether there is
        # ANY dataset, not which one is the true parent, logic below relies on
        # the fact that we end here, if there is no dataset at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(
                **dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not path_startswith(dspath, refds_path):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, is labeled a directory, or is a dataset
            # without this info -> record whether it is a known subdataset of
            # its parent
            containing_ds = Dataset(parent)
            subdss = containing_ds.subdatasets(
                fulfilled=None, recursive=False,
                result_xfm=None, result_filter=None, return_type='list')
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not
                    # here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a
            # status already
            res['status'] = path_props.get(
                'status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself, if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(parent) if containing_ds is None \
                    else containing_ds
            for r in yield_recursive(containing_ds, path, action, recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r

        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return
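
# --- Illustrative consumer sketch (not part of the upstream source) ---------
# Each record yielded by `__call__` above is a plain result dict carrying at
# least 'path' and 'status', plus optional keys such as 'type', 'state',
# 'parentds' and 'registered_subds' set along the way. The grouping below is
# only an assumption about how a caller might post-process such records;
# `group_annotated_paths` is a hypothetical helper, not a datalad API.
def group_annotated_paths(results):
    groups = {'datasets': [], 'absent': [], 'other': []}
    for res in results:
        if res.get('status') == 'error':
            # e.g. the default `nondataset_path_status` for paths outside
            # the reference dataset
            continue
        if res.get('state') == 'absent':
            # registered but not installed (sub)datasets, or missing paths
            groups['absent'].append(res['path'])
        elif res.get('type') == 'dataset':
            groups['datasets'].append(res['path'])
        else:
            # files, directories, and anything without type information
            groups['other'].append(res['path'])
    return groups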