def test_get_containing_subdataset(path):
    ds = create(path, force=True)
    ds.add(path='test.txt')
    ds.save("Initial commit")
    subds = ds.create("sub")
    subsubds = subds.create("subsub")
    eq_(ds.get_containing_subdataset(opj("sub", "subsub", "some")).path,
        subsubds.path)
    # the top of a subdataset belongs to the subdataset
    eq_(ds.get_containing_subdataset(opj("sub", "subsub")).path,
        subsubds.path)
    eq_(get_dataset_root(opj(ds.path, "sub", "subsub")),
        subsubds.path)
    eq_(ds.get_containing_subdataset(opj("sub", "some")).path, subds.path)
    eq_(ds.get_containing_subdataset("sub").path, subds.path)
    eq_(ds.get_containing_subdataset("some").path, ds.path)
    # make sure the subds is found, even when it is not present, but still
    # known
    shutil.rmtree(subds.path)
    eq_(ds.get_containing_subdataset(opj("sub", "some")).path, subds.path)
    eq_(ds.get_containing_subdataset("sub").path, subds.path)
    # but now GitRepo disagrees...
    eq_(get_dataset_root(opj(ds.path, "sub")), ds.path)
    # and this stays, even if we give the mount point directory back
    os.makedirs(subds.path)
    eq_(get_dataset_root(opj(ds.path, "sub")), ds.path)

    outside_path = opj(os.pardir, "somewhere", "else")
    assert_raises(PathOutsideRepositoryError,
                  ds.get_containing_subdataset, outside_path)
    assert_raises(PathOutsideRepositoryError,
                  ds.get_containing_subdataset, opj(os.curdir, outside_path))
    assert_raises(PathOutsideRepositoryError,
                  ds.get_containing_subdataset, abspath(outside_path))
def get_superdataset(self, datalad_only=False, topmost=False,
                     registered_only=True):
    """Get the dataset's superdataset

    Parameters
    ----------
    datalad_only : bool, optional
      Whether to consider only "datalad datasets" (with non-None id), or
      (if False, which is the default) any git repository.
    topmost : bool, optional
      Return the topmost super-dataset. Might then be the current one.
    registered_only : bool, optional
      Test whether any discovered superdataset actually contains the
      dataset in question as a registered subdataset (as opposed to just
      being located in a subdirectory without a formal relationship).

    Returns
    -------
    Dataset or None
    """
    # TODO: return only if self is a subdataset of the superdataset
    # (meaning: registered as a submodule)?
    path = self.path
    sds_path = path if topmost else None
    while path:
        # normalize the path after adding .. so we are guaranteed to not
        # follow into the original directory if path itself is a symlink
        par_path = normpath(opj(path, pardir))
        sds_path_ = get_dataset_root(par_path)
        if sds_path_ is None:
            # no more parents, use the previous find
            break
        sds = Dataset(sds_path_)
        if datalad_only:
            # test whether the current git is actually a dataset
            if not sds.id:
                break
        if registered_only:
            if path not in sds.subdatasets(
                    recursive=False,
                    contains=path,
                    result_xfm='paths'):
                break
        # that was a good candidate
        sds_path = sds_path_
        path = par_path
        if not topmost:
            # no looping
            break
    if sds_path is None:
        # nothing was found
        return None
    # No postprocessing should be necessary now, since get_toppath
    # tries its best to not resolve symlinks
    return Dataset(sds_path)
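# A minimal usage sketch for get_superdataset(), assuming a purely
# hypothetical nested layout /tmp/super/sub in which both levels are
# installed DataLad datasets (paths are illustrative, not from the source):
from datalad.distribution.dataset import Dataset

sub = Dataset('/tmp/super/sub')
sup = sub.get_superdataset()              # immediate superdataset
top = sub.get_superdataset(topmost=True)  # top of the dataset hierarchy
# consider only proper DataLad datasets (non-None id) while climbing
top_dl = sub.get_superdataset(datalad_only=True, topmost=True)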
def get_command_pwds(dataset):
    """Return the directory for the command.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    if dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified
        # Follow our generic semantics: if a dataset is specified,
        # paths are relative to it; if not, they are relative to pwd
        pwd = getpwd()
        # Pass pwd to get_dataset_root instead of os.path.curdir to handle
        # repos whose leading paths have a symlinked directory (see the
        # TMPDIR="/var/tmp/sym link" test case).
        dataset = get_dataset_root(pwd)
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            # leave it to the checks below to decide whether to
            # deal with it or to crash
            rel_pwd = pwd
    return pwd, rel_pwd
def get_command_pwds(dataset):
    """Return the current directory for the dataset.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    # Follow the path resolution logic described in gh-3435.
    if isinstance(dataset, Dataset):
        # Paths relative to dataset.
        pwd = dataset.path
        rel_pwd = op.curdir
    else:
        # Paths relative to current directory.
        pwd = getpwd()
        # Pass pwd to get_dataset_root instead of os.path.curdir to handle
        # repos whose leading paths have a symlinked directory (see the
        # TMPDIR="/var/tmp/sym link" test case).
        if not dataset:
            dataset = get_dataset_root(pwd)
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave handling to the caller
    return pwd, rel_pwd
def get_command_pwds(dataset):
    """Return the directory for the command.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    if dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified
        dataset = get_dataset_root(curdir)
        # Follow our generic semantics: if a dataset is specified,
        # paths are relative to it; if not, they are relative to pwd
        pwd = getpwd()
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            # leave it to the checks below to decide whether to
            # deal with it or to crash
            rel_pwd = pwd
    return pwd, rel_pwd
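# Sketch of the contract shared by the get_command_pwds() variants above,
# with an illustrative dataset location /tmp/ds (not from the source):
from datalad.distribution.dataset import Dataset

pwd, rel_pwd = get_command_pwds(Dataset('/tmp/ds'))
# -> pwd == '/tmp/ds', rel_pwd == '.'; paths are taken relative to the dataset

pwd, rel_pwd = get_command_pwds(None)
# -> pwd is the process working directory; rel_pwd is that directory
#    relative to the containing dataset root, or pwd itself if there is none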
def get_repo_pipeline_config_path(repo_path=curdir):
    """Given a path within a repo, return the path to the crawl.cfg"""
    if not exists(opj(repo_path, HANDLE_META_DIR)):
        # we need to figure out the top path for the repo
        repo_path = get_dataset_root(repo_path)
        if not repo_path:
            return None
    return opj(repo_path, CRAWLER_META_CONFIG_PATH)
def _flyweight_preproc_path(cls, path):
    """Custom handling for a few special abbreviations for datasets"""
    path_ = path
    if path == '^':
        # get the topmost dataset from the current location. Note that 'zsh'
        # might have its own ideas on what to do with ^, so better use as -d^
        path_ = Dataset(get_dataset_root(curdir)).get_superdataset(
            topmost=True).path
    elif path == '^.':
        # get the dataset containing the current directory
        path_ = get_dataset_root(curdir)
    elif path == '///':
        # TODO: logic/UI on installing a default dataset could move here
        # from search?
        path_ = cfg.obtain('datalad.locations.default-dataset')
    if path != path_:
        lgr.debug("Resolved dataset alias %r to path %r", path, path_)
    return path_
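# Illustration of the aliases handled above, under the assumption that this
# flyweight preprocessing is applied on Dataset instantiation (the resolved
# paths depend on the current working directory):
Dataset('^')    # topmost superdataset enclosing the cwd
Dataset('^.')   # dataset containing the cwd
Dataset('///')  # the configured datalad.locations.default-dataset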
def get_superdataset(self, datalad_only=False, topmost=False):
    """Get the dataset's superdataset

    Parameters
    ----------
    datalad_only : bool, optional
      Whether to consider only "datalad datasets" (with non-None id), or
      (if False, which is the default) any git repository.
    topmost : bool, optional
      Return the topmost super-dataset. Might then be the current one.

    Returns
    -------
    Dataset or None
    """
    # TODO: return only if self is a subdataset of the superdataset
    # (meaning: registered as a submodule)?
    path = self.path
    sds_path = path if topmost else None
    while path:
        # normalize the path after adding .. so we are guaranteed to not
        # follow into the original directory if path itself is a symlink
        par_path = normpath(opj(path, pardir))
        sds_path_ = get_dataset_root(par_path)
        if sds_path_ is None:
            # no more parents, use the previous find
            break
        if datalad_only:
            # test whether the current git is actually a dataset
            sds = Dataset(sds_path_)
            # can't use sds.id ATM since we just autogenerate an ID, see
            # https://github.com/datalad/datalad/issues/986
            # if not sds.id:
            if not sds.config.get('datalad.dataset.id', None):
                break
        # that was a good candidate
        sds_path = sds_path_
        path = par_path
        if not topmost:
            # no looping
            break
    if sds_path is None:
        # nothing was found
        return None
    # No postprocessing should be necessary now, since get_toppath
    # tries its best to not resolve symlinks
    return Dataset(sds_path)
def path_under_rev_dataset(ds, path):
    ds_path = ds.pathobj
    try:
        rpath = str(ut.Path(path).relative_to(ds_path))
        if not rpath.startswith(op.pardir):
            # path is already underneath the dataset
            return path
    except Exception:
        # whatever went wrong, we gotta play safe
        pass

    root = get_dataset_root(str(path))
    while root is not None and not ds_path.samefile(root):
        # path and therefore root could be relative paths,
        # hence in the next round we cannot use dirname()
        # to jump to the next directory up, but we have
        # to use ./.. and get_dataset_root() will handle
        # the rest just fine
        root = get_dataset_root(op.join(root, op.pardir))
    if root is None:
        return None
    return ds_path / op.relpath(str(path), root)
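# Sketch of the contract of path_under_rev_dataset(), with illustrative
# paths (not from the source): paths underneath the dataset are returned
# (possibly re-anchored at the dataset root), anything else yields None.
ds = Dataset('/tmp/ds')
path_under_rev_dataset(ds, '/tmp/ds/file.txt')    # -> '/tmp/ds/file.txt'
path_under_rev_dataset(ds, '/tmp/elsewhere.txt')  # -> None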
def get_repo_pipeline_script_path(repo_path=curdir):
    """If there is a single pipeline present among 'pipelines/', return the path to it"""
    # TODO: somewhat ad-hoc -- may be improved with some dedicated name
    # being tracked or suchlike
    if not exists(opj(repo_path, HANDLE_META_DIR)):
        # we need to figure out the top path for the repo
        repo_path = get_dataset_root(repo_path)
        if not repo_path:
            return None
    pipelines = glob(opj(repo_path, CRAWLER_META_DIR, 'pipelines', '*.py'))
    if len(pipelines) > 1 or not pipelines:
        return None
    return pipelines[0]
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument, or
    based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      If a dataset could be determined.

    Raises
    ------
    NoDatasetFound
      If no dataset could be determined.
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetFound(
                "No dataset found at '{}'{}. Specify a dataset to work with "
                "by providing its path via the `dataset` option, "
                "or change the current working directory to be in a "
                "dataset.".format(
                    getpwd(),
                    " for the purpose {!r}".format(purpose) if purpose else ''))
        dataset = Dataset(dspath)
    assert(dataset is not None)
    lgr.debug(u"Resolved dataset%s: %s",
              u' to {}'.format(purpose) if purpose else '',
              dataset.path)

    if check_installed and not dataset.is_installed():
        raise NoDatasetFound(f"No installed dataset found at {dataset.path}")

    return dataset
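# Typical call pattern for require_dataset(), matching its use elsewhere in
# this collection; `dataset` may be None, a path, or a Dataset instance:
ds = require_dataset(None, check_installed=True, purpose='demonstration')
# raises NoDatasetFound unless the working directory is inside a dataset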
def candidates(name):
    if not name.endswith('.py'):
        name += '.py'
    # first -- current directory
    repo_path = get_dataset_root(curdir)
    if repo_path:
        yield opj(repo_path, CRAWLER_META_DIR, 'pipelines', name)
    # TODO: look under other .datalad locations as well
    # last -- the pipelines shipped within the datalad module itself
    yield opj(dirname(__file__), 'pipelines', name)
def _get_repo_record(fpath, cache):
    fdir = fpath.parent
    # get the repository, if there is any.
    # `repo` will be None, if there is none, and can serve as a flag
    # for further processing
    repo_rec = cache.get(fdir, None)
    if repo_rec is None:
        repo_root = get_dataset_root(fdir)
        repo_rec = dict(
            repo=None if repo_root is None else Dataset(repo_root).repo,
            # this is different from repo.pathobj which resolves symlinks
            repo_root=Path(repo_root) if repo_root else None)
        cache[fdir] = repo_rec
    return repo_rec
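# Sketch of the directory-level caching above: two files sharing a parent
# directory resolve to the very same record (paths are illustrative):
from pathlib import Path

cache = {}
rec_a = _get_repo_record(Path('/tmp/ds/a.txt'), cache)
rec_b = _get_repo_record(Path('/tmp/ds/b.txt'), cache)  # cache hit
assert rec_a is rec_b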
def __init__(self, annex, path=None, persistent_cache=True, **kwargs):
    super().__init__(annex)
    # MIH: figure out what the following is all about,
    # in particular path==None
    self.repo = Dataset(get_dataset_root(Path.cwd())).repo \
        if not path \
        else AnnexRepo(path, create=False, init=False)
    self.path = self.repo.path
    # annex requests a load by KEY, not by the URL it originally asked
    # about. So for a key we might get back multiple URLs, and as a
    # heuristic let's use the most recently asked one
    self._last_url = None  # for the heuristic to choose among multiple URLs
    self._cache = ArchivesCache(self.path, persistent=persistent_cache)
    self._contentlocations = DictCache(size_limit=100)  # TODO: config?
def get_repo_instance(path=os.curdir, class_=None):
    """Returns an instance of appropriate datalad repository for path.

    Check whether a certain path is inside a known type of repository and
    returns an instance representing it. May also check for a certain type
    instead of detecting the type of repository.

    .. deprecated:: 0.16
       Use the pattern `Dataset(get_dataset_root(path)).repo` instead. This
       function will be removed in a future release.

    Parameters
    ----------
    path: str
      path to check; default: current working directory
    class_: class
      if given, check whether path is inside a repository, that can be
      represented as an instance of the passed class.

    Raises
    ------
    RuntimeError, in case cwd is not inside a known repository.
    """
    warnings.warn(
        "get_repo_instance() was deprecated in 0.16. "
        "It will be removed in a future release.",
        DeprecationWarning)
    from datalad.utils import get_dataset_root
    from datalad.distribution.dataset import Dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.gitrepo import GitRepo

    if class_ is not None:
        if class_ == AnnexRepo:
            type_ = "annex"
        elif class_ == GitRepo:
            type_ = "git"
        else:
            raise RuntimeError("Unknown class %s." % str(class_))
    else:
        type_ = ''

    dsroot = get_dataset_root(path)
    if not dsroot:
        raise RuntimeError(f"No {type_} repository found at {path}.")

    return Dataset(dsroot).repo
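# The replacement pattern named in the deprecation note above, spelled out
# (get_repo_instance's class check is dropped; `some_path` is an
# illustrative stand-in, not from the source):
from datalad.utils import get_dataset_root
from datalad.distribution.dataset import Dataset

some_path = '.'  # any path of interest
root = get_dataset_root(some_path)
repo = Dataset(root).repo if root else None  # GitRepo/AnnexRepo or None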
def get_dataset_directories(top, ignore_datalad=True):
    """Return a list of directories in the same dataset under a given path

    Parameters
    ----------
    top : path
      Top-level path
    ignore_datalad : bool
      Whether to exclude the '.datalad' directory of a dataset and its
      content from the results.

    Returns
    -------
    list
      List of directories matching the top-level path, regardless of whether
      these directories are known to Git (i.e. contain tracked files). The
      list does not include the top-level path itself, but it does include
      any subdataset mount points (regardless of whether the particular
      subdatasets are installed or not).
    """
    def func(arg, top, names):
        refpath, ignore, dirs = arg
        legit_names = []
        for n in names:
            path = opj(top, n)
            if not isdir(path) or path in ignore:
                pass
            elif path != refpath and GitRepo.is_valid_repo(path):
                # mount point, keep but don't dive into
                dirs.append(path)
            else:
                legit_names.append(n)
                dirs.append(path)
        names[:] = legit_names

    # collects the directories
    refpath = get_dataset_root(top)
    if not refpath:
        raise ValueError("`top` path {} is not in a dataset".format(top))
    ignore = [opj(refpath, get_git_dir(refpath))]
    if ignore_datalad:
        ignore.append(opj(refpath, '.datalad'))
    d = []
    walk(top, func, (refpath, ignore, d))
    return d
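# Illustrative call (path not from the source): list all directories of the
# containing dataset under /tmp/ds, keeping subdataset mount points but
# skipping .git and, by default, .datalad:
dirs = get_dataset_directories('/tmp/ds')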
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument, or
    based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      Or raises an exception (InsufficientArgumentsError).
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)
    assert(dataset is not None)
    lgr.debug("Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '',
        dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset
def process_vanished_paths(unavailable_paths, content_by_ds):
    # presently unavailable paths could be, e.g., deleted files, or
    # uninstalled subdatasets, or simply nothing -> figure it out and act
    # accordingly
    dsinfo = {}
    nonexistent_paths = []
    for p in unavailable_paths:
        # we need to check whether any of these correspond
        # to a known subdataset, and add those to the list of
        # things to be removed
        toppath = get_dataset_root(p)
        if not toppath:
            nonexistent_paths.append(p)
            continue
        ds = Dataset(toppath)
        dinfo = dsinfo.get(
            toppath,
            {'deleted': ds.repo.get_deleted_files(),
             'subds': ds.get_subdatasets(recursive=False, absolute=True)})
        # cache for a potentially following request
        dsinfo[toppath] = dinfo
        if p in dinfo['subds']:
            # the test for subds needs to come first, as it would also show
            # up in "deleted_files"
            # this is a known subdataset that has vanished
            lgr.debug('deinit vanished subdataset {} in {}'.format(p, ds))
            # simply deinit to complete a "forced uninstallation"; without
            # an explicit "remove" there is nothing to be saved in this
            # case
            ds.repo.deinit_submodule(p[len(_with_sep(ds.path)):])
        elif p in dinfo['deleted']:
            # vanished file -> 'git rm' it to stage the change
            ds.repo.remove(p)
            # record that we are "saving" this path
            dpaths = content_by_ds.get(ds.path, [])
            dpaths.append(p)
            content_by_ds[ds.path] = dpaths
        else:
            # this is nothing we can handle anyhow
            nonexistent_paths.append(p)
    return content_by_ds, nonexistent_paths
def test_get_dataset_root(path):
    eq_(get_dataset_root('/nonexistent'), None)
    with chpwd(path):
        repo = AnnexRepo(os.curdir, create=True)
        subdir = opj('some', 'deep')
        fname = opj(subdir, 'dummy')
        os.makedirs(subdir)
        with open(fname, 'w') as f:
            f.write('some')
        repo.add(fname)
        # we can find this repo
        eq_(get_dataset_root(os.curdir), os.curdir)
        # and we get the type of path that we fed in
        eq_(get_dataset_root(abspath(os.curdir)), abspath(os.curdir))
        # subdirs are no issue
        eq_(get_dataset_root(subdir), os.curdir)
        # even more subdirs are no issue
        eq_(get_dataset_root(opj(subdir, subdir)), os.curdir)
        # non-dir paths are no issue
        eq_(get_dataset_root(fname), os.curdir)
def __call__(
        path=None,
        dataset=None,
        # support passing this through on a path-by-path basis
        to_git=None,
        save=True,
        message=None,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    refds_path = Interface.get_refds_path(dataset)
    common_report = dict(action='add', logger=lgr, refds=refds_path)

    to_add = []
    subds_to_add = {}
    ds_to_annotate_from_recursion = {}
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=dataset,
            # never recursion, need to handle manually below to be able to
            # discover untracked content
            recursive=False,
            action='add',
            # speed things up by using Git's modification detection, if there
            # is a repo with at least one commit
            modified='HEAD' \
            if dataset and \
            GitRepo.is_valid_repo(refds_path) and \
            GitRepo(refds_path).get_hexsha() \
            else None,
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        got_nothing = False
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
            yield get_status_dict(
                status='impossible',
                message='there is no dataset to add this path to',
                **dict(common_report, **ap))
            continue
        if ap.get('type', None) == 'directory' and \
                ap.get('state', None) == 'untracked' and \
                GitRepo.is_valid_repo(ap['path']):
            # this is an untracked wannabe subdataset in disguise
            ap['type'] = 'dataset'
        if recursive and \
                (ap.get('raw_input', False) or
                 ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
            # this was an actually requested input path, or a path that was
            # found modified by path annotation, based on an input argument
            # we need to recurse into all subdirs to find potentially
            # unregistered subdatasets
            # but only if this path has a parent, or is itself a dataset
            # otherwise there is nothing to add to
            _discover_subdatasets_recursively(
                ds_to_annotate_from_recursion,
                ap['path'],
                [ap['parentds'] if 'parentds' in ap else ap['path']],
                recursion_limit)
            # get the file content of the root dataset of this search added
            # too, but be careful with extreme recursion_limit settings
            if recursion_limit is None or recursion_limit > 0:
                ap['process_content'] = True
        # record for further processing
        if not ap['path'] in ds_to_annotate_from_recursion:
            # if it was somehow already discovered
            to_add.append(ap)
        # TODO check whether the next one isn't covered by
        # discover_dataset_trace_to_targets already??
        if dataset and ap.get('type', None) == 'dataset':
            # duplicates not possible, annotated_paths returns unique paths
            subds_to_add[ap['path']] = ap
    if got_nothing:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'add',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    for subds in ds_to_annotate_from_recursion:
        if subds not in subds_to_add:
            # always prefer the already annotated path
            subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

    if dataset:
        # we have a base dataset, discover any intermediate datasets between
        # the base and any already discovered dataset
        discovered = {}
        discover_dataset_trace_to_targets(
            # from here
            dataset.path,
            # to any dataset we are aware of
            subds_to_add.keys(),
            [],
            discovered)
        for parentds in discovered:
            for subds in discovered[parentds]:
                subds_to_add[subds] = subds_to_add.get(
                    subds,
                    dict(path=subds, parentds=parentds, type='dataset'))

    # merge custom paths and discovered dataset records; paths need to go
    # first, because we know most about them, and in the subsequent
    # annotation call we skip the later duplicate ones
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert(not completed)

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue
            subds = Dataset(ap['path'])
            # check that the subds has a commit, and refuse
            # to operate on it otherwise, or we would get a bastard
            # submodule that cripples git operations
            if not subds.repo.get_hexsha():
                yield get_status_dict(
                    ds=subds, status='impossible',
                    message='cannot add subdataset with no commits',
                    **dict(common_report, **ap))
                continue
            subds_relpath = relpath(ap['path'], ds_path)
            # make an attempt to configure a submodule source URL based on
            # the discovered remote configuration
            remote, branch = subds.repo.get_tracking_branch()
            subds_url = subds.repo.get_remote_url(remote) if remote else None
            # register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=subds_url, name=None)
            except CommandError as e:
                yield get_status_dict(
                    ds=subds, status='error', message=e.stderr,
                    **dict(common_report, **ap))
                continue
            # queue for saving using the updated annotated path
            ap['registered_subds'] = True
            # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules and was not
            # staged, whereas another submodule on the same level in the same
            # superdataset which also had one file in it was staged
            # disabled to work correctly, while paying a little bit of
            # slow down
            #ap['staged'] = True
            to_save.append(ap)
            _fixup_submodule_dotgit_setup(ds, subds_relpath)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
        # make sure that .gitmodules is added to the list of files
        gitmodules_path = opj(ds.path, '.gitmodules')
        # for git
        torepoadd[gitmodules_path] = dict(path=gitmodules_path)
        # and for save
        to_save.append(dict(
            path=gitmodules_path,
            parentds=ds_path,
            type='file'))
        # make sure any last minute additions make it to the saving stage
        # XXX? should content_by_ds become an OrderedDict so that a possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw)
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out the correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path
            if success:
                # this was successfully added, queue for saving this very path
                # in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to
                # save stuff that is marked state='untracked')
                to_save.append(
                    {k: v for k, v in res.items()
                     if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual
                # content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX Actually not sure if this can really happen
                # (depends on what our low-level code would do)
                # but worst case is that we lose a little bit of
                # coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out the correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                r = dict(r, **torepoadd[r['path']])
            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't
                # need to be added, but it should be saved/committed if so
                # desired
                to_save.append(
                    {k: v for k, v in r.items()
                     if k not in ('status', 'state')})
            # XXX something is fishy with the next one, rethink when sober....
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly
                # requested, make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append(
                    {k: v for k, v in r.items() if k != 'status'})
            yield r
        if refds_path and ds_path != refds_path and \
                len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here with `add('.')` when annex
            # ignores any dotfiles. In this case we end up not saving a
            # dataset completely, because we rely on accurate reporting.
            # there is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have
            # anything done to this dataset -> queue the dataset itself for
            # saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`

    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do
    # with them -- this costs something, but should be safer, and frankly is
    # more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
def __call__(
        path=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=False,
        save=True,
        annex_version=None,
        annex_backend='MD5E',
        native_metadata_type=None,
        shared_access=None,
        git_opts=None,
        annex_opts=None,
        annex_init_opts=None,
        text_no_annex=None):

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD
    if path and dataset:
        # given a path and a dataset (path) not pointing to an installed
        # dataset
        if not dataset.is_installed():
            msg = "No installed dataset at %s found." % dataset.path
            dsroot = get_dataset_root(dataset.path)
            if dsroot:
                msg += " If you meant to add to the %s dataset, use that " \
                       "path instead but remember that if a dataset is " \
                       "provided, relative paths are relative to the top " \
                       "of the dataset." % dsroot
            raise ValueError(msg)

    # sanity check first
    if git_opts:
        lgr.warning(
            "`git_opts` argument is presently ignored, please complain!")
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")
        if annex_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex and declaring no "
                             "annex repo.")
        if annex_init_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex init and declaring no "
                             "annex repo.")

    if not isinstance(force, bool):
        raise ValueError(
            "force should be bool, got %r. Did you mean to provide a 'path'?"
            % force)

    annotated_paths = AnnotatePaths.__call__(
        # nothing given explicitly, assume create fresh right here
        path=path if path else getpwd() if dataset is None else None,
        dataset=dataset,
        recursive=False,
        action='create',
        # we need to know whether we have to check for potential
        # subdataset collision
        force_parentds_discovery=True,
        # it is absolutely OK to have something that does not exist
        unavailable_path_status='',
        unavailable_path_msg=None,
        # if we have a dataset given that actually exists, we want to
        # fail if the requested path is not in it
        nondataset_path_status='error' \
        if isinstance(dataset, Dataset) and dataset.is_installed() else '',
        on_failure='ignore')
    path = None
    for r in annotated_paths:
        if r['status']:
            # this is dealt with already
            yield r
            continue
        if path is not None:
            raise ValueError(
                "`create` can only handle a single target path or dataset")
        path = r

    if len(annotated_paths) and path is None:
        # we got something, we complained already, done
        return

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # prep for yield
    path.update({'logger': lgr, 'type': 'dataset'})
    # just discard, we have a new story to tell
    path.pop('message', None)
    if 'parentds' in path:
        subs = Subdatasets.__call__(
            dataset=path['parentds'],
            # any known
            fulfilled=None,
            recursive=False,
            contains=path['path'],
            result_xfm='relpaths')
        if len(subs):
            path.update({
                'status': 'error',
                'message': (
                    'collision with known subdataset %s/ in dataset %s',
                    subs[0], path['parentds'])})
            yield path
            return

    # TODO here we need a further test: if force=True, we need to look
    # whether there is a superdataset (regardless of whether we want to
    # create a subdataset or not), and whether that superdataset tracks
    # anything within this directory -- if so, we need to stop right here
    # and whine, because the result of creating a repo here would be an
    # undesired mess

    if git_opts is None:
        git_opts = {}
    if shared_access:
        # configure the `git --shared` value
        git_opts['shared'] = shared_access

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and \
        dataset.path == path['path'] else Dataset(path['path'])

    # don't create in a non-empty directory without `force`:
    if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        path.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield path
        return

    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            backend=annex_backend,
            version=annex_version,
            description=description,
            git_opts=git_opts,
            annex_opts=annex_opts,
            annex_init_opts=annex_init_opts)

        if text_no_annex:
            git_attributes_file = opj(tbds.path, '.gitattributes')
            with open(git_attributes_file, 'a') as f:
                f.write('* annex.largefiles=(not(mimetype=text/*))\n')
            tbrepo.add([git_attributes_file], git=True)
            tbrepo.commit(
                "Instructed annex to add text files to git",
                _datalad_msg=True,
                files=[git_attributes_file])

    if native_metadata_type is not None:
        if not isinstance(native_metadata_type, list):
            native_metadata_type = [native_metadata_type]
        for nt in native_metadata_type:
            tbds.config.add('datalad.metadata.nativetype', nt)

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a re-create
        tbds.config.unset(id_var, where='dataset')
    tbds.config.add(
        id_var,
        tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
        where='dataset')

    # make sure that v6 annex repos never commit content under .datalad
    with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
        # TODO this will need adjusting, when annex'ed aggregate meta data
        # comes around
        gitattr.write(
            '# Text files (according to file --mime-type) are added '
            'directly to git.\n')
        gitattr.write(
            '# See http://git-annex.branchable.com/tips/largefiles/ '
            'for more info.\n')
        gitattr.write('** annex.largefiles=nothing\n')

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.add('.datalad', to_git=True, save=save,
             message='[DATALAD] new dataset')

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.add(
                tbds.path,
                save=True,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    path.update({'status': 'ok'})
    yield path
def __call__(
        # it is optional, because `rerun` can get a recorded one
        cmd=None,
        dataset=None,
        message=None,
        rerun=False):
    if rerun and cmd:
        lgr.warning('Ignoring provided command in --rerun mode')
        cmd = None
    if not dataset:
        # act on the whole dataset if nothing else was specified
        dataset = get_dataset_root(curdir)
    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')
    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner
    from datalad.tests.utils import ok_clean_git

    lgr.debug('tracking command output underneath %s', ds)
    try:
        # base assumption is that the animal smells superb
        ok_clean_git(ds.path)
    except AssertionError:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message='unsaved modifications present, '
                    'cannot detect changes by command')
        return

    if not cmd and not rerun:
        # TODO here we would need to recover a cmd when a rerun is attempted
        return

    if rerun:
        # pull run info out of the last commit message
        err_info = get_status_dict('run', ds=ds)
        if not ds.repo.get_hexsha():
            yield dict(
                err_info, status='impossible',
                message='cannot re-run command, nothing recorded')
            return
        last_commit_msg = ds.repo.repo.head.commit.message
        cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \
                       r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
        runinfo = re.match(cmdrun_regex, last_commit_msg,
                           re.MULTILINE | re.DOTALL)
        if not runinfo:
            yield dict(
                err_info, status='impossible',
                message='cannot re-run command, last saved state does not '
                        'look like a recorded command run')
            return
        rec_msg, runinfo = runinfo.groups()
        if message is None:
            # re-use the commit message, if nothing new was given
            message = rec_msg
        try:
            runinfo = json.loads(runinfo)
        except Exception as e:
            yield dict(
                err_info, status='error',
                message=('cannot re-run command, command specification is '
                         'not valid JSON: %s', e.message))
            return
        if 'cmd' not in runinfo:
            yield dict(
                err_info, status='error',
                message='cannot re-run command, command specification '
                        'missing in recorded state')
            return
        cmd = runinfo['cmd']
        rec_exitcode = runinfo.get('exit', 0)
        rel_pwd = runinfo.get('pwd', None)
        if rel_pwd:
            # recording is relative to the dataset
            pwd = normpath(opj(ds.path, rel_pwd))
        else:
            rel_pwd = None  # normalize, just in case
            pwd = None

        # now we have to find out what was modified during the last run, and
        # enable re-modification. Ideally, we would bring back the entire
        # state of the tree with #1424, but we limit ourselves to file
        # addition/not-in-place-modification for now
        to_unlock = []
        for r in ds.diff(
                recursive=True,
                revision='HEAD~1...HEAD',
                return_type='generator',
                result_renderer=None):
            if r.get('type', None) == 'file' and \
                    r.get('state', None) in ('added', 'modified'):
                r.pop('status', None)
                to_unlock.append(r)
        if to_unlock:
            for r in ds.unlock(to_unlock, return_type='generator',
                               result_xfm=None):
                yield r
    else:
        # not a rerun, figure out where we are running
        pwd = ds.path
        rel_pwd = curdir

    # anticipate quoted compound shell commands
    cmd = cmd[0] if isinstance(cmd, list) and len(cmd) == 1 else cmd

    # TODO do our best to guess which files to unlock based on the command
    # string; in many cases this will be impossible (but see --rerun).
    # however, generating new data (common case) will be just fine already

    # we have a clean dataset, let's run things
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just have the exitcode in the same way
        cmd_exitcode = e.code
        if not rerun or rec_exitcode != cmd_exitcode:
            # we failed during a fresh run, or in a different way during a
            # rerun; the latter can easily happen if we try to alter a
            # locked file
            #
            # let's fail here, the command could have had a typo or some
            # other undesirable condition. If we would `add` nevertheless,
            # we would need to rerun and aggregate annex content that we
            # likely don't want
            # TODO add a switch to ignore failure (some commands are stupid)
            # TODO add the ability to `git reset --hard` the dataset tree
            # on failure; we know that we started clean, so we could easily
            # go back, needs gh-1424 to be able to do it recursively
            raise CommandError(code=cmd_exitcode)

    lgr.info("== Command exit (modification check follows) =====")

    # amend the commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd

    # compose commit message
    cmd_shorty = (' '.join(cmd) if isinstance(cmd, list) else cmd)
    cmd_shorty = '{}{}'.format(
        cmd_shorty[:40],
        '...' if len(cmd_shorty) > 40 else '')
    msg = '[DATALAD RUNCMD] {}\n\n=== Do not change lines below ===\n{}\n' \
          '^^^ Do not change lines above ^^^'.format(
              message if message is not None else cmd_shorty,
              json.dumps(run_info, indent=1, sort_keys=True,
                         ensure_ascii=False, encoding='utf-8'))

    for r in ds.add('.', recursive=True, message=msg):
        yield r
def diff_dataset(
        dataset,
        fr,
        to,
        constant_refs,
        path=None,
        annex=None,
        untracked='normal',
        recursive=False,
        recursion_limit=None,
        eval_file_type=True,
        reporting_order='depth-first'):
    """Internal helper to diff a dataset

    Parameters
    ----------
    dataset : Dataset
      Dataset to perform the diff on. `fr` and `to` parameters are
      interpreted in the context of this dataset.
    fr : str
      Commit-ish to compare from.
    to : str
      Commit-ish to compare to.
    constant_refs : bool
      If True, `fr` and `to` will be passed on unmodified to diff operations
      on subdatasets. This can be useful with symbolic references like tags
      to report subdataset changes independent of superdataset changes.
      If False, `fr` and `to` will be translated to the subdataset
      commit-ish that match the given commit-ish in the superdataset.
    path : Path-like, optional
      Paths to constrain the diff to (see main diff() command).
    annex : str, optional
      Reporting mode for annex properties (see main diff() command).
    untracked : str, optional
      Reporting mode for untracked content (see main diff() command).
    recursive : bool, optional
      Flag to enable recursive operation (see main diff() command).
    recursion_limit : int, optional
      Recursion limit (see main diff() command).
    eval_file_type : bool, optional
      Whether to perform file type discrimination between real symlinks
      and symlinks representing annex'ed files. This can be expensive
      in datasets with many files.
    reporting_order : {'depth-first', 'breadth-first'}, optional
      By default, subdataset content records are reported after the record
      on the subdataset's submodule in a superdataset (depth-first).
      Alternatively, report all superdataset records first, before reporting
      any subdataset content records (breadth-first).

    Yields
    ------
    dict
      DataLad result records.
    """
    if reporting_order not in ('depth-first', 'breadth-first'):
        raise ValueError('Unknown reporting order: {}'.format(reporting_order))

    ds = require_dataset(
        dataset, check_installed=True, purpose='difference reporting')

    # we cannot really perform any sorting of paths into subdatasets
    # or rejecting paths based on the state of the filesystem, as
    # we need to be able to compare with states that are not represented
    # in the worktree (anymore)
    if path:
        ps = []
        # sort any path argument into the respective subdatasets
        for p in sorted(assure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # distinguish rsync-link syntax to identify
            # a dataset as whole (e.g. 'ds') vs its
            # content (e.g. 'ds/')
            # special case is the root dataset, always report its content
            # changes
            orig_path = str(p)
            resolved_path = resolve_path(p, dataset)
            p = \
                resolved_path, \
                orig_path.endswith(op.sep) or resolved_path == ds.pathobj
            str_path = str(p[0])
            root = get_dataset_root(str_path)
            if root is None:
                # no root, not possibly underneath the refds
                yield dict(
                    action='status',
                    path=str_path,
                    refds=ds.path,
                    status='error',
                    message='path not underneath this dataset',
                    logger=lgr)
                continue
            if path_under_rev_dataset(ds, str_path) is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=str_path,
                    refds=ds.path,
                    action='diff',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str_path),
                    logger=lgr,
                )
                continue

            ps.append(p)
        path = ps

    # TODO we might want to move away from the single-pass+immediate-yield
    # paradigm for this command. If we gather all information first, we
    # could do post-processing and detect when a file (same gitsha, or same
    # key) was copied/moved from another dataset. Another command (e.g.
    # save) could act on this information and also move/copy
    # availability information or at least enhance the respective commit
    # message with cross-dataset provenance info

    # cache to help avoid duplicate status queries
    content_info_cache = {}
    for res in _diff_ds(
            ds,
            fr,
            to,
            constant_refs,
            recursion_limit
            if recursion_limit is not None and recursive
            else -1 if recursive else 0,
            # TODO recode paths to repo path reference
            origpaths=None if not path else OrderedDict(path),
            untracked=untracked,
            annexinfo=annex,
            eval_file_type=eval_file_type,
            cache=content_info_cache,
            order=reporting_order):
        res.update(
            refds=ds.path,
            logger=lgr,
            action='diff',
        )
        yield res
def get_paths_by_dataset(paths, recursive=False, recursion_limit=None,
                         out=None, dir_lookup=None):
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable, are
    reported.

    Parameters
    ----------
    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths
    recursion_limit :
      Depth constraint for recursion. See `Dataset.get_subdatasets()` for
      more information.
    out : dict or None
      By default a new output dictionary is created, however an existing
      one can be provided via this argument to enable incremental
      processing.
    dir_lookup : dict or None
      Optional lookup cache that maps paths to previously determined
      datasets. This can speed up repeated processing.

    Returns
    -------
    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of
      currently non-existing paths (possibly matching currently uninstalled
      datasets), and any paths that are not part of any dataset.
    """
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    for path in paths:
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            unavailable_paths.append(path)
            continue
        # the path exists in some shape or form
        if isdir(path):
            # this could contain all types of additional content
            d = path
        else:
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir
        # this could be `None` if there is no git repo
        dspath = dir_lookup.get(d, get_dataset_root(d))
        dir_lookup[d] = dspath
        if not dspath:
            nondataset_paths.append(path)
            continue
        if isdir(path):
            ds = Dataset(dspath)
            # we need to double-check that this is not a subdataset mount
            # point, in which case get_toppath() would point to the parent
            smpath = ds.get_containing_subdataset(
                path, recursion_limit=1).path
            if smpath != dspath:
                # fix entry
                dir_lookup[d] = smpath
                # the submodule still needs to be obtained
                unavailable_paths.append(path)
                continue
            if recursive:
                # make sure we get everything relevant in all _checked out_
                # subdatasets; obtaining previously unavailable subdatasets
                # is done elsewhere
                subs = ds.get_subdatasets(fulfilled=True,
                                          recursive=recursive,
                                          recursion_limit=recursion_limit)
                for sub in subs:
                    subdspath = opj(dspath, sub)
                    if subdspath.startswith(_with_sep(path)):
                        # this subdataset is underneath the search path
                        # we want it all
                        # be careful to not overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(subdspath, [subdspath])
        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths
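# Illustrative call (paths not from the source): sort a mixed path list into
# their containing datasets, with unavailable and non-dataset paths
# reported separately:
by_ds, unavailable, nondataset = get_paths_by_dataset(
    ['/tmp/ds/file.txt', '/tmp/ds/sub', '/no/such/path'])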
def get_superdataset(self, datalad_only=False, topmost=False,
                     registered_only=True):
    """Get the dataset's superdataset

    Parameters
    ----------
    datalad_only : bool, optional
      Whether to consider only "datalad datasets" (with non-None id), or
      (if False, which is the default) any git repository.
    topmost : bool, optional
      Return the topmost super-dataset. Might then be the current one.
    registered_only : bool, optional
      Test whether any discovered superdataset actually contains the
      dataset in question as a registered subdataset (as opposed to just
      being located in a subdirectory without a formal relationship).

    Returns
    -------
    Dataset or None
    """
    from datalad.coreapi import subdatasets
    # TODO: return only if self is a subdataset of the superdataset
    # (meaning: registered as a submodule)?
    path = self.path
    sds_path = path if topmost else None
    while path:
        # normalize the path after adding .. so we are guaranteed to not
        # follow into the original directory if path itself is a symlink
        par_path = normpath(opj(path, pardir))
        sds_path_ = get_dataset_root(par_path)
        if sds_path_ is None:
            # no more parents, use the previous find
            break
        sds = Dataset(sds_path_)
        if datalad_only:
            # test whether the current git is actually a dataset
            if not sds.id:
                break
        if registered_only:
            if path not in sds.subdatasets(
                    recursive=False,
                    contains=path,
                    result_xfm='paths'):
                break
        # that was a good candidate
        sds_path = sds_path_
        path = par_path
        if not topmost:
            # no looping
            break
    if sds_path is None:
        # nothing was found
        return None
    # No postprocessing should be necessary now, since get_toppath
    # tries its best to not resolve symlinks
    return Dataset(sds_path)
def __call__(
        path=None,
        initopts=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=_NoAnnexDefault,
        annex=True,
        fake_dates=False,
        cfg_proc=None
):
    # TODO: introduced with 0.13, remove with 0.14
    if no_annex is not _NoAnnexDefault:
        # the two mirror options do not agree and the deprecated one is
        # not at default value
        warnings.warn(
            "datalad-create's `no_annex` option is deprecated "
            "and will be removed in a future release, "
            "use the reversed-sign `annex` option instead.",
            DeprecationWarning)
        # honor the old option for now
        annex = not no_annex

    # we only perform negative tests below
    no_annex = not annex

    if dataset:
        if isinstance(dataset, Dataset):
            ds = dataset
        else:
            ds = Dataset(dataset)
        refds_path = ds.path
    else:
        ds = refds_path = None

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")

    if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
            isinstance(initopts, dict) and 'bare' in initopts):
        raise ValueError(
            "Creation of bare repositories is not supported. Consider "
            "one of the create-sibling commands, or use "
            "Git to init a bare repository and push an existing dataset "
            "into it.")

    if path:
        path = resolve_path(path, dataset)

    path = path if path \
        else getpwd() if ds is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # assure cfg_proc is a list (relevant if used via Python API)
    cfg_proc = assure_list(cfg_proc)

    # prep for yield
    res = dict(action='create', path=str(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != str(path):
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='creating a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    ds, str(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = get_dataset_root(
        op.normpath(op.join(str(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = ut.Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = ut.Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if (not pstatus.get(check_path, {}).get("type") == "dataset"
                and any(check_path == p or check_path in p.parents
                        for p in pstatus)):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    str(parentds_path),
                    [str(c) for c in conflict])})
            yield res
            return
        if not force:
            # another set of checks to see whether the target path is
            # pointing into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in pstatus.items()
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        str(conflict[0]),
                        str(parentds_path))})
                yield res
                return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = ds if isinstance(ds, Dataset) and \
        ds.path == path else Dataset(str(path))

    # don't create in non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield res
        return

    # stuff that we create and want to have tracked with git (not annex)
    add_to_git = {}

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # Note for the code below:
    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    #      Re-use tbrepo instance, do not use tbds.repo

    # create and configure desired repository
    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        tbrepo = GitRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            git_opts=initopts,
            fake_dates=fake_dates)
        # place a .noannex file to indicate annex to leave this repo alone
        stamp_path = ut.Path(tbrepo.path) / '.noannex'
        stamp_path.touch()
        add_to_git[stamp_path] = {
            'type': 'file',
            'state': 'untracked'}
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            # do not set backend here, to avoid a dedicated commit
            backend=None,
            # None causes version to be taken from config
            version=None,
            description=description,
            git_opts=initopts,
            fake_dates=fake_dates
        )
        # set the annex backend in .gitattributes as a staged change
        tbrepo.set_default_backend(
            cfg.obtain('datalad.repo.backend'),
            persistent=True, commit=False)
        add_to_git[tbrepo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'added'}
        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                cfg.obtain(
                    'datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbrepo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if not attrs.get(
                    op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbrepo.set_gitattributes(
                set_attrs,
                attrfile=op.join('.datalad', '.gitattributes'))

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbrepo.get_gitattributes('.git')
        if not attrs.get('.git', {}).get(
                'annex.largefiles', None) == 'nothing':
            tbrepo.set_gitattributes([
                ('**/.git*', {'annex.largefiles': 'nothing'})])
            # must use the repo.pathobj as this will have resolved symlinks
            add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'untracked'}

    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    # Note, must not happen earlier (before if) since "smart" it would not be
    tbds_config = tbds.config
    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note, that Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds_config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds_config.unset(id_var, where='dataset')

    if _seed is None:
        # just the standard way
        uuid_id = uuid.uuid1().urn.split(':')[-1]
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds_config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        where='dataset',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in tbds_config.overrides.items():
        tbds_config.add(k, v, where='local', reload=False)

    # all config manipulation is done -> full reload
    tbds_config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbrepo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbrepo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    for cfg_proc_ in cfg_proc:
        for r in tbds.run_procedure('cfg_' + cfg_proc_):
            yield r

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(refds, Dataset) and refds.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in refds.save(
                path=tbds.path,
        ):
            yield r

    res.update({'status': 'ok'})
    yield res
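# Usage sketch (not part of the source above): driving this `create`
# implementation through DataLad's public API. Target paths are
# hypothetical.
import datalad.api as dl

ds = dl.create('/tmp/new-dataset')                    # annex-backed dataset
sub = dl.create('/tmp/new-dataset/sub',               # registered as a
                dataset='/tmp/new-dataset')           # subdataset of ds
plain = dl.create('/tmp/git-only', annex=False)       # plain git, .noannex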
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        action=None,
        unavailable_path_status='',
        unavailable_path_msg=None,
        nondataset_path_status='error',
        force_parentds_discovery=True,
        force_subds_discovery=True,
        force_no_revision_change_discovery=True,
        force_untracked_discovery=True,
        modified=None):
    # upfront check for the fastest possible response
    if not path and dataset is None:
        # nothing given, try "here", but do not use `require_dataset`, as
        # it will determine the root dataset of `curdir` and further down
        # lead to path annotation of upstairs directories
        dataset = curdir

    if force_subds_discovery and not force_parentds_discovery:
        raise ValueError(
            'subdataset discovery requires parent dataset discovery')

    # CONCEPT: yield with no status to indicate further processing

    # everything in one big loop to be able to yield as fast as possible
    # without any precomputing for all paths
    refds_path = Interface.get_refds_path(dataset)
    if modified is not None and (
            refds_path is None or not GitRepo.is_valid_repo(refds_path)):
        raise ValueError(
            "modification detection only works with a base dataset "
            "(none given or found)")

    # prep common result props
    res_kwargs = dict(
        action=action if action else 'annotate_path',
        refds=refds_path,
        logger=lgr)

    # handle the case of recursion into a single dataset without any
    # extra fancy processing first -- full recursion can be done
    # faster than manual recursion, hence we gain quite some speed
    # from these few lines of extra code
    if not modified and not path and refds_path:
        if not GitRepo.is_valid_repo(refds_path):
            yield get_status_dict(
                # doesn't matter if the path is in another dataset
                # it was given as reference dataset
                status=nondataset_path_status,
                message='given reference dataset is not a dataset',
                path=refds_path,
                **res_kwargs)
            return

        refds = Dataset(refds_path)
        path = []
        # yield the dataset itself
        r = get_status_dict(ds=refds, status='', **res_kwargs)
        yield r

        if recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path itself
            for r in yield_recursive(
                    refds,
                    refds_path,
                    action,
                    recursion_limit):
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                yield r
        return

    # goal: structure in a way that makes most information on any path
    # available in a single pass, at the cheapest possible cost
    reported_paths = {}
    requested_paths = assure_list(path)

    if modified is not None:
        # modification detection would silently kill all nondataset paths
        # but we have to complain about them, hence doing it here
        if requested_paths and refds_path:
            for r in requested_paths:
                p = r['path'] if isinstance(r, dict) else r
                p = resolve_path(p, ds=refds_path)
                if path_startswith(p, refds_path):
                    # all good
                    continue
                # not the refds
                path_props = r if isinstance(r, dict) else {}
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with reference dataset'
                reported_paths[r] = res
                yield res

        # preserve non-existing paths to be silently killed by modification
        # detection and append them to requested_paths again after
        # detection.
        # TODO: This might be melted in with treatment of non dataset paths
        # above. Re-appending those paths seems to be better than yielding
        # directly to avoid code duplication, since both cases later on are
        # dealt with again.
        preserved_paths = []
        if requested_paths:
            [preserved_paths.append(r)
             for r in requested_paths
             if not lexists(r['path'] if isinstance(r, dict) else r)]

        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no
            # request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # do not loop over unique(), this could be a list of dicts
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue

        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled to a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if not islink(path) and
                    GitRepo.is_valid_repo(path) else 'directory')
            # this could contain all types of additional content
            containing_dir = path if not islink(path) \
                else normpath(opj(path, pardir))
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything
                # else it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or (
                        refds_path and _with_sep(oneupdir).startswith(
                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset,
                    # and only if we stay within this refds when searching
                    # for the parent
                    parent = get_dataset_root(
                        normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will
                    # determine whether a top-level dataset that was
                    # discovered gets the parent property or not, it won't
                    # get it without a common base dataset, and that is how
                    # we always rolled
                if parent and refds_path:
                    path_props['parentds'] = parent
                    # don't check whether this is actually a true
                    # subdataset of the parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether
        # there is ANY dataset, not which one is the true parent, logic
        # below relies on the fact that we end here, if there is no dataset
        # at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(
                **dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not path_startswith(dspath, refds_path):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and
                 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, is labeled a directory, or is a
            # dataset without this info -> record whether this is a known
            # subdataset to its parent
            containing_ds = Dataset(parent)
            subdss = containing_ds.subdatasets(
                fulfilled=None, recursive=False,
                result_xfm=None, result_filter=None, return_type='list')
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not
                    # here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a
            # status already
            res['status'] = path_props.get(
                'status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself, if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(parent) if containing_ds is None \
                    else containing_ds
            for r in yield_recursive(containing_ds, path, action,
                                     recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r

        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return
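# Consumption sketch (an assumption, not code from the source): the
# generator above yields plain result dicts, so callers typically filter on
# 'status' and 'type' while results stream in. Input paths here are
# hypothetical; `AnnotatePaths` is the interface class this __call__
# belongs to.
for res in AnnotatePaths.__call__(
        path=['file1.txt', 'sub/'],
        dataset='.',
        recursive=True,
        return_type='generator',
        on_failure='ignore'):
    if res.get('status') == 'error':
        print('skipping:', res.get('path'), res.get('message'))
        continue
    if res.get('type') == 'dataset' and res.get('state') == 'absent':
        print('known but uninstalled subdataset:', res['path'])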
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        action=None,
        unavailable_path_status='',
        unavailable_path_msg=None,
        nondataset_path_status='error',
        force_parentds_discovery=True,
        force_subds_discovery=True,
        force_no_revision_change_discovery=True,
        force_untracked_discovery=True,
        modified=None):
    # upfront check for the fastest possible response
    if not path and dataset is None:
        # nothing given, try "here", but do not use `require_dataset`, as
        # it will determine the root dataset of `curdir` and further down
        # lead to path annotation of upstairs directories
        dataset = curdir

    if force_subds_discovery and not force_parentds_discovery:
        raise ValueError(
            'subdataset discovery requires parent dataset discovery')

    # CONCEPT: yield with no status to indicate further processing

    # everything in one big loop to be able to yield as fast as possible
    # without any precomputing for all paths
    refds_path = Interface.get_refds_path(dataset)
    if modified is not None and (
            refds_path is None or not GitRepo.is_valid_repo(refds_path)):
        raise ValueError(
            "modification detection only works with a base dataset "
            "(none given or found)")

    # prep common result props
    res_kwargs = dict(
        action=action if action else 'annotate_path',
        refds=refds_path,
        logger=lgr)

    # handle the case of recursion into a single dataset without any
    # extra fancy processing first -- full recursion can be done
    # faster than manual recursion, hence we gain quite some speed
    # from these few lines of extra code
    if not modified and not path and refds_path:
        if not GitRepo.is_valid_repo(refds_path):
            yield get_status_dict(
                # doesn't matter if the path is in another dataset
                # it was given as reference dataset
                status=nondataset_path_status,
                message='given reference dataset is not a dataset',
                path=refds_path,
                **res_kwargs)
            return

        refds = Dataset(refds_path)
        path = []
        # yield the dataset itself
        r = get_status_dict(ds=refds, status='', **res_kwargs)
        yield r

        if recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path itself
            for r in yield_recursive(
                    refds,
                    refds_path,
                    action,
                    recursion_limit):
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                yield r
        return

    # goal: structure in a way that makes most information on any path
    # available in a single pass, at the cheapest possible cost
    reported_paths = {}
    requested_paths = assure_list(path)

    if modified is not None:
        # modification detection would silently kill all nondataset paths
        # but we have to complain about them, hence doing it here
        if requested_paths and refds_path:
            for r in requested_paths:
                p = r['path'] if isinstance(r, dict) else r
                p = resolve_path(p, ds=refds_path)
                if _with_sep(p).startswith(_with_sep(refds_path)):
                    # all good
                    continue
                # not the refds
                path_props = r if isinstance(r, dict) else {}
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with reference dataset'
                reported_paths[r] = res
                yield res

        # preserve non-existing paths to be silently killed by modification
        # detection and append them to requested_paths again after
        # detection.
        # TODO: This might be melted in with treatment of non dataset paths
        # above. Re-appending those paths seems to be better than yielding
        # directly to avoid code duplication, since both cases later on are
        # dealt with again.
        preserved_paths = []
        if requested_paths:
            [preserved_paths.append(r)
             for r in requested_paths
             if not lexists(r['path'] if isinstance(r, dict) else r)]

        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no
            # request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # do not loop over unique(), this could be a list of dicts
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue

        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled to a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if GitRepo.is_valid_repo(path)
                    else 'directory')
            # this could contain all types of additional content
            containing_dir = path
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything
                # else it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or (
                        refds_path and _with_sep(oneupdir).startswith(
                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset,
                    # and only if we stay within this refds when searching
                    # for the parent
                    parent = get_dataset_root(
                        normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will
                    # determine whether a top-level dataset that was
                    # discovered gets the parent property or not, it won't
                    # get it without a common base dataset, and that is how
                    # we always rolled
                if parent and refds_path:
                    path_props['parentds'] = parent
                    # don't check whether this is actually a true
                    # subdataset of the parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether
        # there is ANY dataset, not which one is the true parent, logic
        # below relies on the fact that we end here, if there is no dataset
        # at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(
                **dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not _with_sep(dspath).startswith(
                _with_sep(refds_path)):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and
                 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, is labeled a directory, or is a
            # dataset without this info -> record whether this is a known
            # subdataset to its parent
            containing_ds = Dataset(parent)
            subdss = containing_ds.subdatasets(
                fulfilled=None, recursive=False,
                result_xfm=None, result_filter=None, return_type='list')
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not
                    # here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a
            # status already
            res['status'] = path_props.get(
                'status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself, if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(parent) if containing_ds is None \
                    else containing_ds
            for r in yield_recursive(containing_ds, path, action,
                                     recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r

        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return
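# Aside (illustrative, not from the source): this variant guards its prefix
# checks with _with_sep() where the earlier one used path_startswith(). Both
# exist to avoid the classic substring trap shown below; `starts_within` is
# a hypothetical minimal stand-in, assuming POSIX-style separators.
def starts_within(path, prefix, sep='/'):
    # '/data/subx' must not be treated as being under '/data/sub'
    return (path + sep).startswith(prefix.rstrip(sep) + sep)

assert starts_within('/data/sub/file.txt', '/data/sub')
assert not starts_within('/data/subx', '/data/sub')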
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtain even more subdatasets)
    # 4. Shoot info of which handles to get in each subdataset to
    #    git-annex, once at the very end
    refds_path = Interface.get_refds_path(dataset)
    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path

    # remember which results we already reported, to avoid duplicates
    yielded_ds = []
    to_get = []
    unavailable_paths = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='get',
            # NOTE: Do not act upon unavailable paths yet! Done below after
            # testing which ones could be obtained
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
            # if this wasn't found, but directly requested, queue for
            # further exploration
            unavailable_paths.append(ap)
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            # do not report what hasn't arrived yet
            # also do not report the base dataset that is already
            # present -- no surprise
            yield dict(ap, status='notneeded', logger=lgr,
                       message='already installed')
            yielded_ds.append(ap['path'])
            ap['process_content'] = get_data
        to_get.append(ap)

    # explore the unknown
    for ap in sorted(unavailable_paths, key=lambda x: x['path']):
        lgr.debug("Investigate yet unavailable path %s", ap)
        # how close can we get?
        dspath = ap.get('parentds', get_dataset_root(ap['path']))
        if dspath is None:
            # nothing we can do for this path
            continue
        lgr.debug("Found containing dataset %s for path %s",
                  dspath, ap['path'])
        ds = Dataset(dspath)
        # now actually obtain whatever is necessary to get to this path
        containing_ds = [dspath]
        for res in _install_necessary_subdatasets(
                ds, ap['path'], reckless, refds_path,
                description=description):
            # yield immediately so errors could be acted upon outside,
            # before we continue
            if not (res['type'] == 'dataset' and
                    res['path'] in yielded_ds):
                # unless we reported on this dataset before
                if res['type'] == 'dataset':
                    # make a record, recursive below might now want to
                    # report a 'notneeded'
                    yielded_ds.append(res['path'])
                yield res
            # update to the current innermost dataset
            containing_ds.append(res['path'])

        if len(containing_ds) < 2:
            # no subdataset was installed, hence if the path was
            # unavailable before it still is, no need to bother git annex
            ap.update(status='impossible', message='path does not exist')
            yield ap
            continue

        # important to only do the next for the innermost subdataset
        # as the `recursive` logic below relies on that!
        # set the correct parent, for a dataset this would be the
        # second-to-last reported subdataset
        ap.update(parentds=containing_ds[-1])
        if containing_ds[-1] == ap['path']:
            # the path actually refers to the last installed dataset
            ap.update(parentds=containing_ds[-2],
                      process_content=get_data,
                      type='dataset')
        to_get.append(ap)

    # results of recursive installation of yet undiscovered datasets
    rec_get = []
    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for ap in sorted(to_get, key=lambda x: x['path']):
            if ap['type'] not in ('dataset', 'directory') or \
                    not ap.get('raw_input', False):
                # a non-directory cannot have content underneath
                # also we do NOT want to recurse into anything that was
                # specifically requested, to avoid duplication
                continue
            subds = Dataset(ap['path']
                            if ap['type'] == 'dataset'
                            else ap['parentds'])
            lgr.info(
                "Installing %s%s recursively",
                subds,
                (" underneath %s" % ap['path']
                 if subds.path != ap['path']
                 else ""))
            for res in _recursive_install_subds_underneath(
                    subds,
                    # `ap['path']` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    start=ap['path'],
                    refds_path=refds_path,
                    description=description):
                # yield immediately so errors could be acted upon
                # outside, before we continue
                if not (res['type'] == 'dataset' and
                        res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record
                        yielded_ds.append(res['path'])
                    yield res
                if not (res['status'] == 'ok' and
                        res['type'] == 'dataset'):
                    # not a dataset that was just installed, we just
                    # reported it upstairs, and can ignore it from now on
                    continue
                # paranoia, so popular these days...
                assert GitRepo.is_valid_repo(res['path'])
                # keep a copy of the install record for `get` later on
                get_ap = {k: v for k, v in res.items()
                          if not k == 'status'}
                get_ap['process_content'] = get_data
                rec_get.append(get_ap)

    if not get_data:
        # done already
        return

    # merge the two AP lists
    to_get.extend(rec_get)

    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_get,
            refds_path=refds_path)
    assert(not completed)

    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        # grab content, ignore subdataset entries
        content = [
            ap['path'] for ap in content_by_ds[ds_path]
            if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
        if not content:
            # cut this short should there be nothing
            continue
        # needs to be an annex to get content
        if not isinstance(ds.repo, AnnexRepo):
            for r in results_from_paths(
                    content, status='notneeded',
                    message="no dataset annex, content already present",
                    action='get', logger=lgr,
                    refds=refds_path):
                yield r
            continue
        respath_by_status = {}
        for res in ds.repo.get(
                content,
                options=['--from=%s' % source] if source else [],
                jobs=jobs):
            res = annexjson2result(res, ds, type='file', logger=lgr,
                                   refds=refds_path)
            success = success_status_map[res['status']]
            # TODO: in case of some failed commands (e.g. get) there might
            # be no path in the record.  yoh has only a vague idea of the
            # logic here, so this just checks for having 'path'; but
            # according to results_from_annex_noinfo, it would then be
            # assumed that `content` was acquired successfully, which is
            # not the case
            if 'path' in res:
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
            yield res

        for r in results_from_annex_noinfo(
                ds,
                content,
                respath_by_status,
                dir_fail_msg='could not get some content in %s %s',
                noinfo_dir_msg='nothing to get from %s',
                noinfo_file_msg='already present',
                action='get',
                logger=lgr,
                refds=refds_path):
            yield r
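# Usage sketch (hypothetical paths; assumes the standard public API): the
# generator above backs `datalad get`, which can be driven like this.
import datalad.api as dl

results = dl.get(
    'data/raw',            # file or directory inside a dataset
    dataset='.',
    recursive=True,
    jobs=4,                # forwarded to `git annex get --jobs`
    source='origin',       # becomes `--from=origin`
    on_failure='ignore',
    return_type='list')
fetched = [r['path'] for r in results if r.get('status') == 'ok']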
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):
    if dataset:
        dataset = require_dataset(
            dataset, check_installed=False, purpose='removal')
        if not dataset.is_installed() and not path:
            # all done already
            return []
        if not path:
            # act on the whole dataset if nothing else was specified
            path = dataset.path if isinstance(dataset, Dataset) else dataset

    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive)

    nonexistent_paths = []
    for p in unavailable_paths:
        # we need to check whether any of these correspond
        # to a known subdataset, and add those to the list of
        # things to be removed
        toppath = get_dataset_root(p)
        if not toppath:
            nonexistent_paths.append(p)
            continue
        if p in Dataset(toppath).get_subdatasets(
                recursive=False, absolute=True):
            # this is a known subdataset that needs to be removed
            pl = content_by_ds.get(p, [])
            pl.append(p)
            content_by_ds[p] = pl
    if nonexistent_paths:
        lgr.warning("ignoring non-existent path(s): %s",
                    nonexistent_paths)

    if path_is_under(content_by_ds):
        # behave like `rm` and refuse to remove where we are
        raise ValueError(
            "refusing to uninstall current or parent directory")

    handle_dirty_datasets(
        content_by_ds, mode=if_dirty, base=dataset)

    ds2save = set()
    results = []
    # iterate over all datasets, starting at the bottom
    # to make the removal of dataset content known upstairs
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        if ds_path in paths:
            # entire dataset needs to go
            superds = ds.get_superdataset(
                datalad_only=False,
                topmost=False)
            res = _uninstall_dataset(ds, check=check, has_super=False)
            results.extend(res)
            if ds.path in ds2save:
                # we just uninstalled it, no need to save anything
                ds2save.discard(ds.path)
            if not superds:
                continue
            subds_relpath = relpath(ds_path, start=superds.path)
            # remove submodule reference
            submodule = [sm for sm in superds.repo.repo.submodules
                         if sm.path == subds_relpath]
            # there can only be one!
            assert(len(submodule) == 1)
            submodule = submodule[0]
            submodule.remove()
            if exists(ds_path):
                # could be an empty dir in case an already uninstalled
                # subdataset got removed
                os.rmdir(ds_path)
            # need to save changes to .gitmodules later
            content_by_ds[superds.path] = \
                content_by_ds.get(superds.path, []) \
                + [opj(superds.path, '.gitmodules'),
                   ds_path]
            ds2save.add(superds.path)
        else:
            if check and hasattr(ds.repo, 'drop'):
                _drop_files(ds, paths, check=True)
            results.extend(ds.repo.remove(paths, r=True))
            ds2save.add(ds.path)

    if dataset and dataset.is_installed():
        # forge chain from base dataset to any leaf dataset
        # in order to save state changes all the way up
        _discover_trace_to_known(dataset.path, [], content_by_ds)

    save_dataset_hierarchy(
        content_by_ds,
        base=dataset.path if dataset and dataset.is_installed() else None,
        message='[DATALAD] removed content')
    return results
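# Usage sketch for this older `remove` implementation (paths hypothetical;
# the keyword names mirror the signature above, not any later API).
import datalad.api as dl

# drop a whole subdataset, verifying annexed content can be dropped safely
dl.remove(path='sub/obsolete', dataset='.', check=True)

# remove plain file content recursively, saving dirty datasets first
dl.remove(path='derivatives', dataset='.', recursive=True,
          if_dirty='save-before')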
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtain even more subdatasets)
    # 4. Shoot info of which handles to get in each subdataset to
    #    git-annex, once at the very end
    refds_path = Interface.get_refds_path(dataset)
    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path

    # remember which results we already reported, to avoid duplicates
    yielded_ds = []
    to_get = []
    unavailable_paths = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='get',
            # NOTE: Do not act upon unavailable paths yet! Done below after
            # testing which ones could be obtained
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
            # if this wasn't found, but directly requested, queue for
            # further exploration
            unavailable_paths.append(ap)
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            # do not report what hasn't arrived yet
            # also do not report the base dataset that is already
            # present -- no surprise
            yield dict(ap, status='notneeded', logger=lgr,
                       message='already installed')
            yielded_ds.append(ap['path'])
            ap['process_content'] = get_data
        to_get.append(ap)

    # explore the unknown
    for ap in sorted(unavailable_paths, key=lambda x: x['path']):
        lgr.debug("Investigate yet unavailable path %s", ap)
        # how close can we get?
        dspath = ap.get('parentds', get_dataset_root(ap['path']))
        if dspath is None:
            # nothing we can do for this path
            continue
        lgr.debug("Found containing dataset %s for path %s",
                  dspath, ap['path'])
        ds = Dataset(dspath)
        # now actually obtain whatever is necessary to get to this path
        containing_ds = [dspath]
        for res in _install_necessary_subdatasets(
                ds, ap['path'], reckless, refds_path,
                description=description):
            # yield immediately so errors could be acted upon outside,
            # before we continue
            if not (res['type'] == 'dataset' and
                    res['path'] in yielded_ds):
                # unless we reported on this dataset before
                if res['type'] == 'dataset':
                    # make a record, recursive below might now want to
                    # report a 'notneeded'
                    yielded_ds.append(res['path'])
                yield res
            # update to the current innermost dataset
            containing_ds.append(res['path'])

        if len(containing_ds) < 2:
            # no subdataset was installed, hence if the path was
            # unavailable before it still is, no need to bother git annex
            ap.update(status='impossible', message='path does not exist')
            yield ap
            continue

        # important to only do the next for the innermost subdataset
        # as the `recursive` logic below relies on that!
        # set the correct parent, for a dataset this would be the
        # second-to-last reported subdataset
        ap.update(parentds=containing_ds[-1])
        if containing_ds[-1] == ap['path']:
            # the path actually refers to the last installed dataset
            ap.update(parentds=containing_ds[-2],
                      process_content=get_data,
                      type='dataset')
        to_get.append(ap)

    # results of recursive installation of yet undiscovered datasets
    rec_get = []
    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for ap in sorted(to_get, key=lambda x: x['path']):
            if ap['type'] not in ('dataset', 'directory') or \
                    not ap.get('raw_input', False):
                # a non-directory cannot have content underneath
                # also we do NOT want to recurse into anything that was
                # specifically requested, to avoid duplication
                continue
            subds = Dataset(ap['path']
                            if ap['type'] == 'dataset'
                            else ap['parentds'])
            lgr.info(
                "Installing %s%s recursively",
                subds,
                (" underneath %s" % ap['path']
                 if subds.path != ap['path']
                 else ""))
            for res in _recursive_install_subds_underneath(
                    subds,
                    # `ap['path']` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    start=ap['path'],
                    refds_path=refds_path,
                    description=description):
                # yield immediately so errors could be acted upon
                # outside, before we continue
                if not (res['type'] == 'dataset' and
                        res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record
                        yielded_ds.append(res['path'])
                    yield res
                if not (res['status'] == 'ok' and
                        res['type'] == 'dataset'):
                    # not a dataset that was just installed, we just
                    # reported it upstairs, and can ignore it from now on
                    continue
                # paranoia, so popular these days...
                assert GitRepo.is_valid_repo(res['path'])
                # keep a copy of the install record for `get` later on
                get_ap = {k: v for k, v in res.items()
                          if not k == 'status'}
                get_ap['process_content'] = get_data
                rec_get.append(get_ap)

    if not get_data:
        # done already
        return

    # merge the two AP lists
    to_get.extend(rec_get)

    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_get,
            refds_path=refds_path)
    assert(not completed)

    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        # grab content, ignore subdataset entries
        content = [
            ap['path'] for ap in content_by_ds[ds_path]
            if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
        if not content:
            # cut this short should there be nothing
            continue
        # needs to be an annex to get content
        if not isinstance(ds.repo, AnnexRepo):
            for r in results_from_paths(
                    content, status='notneeded',
                    message="no dataset annex, content already present",
                    action='get', logger=lgr,
                    refds=refds_path):
                yield r
            continue
        respath_by_status = {}
        for res in ds.repo.get(
                content,
                options=['--from=%s' % source] if source else [],
                jobs=jobs):
            res = annexjson2result(res, ds, type='file', logger=lgr,
                                   refds=refds_path)
            success = success_status_map[res['status']]
            # TODO: in case of some failed commands (e.g. get) there might
            # be no path in the record.  yoh has only a vague idea of the
            # logic here, so this just checks for having 'path'; but
            # according to results_from_annex_noinfo, it would then be
            # assumed that `content` was acquired successfully, which is
            # not the case
            if 'path' in res:
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
            yield res

        for r in results_from_annex_noinfo(
                ds,
                content,
                respath_by_status,
                dir_fail_msg='could not get some content in %s %s',
                noinfo_dir_msg='nothing to get from %s',
                noinfo_file_msg='already present',
                action='get',
                logger=lgr,
                refds=refds_path):
            yield r
def from_config_files(cls, files=None, reload=False):
    """Loads information about related/possible websites requiring
    authentication from:

    - datalad/downloaders/configs/*.cfg files provided by the codebase
    - the current dataset's .datalad/providers/
    - the user's home directory (i.e. ~/.config/datalad/providers/*.cfg)
    - the system-wide datalad installation/config
      (i.e. /etc/datalad/providers/*.cfg)

    For sample config files see datalad/downloaders/configs/providers.cfg

    If files is None, loading is cached between calls.  Specify reload=True
    to force reloading of files from the filesystem.  The class method
    reset_default_providers can also be called to reset the cached
    providers.
    """
    # lazy part
    dsroot = get_dataset_root("")
    if files is None and cls._DEFAULT_PROVIDERS and not reload \
            and dsroot == cls._DS_ROOT:
        return cls._DEFAULT_PROVIDERS

    config = SafeConfigParserWithIncludes()
    files_orig = files
    if files is None:
        cls._DS_ROOT = dsroot
        files = []
        for p in cls._get_providers_dirs(dsroot).values():
            files.extend(cls._get_configs(p))
    config.read(files)

    # We need first to load Providers and credentials
    # Order matters, because we need to ensure that when
    # there's a conflict between configuration files declared
    # at different precedence levels (i.e. dataset vs system)
    # the appropriate precedence config wins.
    providers = OrderedDict()
    credentials = {}

    for section in config.sections():
        if ':' in section:
            type_, name = section.split(':', 1)
            assert type_ in {'provider', 'credential'}, \
                "we know only providers and credentials, got type %s" % type_
            items = {
                o: config.get(section, o) for o in config.options(section)
            }
            # side-effect -- items get popped
            locals().get(type_ + "s")[name] = getattr(
                cls, '_process_' + type_)(name, items)
            if len(items):
                raise ValueError("Unprocessed fields left for %s: %s"
                                 % (name, str(items)))
        else:
            lgr.warning("Do not know how to treat section %s here" % section)

    # link credentials into providers
    lgr.debug("Assigning credentials into %d providers", len(providers))
    for provider in providers.values():
        if provider.credential:
            if provider.credential not in credentials:
                raise ValueError(
                    "Unknown credential %s. Known are: %s"
                    % (provider.credential, ", ".join(credentials.keys())))
            provider.credential = credentials[provider.credential]
            # TODO: Is this the right place to pass dataset to credential?
            provider.credential.set_context(dataset=cls._DS_ROOT)

    providers = Providers(list(providers.values()))

    if files_orig is None:
        # Store providers for lazy access
        cls._DEFAULT_PROVIDERS = providers

    return providers
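# Illustration (an assumption based on the loading logic above): a provider
# plus credential pair as it could appear in one of the *.cfg files, and how
# the cached class method is typically invoked. Section names follow the
# 'provider:' / 'credential:' split performed above; the field names are a
# hypothetical example, not a verified schema.
#
#   [provider:example-data]
#   url_re = https://example\.org/data/.*
#   authentication_type = http_basic_auth
#   credential = example-login
#
#   [credential:example-login]
#   type = user_password
#
from datalad.downloaders.providers import Providers

providers = Providers.from_config_files()              # cached between calls
providers = Providers.from_config_files(reload=True)   # force a re-read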
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        reckless=False,
        git_opts=None,
        annex_opts=None,
        annex_get_opts=None,
        jobs=None,
        verbose=False,
        # internal -- instead of returning 'get'ed items, return final
        # content_by_ds, unavailable_paths.  To be used by the call from
        # Install.__call__ and done so to avoid creating another reusable
        # function which would need to duplicate all this heavy list of
        # kwargs
        _return_datasets=False
):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtain even more subdatasets)
    # 4. Shoot info of which handles to get in each subdataset to
    #    git-annex, once at the very end

    dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset
    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset_path

    # use lookup cache -- we need that info further down
    dir_lookup = {}

    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit,
        dir_lookup=dir_lookup)

    # explore the unknown
    for path in sorted(unavailable_paths):
        # how close can we get?
        dspath = get_dataset_root(path)
        if dspath is None:
            # nothing we can do for this path
            continue
        ds = Dataset(dspath)
        # must always yield a dataset -- we sorted out the ones outside
        # any dataset at the very top
        assert ds.is_installed()
        # now actually obtain whatever is necessary to get to this path
        containing_ds = install_necessary_subdatasets(ds, path, reckless)
        if containing_ds.path != ds.path:
            lgr.debug("Installed %s to fulfill request for content for "
                      "path %s", containing_ds, path)
            # mark resulting dataset as auto-installed
            if containing_ds.path == path:
                # we had to get the entire dataset, not something within
                # mark that it just appeared
                content_by_ds[path] = [curdir]
            else:
                # we need to get content within
                content_by_ds[path] = [path]

    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for subdspath in sorted(content_by_ds.keys()):
            for content_path in content_by_ds[subdspath]:
                if not isdir(content_path):
                    # a non-directory cannot have content underneath
                    continue
                subds = Dataset(subdspath)
                lgr.info(
                    "Obtaining %s %s recursively",
                    subds,
                    ("underneath %s" % content_path
                     if subds.path != content_path
                     else ""))
                cbysubds = _recursive_install_subds_underneath(
                    subds,
                    # `content_path` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    # protect against magic marker misinterpretation
                    # only relevant for _get, hence replace here
                    start=content_path if content_path != curdir else None)
                # gets file content for all freshly installed subdatasets
                content_by_ds.update(cbysubds)

    ## we have now done everything we could to obtain whatever subdataset
    ## to get something on the file system for previously unavailable paths
    ## check and sort one last time
    content_by_ds, unavailable_paths, nondataset_paths = \
        get_paths_by_dataset(
            unavailable_paths,
            recursive=recursive,
            recursion_limit=recursion_limit,
            out=content_by_ds,
            dir_lookup=dir_lookup)

    if nondataset_paths:
        # XXX likely can never get here
        lgr.warning(
            "ignored paths that do not belong to any dataset: %s",
            nondataset_paths)

    if unavailable_paths:
        lgr.warning('ignored non-existing paths: %s', unavailable_paths)

    # hand over to git-annex
    results = list(chain.from_iterable(
        _get(content_by_ds, refpath=dataset_path, source=source, jobs=jobs,
             get_data=get_data)))

    # ??? should we, in the _return_datasets case, just return both
    # content_by_ds and unavailable_paths, so we provide output that is
    # consistent across runs, and then issue a similar
    # IncompleteResultsError outside?
    if unavailable_paths:  # and likely other error flags
        if _return_datasets:
            results = sorted(set(content_by_ds).difference(unavailable_paths))
        raise IncompleteResultsError(results, failed=unavailable_paths)
    else:
        return sorted(content_by_ds) if _return_datasets else results
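# Caller-side sketch (hypothetical path): since this variant raises
# IncompleteResultsError instead of yielding per-path error results,
# callers inspect `.failed` on the exception.
from datalad.support.exceptions import IncompleteResultsError
import datalad.api as dl

try:
    dl.get('data/maybe-missing', dataset='.')
except IncompleteResultsError as exc:
    print('could not obtain:', exc.failed)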
def __call__(path=None,
             initopts=None,
             *,
             force=False,
             description=None,
             dataset=None,
             annex=True,
             fake_dates=False,
             cfg_proc=None):
    # we only perform negative tests below
    no_annex = not annex

    if dataset:
        if isinstance(dataset, Dataset):
            ds = dataset
        else:
            ds = Dataset(dataset)
        refds_path = ds.path
    else:
        ds = refds_path = None

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError(
                "Incompatible arguments: cannot specify a description "
                "for an annex repo while also declaring no annex repo.")

    if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
            isinstance(initopts, dict) and 'bare' in initopts):
        raise ValueError(
            "Creation of bare repositories is not supported. Consider "
            "one of the create-sibling commands, or use "
            "Git to init a bare repository and push an existing dataset "
            "into it.")

    if path:
        path = resolve_path(path, dataset)

    path = path if path \
        else getpwd() if ds is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert (path is not None)

    # assure cfg_proc is a list (relevant if used via Python API)
    cfg_proc = ensure_list(cfg_proc)

    # prep for yield
    res = dict(action='create', path=str(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != str(path):
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='create a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    ds, str(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = get_dataset_root(
        op.normpath(op.join(str(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if (not pstatus.get(check_path, {}).get("type") == "dataset"
                and any(check_path == p or check_path in p.parents
                        for p in pstatus)):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    str(parentds_path),
                    [str(c) for c in conflict])})
            yield res
            return
        if not force:
            # another set of checks to see whether the target path is
            # pointing into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in pstatus.items()
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        str(conflict[0]),
                        str(parentds_path))})
                yield res
                return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = ds if isinstance(ds, Dataset) and \
        ds.path == path else Dataset(str(path))

    # don't create in a non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`--force` option to ignore'})
        yield res
        return

    # Check if specified cfg_proc(s) can be discovered, storing
    # the results so they can be used when the time comes to run
    # the procedure. If a procedure cannot be found, raise an
    # error to prevent creating the dataset.
    cfg_proc_specs = []
    if cfg_proc:
        discovered_procs = tbds.run_procedure(
            discover=True,
            result_renderer='disabled',
            return_type='generator',
        )
        for cfg_proc_ in cfg_proc:
            for discovered_proc in discovered_procs:
                if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                    cfg_proc_specs.append(discovered_proc)
                    break
            else:
                raise ValueError(
                    "Cannot find procedure with name '%s'" % cfg_proc_)

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # Note for the code below:
    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    #      Re-use the tbrepo instance, do not use tbds.repo

    # create and configure desired repository
    # also provides initial set of content to be tracked with git (not annex)
    if no_annex:
        tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
    else:
        tbrepo, add_to_git = _setup_annex_repo(
            path, initopts, fake_dates, description)

    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    # Note, must not happen earlier (before the if) since "smart" it would
    # not be
    tbds_config = tbds.config

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note, that the Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds_config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds_config.unset(id_var, scope='branch')

    if _seed is None:
        # just the standard way
        # use a fully random identifier (i.e. UUID version 4)
        uuid_id = str(uuid.uuid4())
    else:
        # let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds_config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        scope='branch',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in tbds_config.overrides.items():
        tbds_config.add(k, v, scope='local', reload=False)

    # all config manipulation is done -> full reload
    tbds_config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbrepo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbrepo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state
        _status=add_to_git,
    )

    for cfg_proc_spec in cfg_proc_specs:
        yield from tbds.run_procedure(
            cfg_proc_spec,
            result_renderer='disabled',
            return_type='generator',
        )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(refds, Dataset) and refds.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        yield from refds.save(
            path=tbds.path,
            return_type='generator',
            result_renderer='disabled',
        )

    res.update({'status': 'ok'})
    yield res
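# --- Usage sketch (illustrative only, not part of the implementation) ---
# A minimal example of driving the creation routine above through the
# high-level Python API; the target paths are made up, and this assumes the
# stock `datalad.api` entry point and the shipped `cfg_text2git` procedure.
import datalad.api as dl

# a fresh, annex-backed dataset, with a config procedure applied afterwards
ds = dl.create('/tmp/demo-ds', cfg_proc='text2git')

# a plain-git dataset; passing `description` here would be rejected,
# since a description only makes sense for an annex repo
plain = dl.create('/tmp/demo-plain', annex=False)

# with `dataset` given, the new dataset is registered as a subdataset
# in the parent via the final `refds.save()` call above
subds = dl.create(dataset=ds, path='sub')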
def __call__(
        path=None,
        dataset=None,
        # support passing this through on a path-by-path basis
        to_git=None,
        save=True,
        message=None,
        message_file=None,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    refds_path = Interface.get_refds_path(dataset)
    common_report = dict(action='add', logger=lgr, refds=refds_path)
    if message and message_file:
        raise ValueError("Both a message and message file were specified")

    if message_file:
        with open(message_file, "rb") as mfh:
            message = assure_unicode(mfh.read())

    to_add = []
    subds_to_add = {}
    ds_to_annotate_from_recursion = {}
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=dataset,
            # never recursion, need to handle manually below to be able to
            # discover untracked content
            recursive=False,
            action='add',
            # speed things up by using Git's modification detection, if there
            # is a repo with at least one commit
            modified='HEAD' \
            if dataset and \
            GitRepo.is_valid_repo(refds_path) and \
            GitRepo(refds_path).get_hexsha() \
            else None,
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        got_nothing = False
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('parentds', None) is None and \
                ap.get('type', None) != 'dataset':
            yield get_status_dict(
                status='impossible',
                message='there is no dataset to add this path to',
                **dict(common_report, **ap))
            continue
        if ap.get('type', None) == 'directory' and \
                ap.get('state', None) == 'untracked' and \
                GitRepo.is_valid_repo(ap['path']):
            # this is an untracked wannabe subdataset in disguise
            ap['type'] = 'dataset'
        if recursive and \
                (ap.get('raw_input', False) or
                 ap.get('state', None) in
                 ('added', 'modified', 'untracked')) and \
                (ap.get('parentds', None) or
                 ap.get('type', None) == 'dataset'):
            # this was an actually requested input path, or a path that was
            # found modified by path annotation, based on an input argument
            # we need to recurse into all subdirs to find potentially
            # unregistered subdatasets
            # but only if this path has a parent, or is itself a dataset
            # otherwise there is nothing to add to
            _discover_subdatasets_recursively(
                ds_to_annotate_from_recursion,
                ap['path'],
                [ap['parentds'] if 'parentds' in ap else ap['path']],
                recursion_limit)
            # get the file content of the root dataset of this search added
            # too, but be careful with extreme recursion_limit settings
            if recursion_limit is None or recursion_limit > 0:
                ap['process_content'] = True
        # record for further processing,
        # unless it was somehow already discovered
        if ap['path'] not in ds_to_annotate_from_recursion:
            to_add.append(ap)
    if got_nothing:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'add',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    for subds in ds_to_annotate_from_recursion:
        if subds not in subds_to_add:
            # always prefer the already annotated path
            subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

    if dataset:
        # we have a base dataset, discover any intermediate datasets between
        # the base and any already discovered dataset
        discovered = {}
        discover_dataset_trace_to_targets(
            # from here
            dataset.path,
            # to any dataset we are aware of
            subds_to_add.keys(),
            [],
            discovered)
        for parentds in discovered:
            for subds in discovered[parentds]:
                subds_to_add[subds] = subds_to_add.get(
                    subds,
                    dict(path=subds, parentds=parentds, type='dataset'))

    # merge custom paths and discovered dataset records, paths need to go
    # first, because we know the most about them, and the subsequent
    # annotation call will skip the later duplicates
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert (not completed)

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue
            subds = Dataset(ap['path'])
            subds_relpath = relpath(ap['path'], ds_path)
            # register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=None, name=None)
            except (CommandError, InvalidGitRepositoryError) as e:
                yield get_status_dict(
                    ds=subds,
                    status='error',
                    message=e.stderr,
                    **dict(common_report, **ap))
                continue
            # queue for saving using the updated annotated path
            ap['registered_subds'] = True
            # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules and was not
            # staged, whereas another submodule on the same level in the same
            # superdataset which also has one file in it was staged
            # keep it disabled to work correctly, while paying a little bit
            # of slow down
            #ap['staged'] = True
            to_save.append(ap)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
            # make sure that .gitmodules is added to the list of files
            gitmodules_path = opj(ds.path, '.gitmodules')
            # for git
            torepoadd[gitmodules_path] = dict(path=gitmodules_path)
            # and for save
            to_save.append(dict(
                path=gitmodules_path,
                parentds=ds_path,
                type='file'))
        # make sure any last-minute additions make it to the saving stage
        # XXX? should content_by_ds become an OrderedDict so that a possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add_(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw
        )
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out the correct ap for any path that comes out here
                # (that we know things about), and use the original
                # annotation instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path

            if success:
                # this was successfully added, queue for saving this very
                # path in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to
                # save stuff that is marked state='untracked')
                to_save.append({k: v for k, v in res.items()
                                if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual
                # content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX Actually not sure if this can really happen
                # (depends on what our low-level code would do)
                # but worst case is that we lose a little bit of
                # coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out the correct ap for any path that comes out here
                # (that we know things about), and use the original
                # annotation instead of just the annex report
                r = dict(r, **torepoadd[r['path']])

            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't
                # need to be added, but it should be saved/committed if so
                # desired
                to_save.append({k: v for k, v in r.items()
                                if k not in ('status', 'state')})

            # XXX something is fishy with the next one,
            # rethink when sober....
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly
                # requested, make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append(
                    {k: v for k, v in r.items() if k != 'status'})
            yield r

        if refds_path and ds_path != refds_path and \
                len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here when `add('.')` is used and
            # annex ignores any dotfiles. In this case we end up not saving
            # a dataset completely, because we rely on accurate reporting.
            # There is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have
            # anything done to this dataset -> queue the dataset itself for
            # saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset
                # itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`

    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do
    # with them -- this costs something, but should be safer, and frankly
    # is more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
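# A sketch of how the `add` implementation above is typically consumed from
# Python, assuming the classic `datalad.api.add` entry point; the file names
# are illustrative. `to_git=True` puts content directly into git instead of
# the annex, and `save=False` stages content without creating a commit.
import datalad.api as dl

# add and commit a (large) file, annexing it by default in an annex repo
dl.add(path='data/raw.bin', dataset='.',
       message='[DATALAD] add raw data')

# put a small text file directly into git, but defer the save/commit
dl.add(path='README.md', dataset='.', to_git=True, save=False)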
def _install_targetpath(
        ds,
        target_path,
        recursive,
        recursion_limit,
        reckless,
        refds_path,
        description,
        jobs=None,
):
    """Helper to install as many subdatasets as needed to verify existence
    of a target path

    Parameters
    ----------
    ds : Dataset
      Locally available dataset that contains the target path
    target_path : Path
    """
    # if it is an empty dir, it could still be a subdataset that is missing
    if (target_path.is_dir() and any(target_path.iterdir())) or \
            (not target_path.is_dir()
             and (target_path.is_symlink() or target_path.exists())):
        yield dict(
            action='get',
            type='dataset',
            # this cannot just be the dataset path, as the original
            # situation of datasets available on disk can have changed due
            # to subdataset installation. It has to be the actual subdataset
            # it resides in, because this value is used to determine which
            # dataset to call `annex-get` on
            # TODO stringification is a PY35 compatibility kludge
            path=get_dataset_root(str(target_path)),
            status='notneeded',
            contains=[target_path],
            refds=refds_path,
        )
    else:
        # we don't have it yet. is it in a subdataset?
        for res in _install_necessary_subdatasets(
                ds, target_path, reckless, refds_path,
                description=description):
            if (target_path.is_symlink() or target_path.exists()):
                # this dataset brought the path, mark for annex
                # processing outside
                res['contains'] = [target_path]
            # just spit it out
            yield res
        if not (target_path.is_symlink() or target_path.exists()):
            # looking for subdatasets did not help -> all hope is lost
            yield dict(
                action='get',
                path=str(target_path),
                status='impossible',
                refds=refds_path,
                message='path does not exist',
            )
            return
    # we have the target path
    if not (recursive
            #and not recursion_limit == 'existing' \
            and target_path.is_dir()):
        # obtain any subdatasets underneath the paths given
        # a non-directory cannot have content underneath
        return
    if recursion_limit == 'existing':
        for res in ds.subdatasets(
                fulfilled=True,
                path=target_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                return_type='generator'):
            res.update(
                contains=[Path(res['path'])],
                action='get',
                status='notneeded',
            )
            yield res
        return
    lgr.info(
        "Installing %s%s recursively",
        ds,
        (" to get %s" % target_path
         if ds.path != target_path
         else ""))
    for res in _recursive_install_subds_underneath(
            ds,
            # target_path was explicitly given as input
            # we count recursions from the input, hence we
            # can start with the full number
            recursion_limit,
            reckless,
            # TODO keep Path when RF is done
            start=str(target_path),
            refds_path=refds_path,
            description=description,
            jobs=jobs,
    ):
        # yield immediately so errors could be acted upon
        # outside, before we continue
        res.update(
            # do not override the reported action, could be anything
            #action='get',
            contains=[Path(res['path'])],
        )
        yield res
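# An illustrative (hypothetical) consumption of the helper above, mirroring
# how the surrounding `get` machinery drives it: records are inspected as
# they are yielded, and any record carrying a `contains` key names the
# dataset in which the subsequent annex transfer should run. `ds` and the
# target path beneath it are assumed to exist.
from pathlib import Path

for res in _install_targetpath(
        ds, Path(ds.path) / 'sub' / 'file.dat',
        recursive=False, recursion_limit=None,
        reckless=None, refds_path=ds.path, description=None):
    if res.get('status') in ('error', 'impossible'):
        # act on failures immediately, before further traversal
        lgr.warning('cannot obtain %s: %s',
                    res.get('path'), res.get('message'))
    elif 'contains' in res:
        # res['path'] is the dataset root to call `annex-get` in
        lgr.debug('%s is provided by dataset at %s',
                  res['contains'], res['path'])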