def test_add_files(path):
    """Save files singly, as a list, and via their directory, then check
    that the reported records carry an annex 'key' exactly when the content
    was not requested to go to Git."""
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [op.join('dir', 'testindir'),
                   op.join('dir', OBSCURE_FILENAME)]

    # (what to save, whether to save it directly to Git)
    scenarios = [
        (test_list_1[0], False),
        (test_list_2[0], True),
        (test_list_3, False),
        (test_list_4, False),
    ]
    for target, to_git in scenarios:
        if target == test_list_4:
            # special case 4: give the dir instead of the individual files
            result = ds.save('dir', to_git=to_git)
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(target, to_git=to_git)
            for fname in ensure_list(target):
                assert_result_count(result, 1, path=str(ds.pathobj / fname))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in ensure_list(target))

        for fpath, props in status.items():
            key = props.get('key', None)
            if to_git:
                assert key is None, fpath
            else:
                assert key is not None, fpath
def load_extensions():
    """Call the entrypoint loader of every configured extension package.

    The extensions to load are read from the 'datalad.extensions.load'
    configuration item. A warning is logged for each requested extension
    that is not among the 'datalad.extensions' entrypoints, and for each
    one whose loader raises an exception.
    """
    from datalad import cfg
    requested = cfg.get('datalad.extensions.load', get_all=True)
    if not requested:
        # nothing configured, nothing to do
        return

    from datalad.utils import ensure_list
    # map extension name -> loader callable
    loaders = {}
    for ename, _, eload in iter_entrypoints('datalad.extensions'):
        loaders[ename] = eload

    for name in ensure_list(requested):
        if name not in loaders:
            lgr.warning('Requested extension %r is not available', name)
            continue
        try:
            loaders[name]()
        except Exception as e:
            # a broken extension must not prevent loading the others
            ce = CapturedException(e)
            lgr.warning('Could not load extension %r: %s', name, ce)
def _fake_json_for_non_existing(paths, cmd):
    """Create faked JSON records for nonexisting paths provided by `paths`
    after running `cmd`.

    Internal helper for `AnnexRepo._call_annex_records`.

    Parameters
    ----------
    paths: str or list of str
      paths to create annex-like JSON records for, communicating that the
      path is unknown.
    cmd: str
      annex cmd for which to fake this result

    Returns
    -------
    list of dict
      One failure record per path.
    """
    records = []
    for fpath in ensure_list(paths):
        records.append({
            "command": cmd,
            "file": fpath,
            "note": "not found",
            "success": False,
            # Note, that git's and annex' reporting here differs by config
            # and command on whether they say "does not exist" or "did not
            # match any file known to git".
            "error-messages": ["File unknown to git"],
        })
    return records
def to_str(self, include_output=True):
    """Render this error as a single string.

    The string starts with the class name and is followed, for each
    attribute that is set, by the quoted command, the exit code, the
    working directory and the message.

    Parameters
    ----------
    include_output: bool, optional
      If True, stripped stdout/stderr and the names of any additional
      keyword arguments stored in `self.kwargs` are appended as well.

    Returns
    -------
    str
    """
    from datalad.utils import (
        ensure_unicode,
        ensure_list,
        quote_cmdlinearg,
    )
    pieces = ["{}: ".format(self.__class__.__name__)]
    if self.cmd:
        # go for a compact, normal looking, properly quoted
        # command rendering
        rendered_cmd = ' '.join(
            quote_cmdlinearg(c) for c in ensure_list(self.cmd))
        pieces.append("'{}'".format(rendered_cmd))
    if self.code:
        pieces.append(" failed with exitcode {}".format(self.code))
    if self.cwd:
        # only if not under standard PWD
        pieces.append(" under {}".format(self.cwd))
    if self.msg:
        # typically a command error has no specific idea
        pieces.append(" [{}]".format(ensure_unicode(self.msg)))

    if include_output:
        if self.stdout:
            pieces.append(" [out: '{}']".format(
                ensure_unicode(self.stdout).strip()))
        if self.stderr:
            pieces.append(" [err: '{}']".format(
                ensure_unicode(self.stderr).strip()))
        if self.kwargs:
            pieces.append(" [info keys: {}]".format(
                ', '.join(self.kwargs.keys())))
    return ''.join(pieces)
def results_from_paths(paths, action=None, type=None, logger=None, refds=None,
                       status=None, message=None):
    """
    Helper to yield analog result dicts for each path in a sequence.

    Parameters
    ----------
    paths: path or list of paths
      Each path gets its own result record; a single value is treated as a
      one-item list.
    action, type, logger, refds, status
      Passed on unchanged to `get_status_dict()` for every record.
    message: str, optional
      A result message. May contain `%s` which will be replaced by the
      respective `path`. With no message (the default) the records carry
      `message=None`.

    Returns
    -------
    generator
    """
    for p in ensure_list(paths):
        # Only a message with a placeholder is paired with the path. The
        # explicit None check matters: `'%s' in None` raises a TypeError,
        # which made the default `message=None` unusable.
        with_path = message is not None and '%s' in message
        yield get_status_dict(
            action, path=p, type=type, logger=logger, refds=refds,
            status=status,
            message=(message, p) if with_path else message)
def close(self, allow_fail=True, ctrl_path=None):
    """Closes all connections, known to this instance.

    Only connections are closed whose control path is not listed in
    `self._prev_connections` (i.e. that were not found pre-existing), whose
    control path exists, and -- if `ctrl_path` is given -- whose control
    path is among the requested ones.

    Parameters
    ----------
    allow_fail: bool, optional
      If True, an exception raised while closing a connection propagates
      to the caller. If False, such exceptions are caught and only logged
      at DEBUG level.
      NOTE(review): earlier documentation described the opposite meaning;
      this text follows what the implementation does -- confirm against
      callers before changing either.
    ctrl_path: str, Path, or list of str or Path, optional
      If specified, only the path(s) provided would be considered
    """
    if not self._connections:
        return

    ctrl_paths = [Path(p) for p in ensure_list(ctrl_path)]
    to_close = [
        c for c in self._connections
        # don't close if connection wasn't opened by SSHManager
        if self._connections[c].ctrl_path not in self._prev_connections
        and self._connections[c].ctrl_path.exists()
        and (not ctrl_paths
             or self._connections[c].ctrl_path in ctrl_paths)
    ]
    if to_close:
        # let logging do the formatting, only if the record is emitted
        lgr.debug("Closing %d SSH connections...", len(to_close))
    for cnct in to_close:
        f = self._connections[cnct].close
        if allow_fail:
            f()
        else:
            try:
                f()
            except Exception as exc:
                lgr.debug("Failed to close a connection: "
                          "%s", exc_str(exc))
    # the registry is reset as a whole, also when `ctrl_path` limited
    # which connections were actually closed
    self._connections = dict()
def __call__(module=None, verbose=False, nocapture=False, pdb=False, stop=False):
    # Run `datalad.test()` for each requested module. Without an explicit
    # selection, 'datalad' plus every module registered under the
    # 'datalad.tests' entry point group is tested.
    if not module:
        from pkg_resources import iter_entry_points
        module = ['datalad'] + [
            ep.module_name for ep in iter_entry_points('datalad.tests')]
    modules = ensure_list(module)
    lgr.info('Starting test run for module(s): %s', modules)

    # Exception (traceback) logging is disabled by default. However, as of
    # now we do test logging output in (too) great detail. Therefore enable
    # it here, so `datalad-test` doesn't fail by default.
    # Can be removed whenever the tests don't require it.
    from datalad import cfg as dlcfg
    from datalad.tests.utils import patch
    run_opts = dict(
        verbose=verbose,
        nocapture=nocapture,
        pdb=pdb,
        stop=stop,
    )
    try:
        with patch.dict('os.environ', {'DATALAD_LOG_EXC': '1'}):
            # pick up the patched environment
            dlcfg.reload()
            for mod in modules:
                datalad.test(module=mod, **run_opts)
    finally:
        # and go back to the configuration of the unpatched environment
        dlcfg.reload()
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts

    Yields
    ------
    dict
      Result records; their 'action' is 'drop' unless the caller passed a
      different one for the annex-reported records.
    """
    # expensive, access only once
    repo = ds.repo
    kwargs.setdefault('action', 'drop')
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = ensure_list(paths)

    if not hasattr(repo, 'drop'):
        # no annex in this repository: one record per path and done
        status = 'impossible' if noannex_iserror else 'notneeded'
        for p in paths:
            rec = get_status_dict(
                status=status,
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            rec['action'] = 'drop'
            yield rec
        return

    cmd = ['drop'] if check else ['drop', '--force']

    respath_by_status = {}
    try:
        for res in repo._call_annex_records(cmd, files=paths):
            yield _postproc_result(res, respath_by_status, ds)
    except CommandError as e:
        # pick up the results captured so far and yield them
        # the error will be amongst them
        for res in e.kwargs.get('stdout_json', []):
            yield _postproc_result(res, respath_by_status, ds)

    # report on things requested that annex was silent about
    silent = results_from_annex_noinfo(
        ds, paths, respath_by_status,
        dir_fail_msg='could not drop some content in %s %s',
        noinfo_dir_msg='nothing to drop from %s',
        noinfo_file_msg="no annex'ed content",
        **kwargs)
    for rec in silent:
        rec['action'] = 'drop'
        yield rec
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts

    Yields
    ------
    dict
      Result records.
    """
    kwargs.setdefault('action', 'drop')
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = ensure_list(paths)

    if not hasattr(ds.repo, 'drop'):
        # no annex in this repository: one record per path and done
        status = 'impossible' if noannex_iserror else 'notneeded'
        for p in paths:
            rec = get_status_dict(
                status=status,
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            rec['action'] = 'drop'
            yield rec
        return

    opts = [] if check else ['--force']

    # success flag -> paths annex reported on with that outcome
    respath_by_status = {}
    for annex_rec in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            annex_rec, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status.setdefault(success, []).append(res['path'])
        yield res

    # report on things requested that annex was silent about
    silent = results_from_annex_noinfo(
        ds, paths, respath_by_status,
        dir_fail_msg='could not drop some content in %s %s',
        noinfo_dir_msg='nothing to drop from %s',
        noinfo_file_msg="no annex'ed content",
        **kwargs)
    for rec in silent:
        rec['action'] = 'drop'
        yield rec
def __call__(path=None, *, dataset=None, annex=None, untracked='normal',
             recursive=False, recursion_limit=None,
             eval_subdataset_state='full', report_filetype=None):
    # Yield one 'status' result record per reported item; error records
    # produced while sorting paths into datasets are passed on unchanged.
    if report_filetype is not None:
        warnings.warn(
            "status(report_filetype=) no longer supported, and will be removed "
            "in a future release",
            DeprecationWarning)

    # To the next white knight that comes in to re-implement `status` as a
    # special case of `diff`. There is one fundamental difference between
    # the two commands: `status` can always use the worktree as evident on
    # disk as a constraint (e.g. to figure out which subdataset a path is
    # in) `diff` cannot do that (everything need to be handled based on a
    # "virtual" representation of a dataset hierarchy).
    # MIH concludes that while `status` can be implemented as a special case
    # of `diff` doing so would complicate and slow down both `diff` and
    # `status`. So while the apparent almost code-duplication between the
    # two commands feels wrong, the benefit is speed. Any future RF should
    # come with evidence that speed does not suffer, and complexity stays
    # on a manageable level
    ds = require_dataset(
        dataset, check_installed=True, purpose='report status')
    ds_path = ds.path

    # an explicit limit wins; otherwise -1 when recursing and 0 when not
    if recursion_limit is not None:
        effective_limit = recursion_limit
    elif recursive:
        effective_limit = -1
    else:
        effective_limit = 0

    queried = set()
    content_info_cache = {}
    for res in _yield_paths_by_ds(ds, dataset, ensure_list(path)):
        if 'status' in res:
            # this is an error
            yield res
            continue
        records = yield_dataset_status(
            res['ds'],
            res['paths'],
            annex,
            untracked,
            effective_limit,
            queried,
            eval_subdataset_state,
            None,
            content_info_cache,
            reporting_order='depth-first')
        for r in records:
            r.setdefault('status', 'ok')
            yield dict(
                r,
                refds=ds_path,
                action='status',
            )
def check_integration1(login, keyring, path, organization=None, kwargs={},
                       oauthtokens=None):
    """Create a GitHub sibling for a fresh dataset, then check the error
    records reported when creating it again, and that reconfiguring works.

    `kwargs` are extra arguments for every `create_sibling_github()` call;
    the given dict is copied, not modified.
    """
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    config_patch = {}
    if oauthtokens:
        config_patch['hub.oauthtoken'] = tuple(ensure_list(oauthtokens))

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    # ATM all the github goodness does not care about "this dataset"
    # so patch the global config
    with patch_config(config_patch):
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = 'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        expected_url = url_fmt.format(
            login=login, organization=organization, repo_name=repo_name)
        assert_in_results(
            res,
            path=ds.path,
            url=expected_url,
            preexisted=False)

        # but if we rerun - should kaboom since already has this sibling:
        rerun = ds.create_sibling_github(
            repo_name, on_failure='ignore', **kwargs)
        assert_in_results(
            rerun,
            message=('already has a configured sibling "%s"', 'github'),
            status='error',
        )

        # but we can give it a new name, but it should kaboom since the
        # remote one exists already
        renamed = ds.create_sibling_github(
            repo_name, name="github2", on_failure='ignore', **kwargs)
        assert_in_results(
            renamed,
            message=('repository "%s" already exists on Github',
                     'test_integration1'),
            status='error',
        )
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # If we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
def init(self, sanity_checks=True, init_options=None):
    """Initializes the Git repository.

    Parameters
    ----------
    sanity_checks: bool, optional
      Whether to perform sanity checks during initialization if the target
      path already exists, such as that new repository is not created in
      the directory where git already tracks some files.
    init_options: list, optional
      Additional options to be appended to the `git-init` call.

    Returns
    -------
    This instance.

    Raises
    ------
    PathKnownToRepositoryError
      If the sanity check finds files under the path that are already
      known to a repository.
    """
    pathobj = self.pathobj
    path = str(pathobj)

    if not lexists(path):
        pathobj.mkdir(parents=True)
    elif sanity_checks:
        # Verify that we are not trying to initialize a new git repository
        # under a directory some files of which are already tracked by git
        # use case: https://github.com/datalad/datalad/issues/3068
        try:
            tracked, _ = self._call_git(
                ['-C', path, 'ls-files'],
                expect_fail=True,
                read_only=True,
            )
            if tracked:
                raise PathKnownToRepositoryError(
                    "Failing to initialize new repository under %s where "
                    "following files are known to a repository above: %s"
                    % (path, tracked))
        except CommandError:
            # assume that all is good -- we are not under any repo
            pass

    cmd = ['-C', path, 'init']
    cmd.extend(ensure_list(init_options))
    # everything past the three fixed items are the extra options
    extra = cmd[3:]
    lgr.debug(
        "Initialize empty Git repository at '%s'%s",
        path,
        ' %s' % extra if extra else '')

    stdout, stderr = self._call_git(
        cmd,
        # we don't want it to scream on stdout
        expect_fail=True,
        # there is no commit, and none will be made
        read_only=True)

    # after creation we need to reconsider .git path
    self.dot_git = _get_dot_git(self.pathobj, ok_missing=True)

    return self
def custom_result_renderer(res, **kwargs):
    """Print a one-line summary of a successful 'metadata' result.

    The line lists the path (relative to 'refds', if the result has one),
    the result type, the sorted metadata keys, and any tags. Results with
    a status other than 'ok', or of another action, print nothing.
    """
    if res['status'] != 'ok' or res.get('action', None) != 'metadata':
        # logging complained about this already
        return

    # list the path, available metadata keys, and tags
    if res.get('refds', None):
        path = op.relpath(res['path'], res['refds'])
    else:
        path = res['path']
    meta = res.get('metadata', {})

    path_label = ac.color_word(path, ac.BOLD)

    type_label = ''
    if 'type' in res:
        type_label = ' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))

    # a separator is only needed when there is more to list than tags
    spacer = ' ' if any(m != 'tag' for m in meta) else ''

    if meta:
        key_label = ','.join(
            k for k in sorted(meta.keys())
            if k not in ('tag', '@context', '@id'))
    elif 'metadata' in res:
        key_label = ' -'
    else:
        key_label = ' aggregated'

    tag_label = ''
    if 'tag' in meta:
        tag_label = ' [{}]'.format(','.join(ensure_list(meta['tag'])))

    ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
        path=path_label,
        type=type_label,
        spacer=spacer,
        meta=key_label,
        tags=tag_label))
def __call__(module=None, verbose=False, nocapture=False, pdb=False, stop=False):
    # Run `datalad.test()` for each requested module. Without an explicit
    # selection, 'datalad' plus every module registered under the
    # 'datalad.tests' entry point group is tested.
    if not module:
        from pkg_resources import iter_entry_points
        module = ['datalad'] + [
            ep.module_name for ep in iter_entry_points('datalad.tests')]
    modules = ensure_list(module)
    lgr.info('Starting test run for module(s): %s', modules)

    run_opts = dict(
        verbose=verbose,
        nocapture=nocapture,
        pdb=pdb,
        stop=stop,
    )
    for mod in modules:
        datalad.test(module=mod, **run_opts)
def check_integration1(login, keyring, path, organization=None, kwargs={},
                       oauthtokens=None):
    """Create a GitHub sibling for a fresh dataset, then check that
    creating it again raises ValueError, and that reconfiguring works.

    `kwargs` are extra arguments for every `create_sibling_github()` call;
    the given dict is copied, not modified.
    """
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    config_patch = {}
    if oauthtokens:
        config_patch['hub.oauthtoken'] = tuple(ensure_list(oauthtokens))

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    # ATM all the github goodness does not care about "this dataset"
    # so patch the global config
    with patch_config(config_patch):
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = 'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        expected_url = url_fmt.format(
            login=login, organization=organization, repo_name=repo_name)
        eq_(res, [(ds, expected_url, False)])

        # but if we rerun - should kaboom since already has this sibling:
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, **kwargs)
        assert_in("already has a configured sibling", str(cme.exception))

        # but we can give it a new name, but it should kaboom since the
        # remote one exists already
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, name="github2", **kwargs)
        assert_in("already exists on", str(cme.exception))
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # If we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
def is_result_matching_pathsource_argument(res, **kwargs):
    """Tell whether a result record matches the `path`/`source` arguments
    a command was called with.

    We either have any non-zero number of "paths" (that could be anything),
    or we have one path and one source. No error checking is done here,
    that is left to the command itself.

    Parameters
    ----------
    res : dict
      Result record to test.
    **kwargs
      The command's call arguments; 'source', 'path' and 'dataset' are
      inspected.

    Returns
    -------
    bool
    """
    if res.get('action', None) not in ('install', 'get'):
        # this filter is only used in install, reject anything that comes
        # in that could not possibly be a 'install'-like result
        # e.g. a sibling being added in the process
        return False

    source = kwargs.get('source', None)
    if source is not None:
        # we want to be able to deal with Dataset instances given as
        # 'source':
        if isinstance(source, Dataset):
            source = source.path
        # if there was a source, it needs to be recorded in the result
        # otherwise this is not what we are looking for
        return source == res.get('source_url', None)

    # the only thing left is a potentially heterogeneous list of paths/URLs
    paths = ensure_list(kwargs.get('path', []))
    # three cases left:
    # 1. input arg was an absolute path -> must match 'path' property
    # 2. input arg was relative to a dataset -> must match refds/relpath
    # 3. something nifti with a relative input path that uses PWD as the
    #    reference
    respath = res.get('path', None)

    if respath in paths:
        # absolute match, pretty sure we want this
        return True

    if isinstance(kwargs.get('dataset', None), Dataset) and \
            YieldRelativePaths()(res) in paths:
        # command was called with a reference dataset, and a relative
        # path of a result matches in input argument -- not 100% exhaustive
        # test, but could be good enough
        return True

    if any(robust_abspath(p) == respath for p in paths):
        # one absolutified input path matches the result path
        # I'd say: got for it!
        return True

    # this was installed from a URL that was given, we'll take that too
    return any(p == res.get('source_url', None) for p in paths)
def put(self, source, destination, recursive=False, preserve_attrs=False):
    """Copies source file/folder to destination on the remote.

    Note: this method performs escaping of filenames to an extent that
    moderately weird ones should work (spaces, quotes, pipes, other
    characters with special shell meaning), but more complicated cases
    might require appropriate external preprocessing of filenames.

    Parameters
    ----------
    source : str or list
      file/folder path(s) to copy from on local
    destination : str
      file/folder path to copy to on remote
    recursive : bool
      flag to enable recursive copying of given sources
    preserve_attrs : bool
      preserve modification times, access times, and modes from the
      original file

    Returns
    -------
    tuple of (str, str)
      stdout, stderr of the copy operation.
    """
    # make sure we have an open connection, will test if action is needed
    # by itself
    self.open()

    scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
    # add source filepath(s) to scp command
    scp_cmd.extend(ensure_list(source))
    # add destination path
    remote_target = '%s:%s' % (
        self.sshri.hostname,
        _quote_filename_for_scp(destination),
    )
    scp_cmd.append(remote_target)

    out = self.runner.run(scp_cmd, protocol=StdOutErrCapture)
    return out['stdout'], out['stderr']
def _wrap_cached_dataset(*arg, **kw):
    # Clone the dataset at `url` (via the local cache when
    # DATALAD_TESTS_CACHE is set) into the path given as the last positional
    # argument, optionally check out `version` and get `paths`, then call
    # `f` with the clone substituted for that last positional argument.
    # `url`, `version`, `paths` and `f` are not defined in this function --
    # presumably they are closed over from an enclosing decorator
    # (TODO confirm).

    if DATALAD_TESTS_CACHE:
        # Note: We can't pass keys based on `paths` parameter to
        # get_cached_dataset yet, since translation to keys depends on a
        # worktree. We'll have the worktree of `version` only after cloning.
        ds = get_cached_dataset(url, version=version)
        clone_ds = Clone()(ds.pathobj, arg[-1])
    else:
        clone_ds = Clone()(url, arg[-1])
    # save some cycles: look up the repo of the clone only once
    clone_repo = clone_ds.repo
    if version:
        clone_repo.checkout(version)
    if paths and AnnexRepo.is_valid_repo(clone_ds.path):
        # just assume ds is annex as well. Otherwise `Clone` wouldn't
        # work correctly - we don't need to test its implementation here
        if DATALAD_TESTS_CACHE:
            # cache is enabled; we need to make sure it has the desired
            # content, so clone_ds can get it from there. However, we got
            # `paths` and potentially a `version` they refer to. We can't
            # assume the same (or any) worktree in cache. Hence we need to
            # translate to keys.
            # MIH Despite the variable names used in this function
            # (pathS, keyS) they ultimately are passed to get(..., key=True)
            # which means that it can ever only be a single path and a
            # single key -- this is very confusing.
            # the key determination could hence be done with
            # get_file_annexinfo() in a much simpler way, but it seems this
            # function wants to be ready for more, sigh
            keys = [
                p['key']
                for p in clone_repo.get_content_annexinfo(
                    ensure_list(paths), init=None).values()
                if 'key' in p
            ]
            if keys:
                # fetch the content into the cached dataset by key
                ds.repo.get(keys, key=True)
            clone_repo.fsck(remote=DEFAULT_REMOTE, fast=True)

        clone_ds.get(paths)
    # all leading positional arguments are passed through untouched
    return f(*(arg[:-1] + (clone_ds, )), **kw)
def add(self, var, value, scope='branch', reload=True):
    """Add a configuration variable and value

    Parameters
    ----------
    var : str
      Variable name including any section like `git config` expects them,
      e.g. 'core.editor'
    value : str
      Variable value
    %s"""
    # NOTE(review): the trailing placeholder in the docstring is presumably
    # filled in at import time by a decorator -- keep it as the only
    # percent-sign in that text.
    if scope != 'override':
        self._run(['--add', var, value], scope=scope, reload=reload,
                  protocol=StdOutErrCapture)
        return

    from datalad.utils import ensure_list
    # overrides are kept on this instance: a single value is stored as is,
    # multiple values as a list
    values = ensure_list(self.overrides.pop(var, None))
    values.append(value)
    if len(values) == 1:
        self.overrides[var] = values[0]
    else:
        self.overrides[var] = values
    if reload:
        self.reload(force=True)
def __call__(sshurl, name=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None,
             recursive=False,
             recursion_limit=None,
             existing='error',
             shared=None,
             group=None,
             ui=False,
             as_common_datasrc=None,
             publish_by_default=None,
             publish_depends=None,
             annex_wanted=None, annex_group=None, annex_groupwanted=None,
             inherit=False,
             since=None):
    # Generator that yields one result record (action 'create_sibling') per
    # dataset it reports on. Overall flow:
    #   1. validate arguments and determine the sibling URL and name
    #   2. collect the datasets that need a sibling
    #   3. obtain a shell (SSH connection or local runner adapter)
    #   4. create the siblings top-down, then run the post-update hooks in
    #      reverse order
    #
    # nothing without a base dataset
    #
    ds = require_dataset(dataset, check_installed=True,
                         purpose='creating a sibling')
    refds_path = ds.path

    #
    # all checks that are possible before we start parsing the dataset
    #

    # possibly use sshurl to get the name in case if not specified
    if not sshurl:
        if not inherit:
            raise InsufficientArgumentsError(
                "needs at least an SSH URL, if no inherit option"
            )
        if name is None:
            raise ValueError(
                "Neither SSH URL, nor the name of sibling to inherit from "
                "was specified"
            )
        # It might well be that we already have this remote setup
        try:
            sshurl = CreateSibling._get_remote_url(ds, name)
        except Exception as exc:
            # not fatal: the URL is derived from the superdataset below
            lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
    elif inherit:
        raise ValueError(
            "For now, for clarity not allowing specifying a custom sshurl "
            "while inheriting settings"
        )
        # may be could be safely dropped -- still WiP

    if not sshurl:
        # TODO: may be more back up before _prep?
        super_ds = ds.get_superdataset()
        if not super_ds:
            raise ValueError(
                "Could not determine super dataset for %s to inherit URL"
                % ds
            )
        super_url = CreateSibling._get_remote_url(super_ds, name)
        # for now assuming hierarchical setup
        # (TODO: to be able to destinguish between the two, probably
        # needs storing datalad.*.target_dir to have %RELNAME in there)
        sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

    # check the login URL
    sibling_ri = RI(sshurl)
    ssh_sibling = is_ssh(sibling_ri)
    if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
        raise ValueError(
            "Unsupported SSH URL or path: '{0}', "
            "use ssh://host/path, host:path or path syntax".format(sshurl))

    if not name:
        name = sibling_ri.hostname if ssh_sibling else "local"
        lgr.debug(
            "No sibling name given. Using %s'%s' as sibling name",
            "URL hostname " if ssh_sibling else "",
            name)
    if since == '':
        # consider creating siblings only since the point of
        # the last update
        # XXX here we assume one to one mapping of names from local branches
        # to the remote
        active_branch = ds.repo.get_active_branch()
        since = '%s/%s' % (name, active_branch)

    #
    # parse the base dataset to find all subdatasets that need processing
    #
    to_process = []
    cand_ds = [
        Dataset(r['path'])
        for r in diff_dataset(
            ds,
            fr=since,
            to=None,
            # make explicit, but doesn't matter, no recursion in diff()
            constant_refs=True,
            # contrain to the paths of all locally existing subdatasets
            path=[
                sds['path']
                for sds in ds.subdatasets(
                    recursive=recursive,
                    recursion_limit=recursion_limit,
                    fulfilled=True,
                    result_renderer=None)
            ],
            # save cycles, we are only looking for datasets
            annex=None,
            untracked='no',
            # recursion was done faster by subdatasets()
            recursive=False,
            # save cycles, we are only looking for datasets
            eval_file_type=False,
        )
        if r.get('type') == 'dataset' and r.get('state', None) != 'clean'
    ]
    # check remotes setup
    # (without `since` the base dataset itself is always a candidate)
    for d in cand_ds if since else ([ds] + cand_ds):
        d_repo = d.repo
        if d_repo is None:
            # no repository at this path, nothing to create a sibling for
            continue
        checkds_remotes = d.repo.get_remotes()
        res = dict(
            action='create_sibling',
            path=d.path,
            type='dataset',
        )

        if publish_depends:
            # make sure dependencies are valid
            # TODO: inherit -- we might want to automagically create
            # those dependents as well???
            unknown_deps = set(ensure_list(publish_depends)).difference(checkds_remotes)
            if unknown_deps:
                yield dict(
                    res,
                    status='error',
                    message=('unknown sibling(s) specified as publication '
                             'dependency: %s', unknown_deps),
                )
                continue
        if name in checkds_remotes and existing in ('error', 'skip'):
            # NOTE(review): the message below lacks its closing parenthesis
            yield dict(
                res,
                status='error' if existing == 'error' else 'notneeded',
                message=(
                    "sibling '%s' already configured (specify alternative "
                    "name, or force reconfiguration via --existing", name),
            )
            continue
        to_process.append(res)

    if not to_process:
        # we ruled out all possibilities
        # TODO wait for gh-1218 and make better return values
        lgr.info("No datasets qualify for sibling creation. "
                 "Consider different settings for --existing "
                 "or --since if this is unexpected")
        return

    if ssh_sibling:
        # request ssh connection:
        lgr.info("Connecting ...")
        shell = ssh_manager.get_connection(sshurl)
    else:
        shell = _RunnerAdapter()
        sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
        if target_dir:
            target_dir = opj(sibling_ri.path, target_dir)

    if target_dir is None:
        # fall back on the path of the sibling URL, or the current directory
        if sibling_ri.path:
            target_dir = sibling_ri.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = "%RELNAME" not in target_dir

    if not shell.get_annex_version():
        raise MissingExternalDependency(
            'git-annex',
            msg="It's required on the {} machine to create a sibling".
                format('remote' if ssh_sibling else 'local'))

    #
    # all checks done and we have a connection, now do something
    #

    # loop over all datasets, ordered from top to bottom to make test
    # below valid (existing directories would cause the machinery to halt)
    # But we need to run post-update hook in depth-first fashion, so
    # would only collect first and then run (see gh #790)
    yielded = set()
    remote_repos_to_run_hook_for = []
    for currentds_ap in \
            sorted(to_process, key=lambda x: x['path'].count('/')):
        current_ds = Dataset(currentds_ap['path'])

        path = _create_dataset_sibling(
            name,
            current_ds,
            refds_path,
            shell,
            replicate_local_structure,
            sibling_ri,
            target_dir,
            target_url,
            target_pushurl,
            existing,
            shared,
            group,
            publish_depends,
            publish_by_default,
            ui,
            as_common_datasrc,
            annex_wanted,
            annex_group,
            annex_groupwanted,
            inherit)
        if not path:
            # nothing new was created
            # TODO is 'notneeded' appropriate in this case?
            currentds_ap['status'] = 'notneeded'
            # TODO explain status in 'message'
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        remote_repos_to_run_hook_for.append((path, currentds_ap))

        # publish web-interface to root dataset on publication server
        if current_ds.path == refds_path and ui:
            lgr.info("Uploading web interface to %s" % path)
            try:
                CreateSibling.upload_web_interface(path, shell, shared, ui)
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to push web interface to the remote datalad repository (%s)",
                    exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue

    # in reverse order would be depth first
    lgr.info("Running post-update hooks in all created siblings")
    # TODO: add progressbar
    for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        lgr.debug("Running hook for %s (if exists and executable)", path)
        try:
            shell("cd {} "
                  "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                  "".format(sh_quote(_path_(path, ".git"))))
        except CommandError as e:
            currentds_ap['status'] = 'error'
            currentds_ap['message'] = (
                "failed to run post-update hook under remote path %s (%s)",
                path, exc_str(e))
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        # `yielded` keeps datasets from being reported twice
        if not currentds_ap['path'] in yielded:
            # if we were silent until now everything is just splendid
            currentds_ap['status'] = 'ok'
            yield currentds_ap
def __call__(dataset, filename='README.md', existing='skip'):
    # Generator: write a README built from the dataset's own metadata to
    # `filename` inside the dataset, save it, and yield result records
    # (action 'add_readme', followed by whatever `save()` reports).
    # `existing` is compared against 'skip' (leave an existing file alone)
    # and 'append' (append to it); with any other value the file is
    # overwritten.
    from os.path import lexists
    from os.path import join as opj
    from io import open
    import logging
    lgr = logging.getLogger('datalad.plugin.add_readme')

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import ensure_list

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='add README')

    filename = opj(dataset.path, filename)
    res_kwargs = dict(action='add_readme', path=filename)

    if lexists(filename) and existing == 'skip':
        yield dict(
            res_kwargs,
            status='notneeded',
            message='file already exists, and not appending content')
        return

    # unlock, file could be annexed
    if lexists(filename):
        dataset.unlock(filename)

    # get any metadata on the dataset itself
    dsinfo = dataset.metadata(
        '.', reporton='datasets', return_type='item-or-list',
        on_failure='ignore')
    meta = {}
    if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
        lgr.warning("Could not obtain dataset metadata, proceeding without")
        dsinfo = {}
    else:
        # flatten possibly existing multiple metadata sources
        for src in dsinfo['metadata']:
            if src.startswith('@'):
                # not a source
                continue
            meta.update(dsinfo['metadata'][src])

    # assemble the optional README sections; the entry with an empty label
    # is emitted without a heading
    metainfo = ''
    for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format('s' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([u'- {}'.format(a) for a in ensure_list(meta.get('author', []))])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([u'`{}`'.format(k) for k in ensure_list(meta.get('tag', []))])),
            ('Funding', meta.get('fundedby', '')),
            ):
        if label and content:
            metainfo += u'\n\n### {}\n\n{}'.format(label, content)
        elif content:
            metainfo += u'\n\n{}'.format(content)

    # use the first of these keys that is present as the title
    for key in 'title', 'name', 'shortdescription':
        if 'title' in meta:
            break
        if key in meta:
            meta['title'] = meta[key]

    default_content=u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://handbook.datalad.org
""".format(
        title='Dataset "{}"'.format(meta['title']) if 'title' in meta else 'About this dataset',
        metainfo=metainfo,
        id=u' (id: {})'.format(dataset.id) if dataset.id else '',
    )

    with open(filename, 'a' if existing == 'append' else 'w', encoding='utf-8') as fp:
        fp.write(default_content)
        yield dict(
            status='ok',
            path=filename,
            type='file',
            action='add_readme')

    # record the new/changed README in the dataset and pass on the results
    for r in dataset.save(
            filename,
            message='[DATALAD] added README',
            result_filter=None,
            result_xfm=None):
        yield r
def __call__(
        path=None,
        *,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        update_mode='target',
        incremental=False,
        force_extraction=False,
        save=True):
    """Aggregate metadata of datasets/paths into a reference dataset.

    This is a generator of result records. It works in three phases:

    1. For every annotated path decide which dataset is the aggregation
       source, and either extract fresh metadata from it, or (for absent
       datasets) look up previously aggregated metadata in the parent.
    2. Update the aggregate metadata in every dataset selected by
       `update_mode`, bottom-up.
    3. Save all files that were queued in ``to_save``.

    Parameters
    ----------
    path : str or list or None
      Paths to aggregate from. When empty, the reference dataset itself is
      used.
    dataset
      Reference dataset specification, handed to ``require_dataset()``.
    recursive, recursion_limit
      Passed on to the path annotation helper.
    update_mode : {'target', 'all'}
      ``'target'`` updates only the reference dataset; ``'all'`` updates all
      datasets on the trace from the reference dataset to each aggregated
      dataset.
    incremental
      Passed on to ``_update_ds_agginfo()``.
    force_extraction
      Passed on to ``_dump_extracted_metadata()``.
    save
      NOTE(review): this parameter is not consulted anywhere in this body;
      queued modifications are saved regardless of its value.

    Raises
    ------
    ValueError
      If `update_mode` is neither ``'all'`` nor ``'target'``. Note that this
      is only detected after the extraction phase has run.
    """
    # resolved once and reused for path sorting and the final save
    refds = require_dataset(dataset)

    # it really doesn't work without a dataset
    ds = require_dataset(
        dataset, check_installed=True, purpose='metadata aggregation')
    path = ensure_list(path)
    if not path:
        # then current/reference dataset is "aggregated"
        # We should not add ds.path always since then --recursive would
        # also recurse current even if paths are given.
        # Rebind rather than append, so that a list object owned by the
        # caller is never modified.
        path = [ds.path]

    # only the base path is needed here, the DB location is not used
    _, agg_base_path = get_ds_aggregate_db_locations(
        ds,
        # do not warn here, next call triggers the same warning
        warn_absent=False)
    agginfo_db = load_ds_aggregate_db(ds, abspath=True)

    to_save = []
    to_aggregate = set()
    paths_by_ds, errors = get_paths_by_ds(
        refds,
        dataset,
        paths=path,
        subdsroot_mode='super')
    for ap in _minimal_annotate_paths(
            paths_by_ds,
            errors,
            action='aggregate_metadata',
            recursive=recursive,
            recursion_limit=recursion_limit):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        ap_type = ap.get('type', None)
        ap_state = ap.get('state', None)
        assert('parentds' in ap or ap_type == 'dataset')
        if ap_type == 'dataset' and ap_state != 'absent':
            # a present dataset, we can take directly from it
            aggsrc = ap['path']
            lgr.info('Aggregate metadata for dataset %s', aggsrc)
        else:
            # everything else needs to come from the parent
            aggsrc = ap['parentds']
            if ap_state == 'absent':
                lgr.info(
                    'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                    ap['path'],
                    aggsrc)
            else:
                lgr.info(
                    'Aggregate metadata for %s from dataset at %s',
                    ap['path'],
                    aggsrc)

        to_aggregate.add(aggsrc)

        if ap_state == 'absent':
            # key thought: recursive is done by path annotation, hence
            # once we hit an absent dataset, we are 100% certain that
            # there is nothing to recurse into on the file system
            # hence we only have to look into the aggregated metadata
            # of the last available dataset in the dataset tree edge
            #
            # if there is nothing at this path, we need to look into the
            # parentds and check if we know anything about this path
            # if we do, we need to grab all the info and objects
            # if not, we need to error
            res = _get_dsinfo_from_aggmetadata(
                aggsrc, ap['path'], recursive, agginfo_db)
            if not isinstance(res, list):
                # anything but a list is used as the failure message
                yield get_status_dict(
                    status='impossible',
                    message=res,
                    action='aggregate_metadata',
                    path=ap['path'],
                    logger=lgr)
                continue
            # cue for aggregation
            to_aggregate.update(res)
        else:
            # actually aggregate metadata for this dataset, immediately place
            # generated objects into the aggregated or reference dataset,
            # and put info into DB to get the distributed to all datasets
            # that need to be updated
            errored = _dump_extracted_metadata(
                ds,
                Dataset(aggsrc),
                agginfo_db,
                to_save,
                force_extraction,
                agg_base_path)
            if errored:
                yield get_status_dict(
                    status='error',
                    message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                    action='aggregate_metadata',
                    path=aggsrc,
                    logger=lgr)

    # at this point we have dumped all aggregated metadata into object files
    # somewhere, we know what needs saving, but having saved anything, and
    # we know about the states of all aggregated dataset in the DB
    # what remains to do is to update all dataset, so they have there own copy
    # of aggregated metadata and update their respective aggregate.json with
    # info on what states we just aggregated from

    # first, let's figure out what dataset need updating at all
    # get adjencency info of the dataset tree spanning the base to all leaf dataset
    # associated with the path arguments
    if update_mode == 'all':
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about dataset that we only got from
        # aggregated metadata, that had no trace on the file system in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
    elif update_mode == 'target':
        subtrees = {ds.path: list(agginfo_db.keys())}
    else:
        # exceptions do not interpolate extra arguments the way logging
        # calls do, so the message has to be formatted explicitly
        raise ValueError(
            "unknown `update_mode` '%s' for metadata aggregation"
            % (update_mode,))

    # go over datasets in bottom-up fashion
    for parentds_path in sorted(subtrees, reverse=True):
        lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

        _update_ds_agginfo(
            ds.path,
            parentds_path,
            subtrees[parentds_path],
            incremental,
            agginfo_db,
            to_save)
        # update complete
        res = get_status_dict(
            status='ok',
            action='aggregate_metadata',
            path=parentds_path,
            type='dataset',
            logger=lgr)
        res.update(agginfo_db.get(parentds_path, {}))
        yield res
    #
    # save potential modifications to dataset global metadata
    #
    if not to_save:
        return
    lgr.info('Attempting to save %i files/datasets', len(to_save))
    yield from Save.__call__(
        # save does not need any pre-annotated path hints
        path=[r['path'] for r in to_save],
        dataset=refds,
        message='[DATALAD] Dataset aggregate metadata update',
        return_type='generator',
        result_renderer='disabled',
        result_xfm=None,
        result_filter=None,
        on_failure='ignore')
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
                      refcommit, subds_relpaths, agg_base_path):
    """Extract metadata from a dataset and dump it into object files.

    Parameters
    ----------
    agginto_ds
      Not used in this function body.
    aggfrom_ds
      Dataset to extract from. Its ``path`` is the key of the record in `db`.
    db : dict
      Aggregate info DB. The record for `aggfrom_ds` is updated and stored
      back.
    to_save : list
      A ``dict(path=..., type='file')`` is appended for each written object
      file.
    objid
      Passed to ``_get_obj_location()`` to build the object file location.
    metasources : dict
      Maps a label (``'ds'`` or ``'cn'``) to a dict with the keys
      ``'targetds'``, ``'dumper'`` and ``'type'``.
    refcommit
      Commit that is being described. It is recorded in the core metadata.
    subds_relpaths
      Passed to ``_get_metadatarelevant_paths()``.
    agg_base_path : str
      Joined between the target dataset path and the object location.

    Returns
    -------
    The ``errored`` value reported by ``_get_metadata()``.
    """
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # start from the known record (if any); conflicting info on this dataset
    # gets replaced with fresh stuff
    record = db.get(aggfrom_ds.path, {})
    # paths to extract from
    extract_paths = sorted(
        _get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # extractors to engage: the two built-in ones plus whatever the source
    # dataset has configured
    extractors = ['datalad_core', 'annex']
    extractors = extractors + ensure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    record['extractors'] = extractors
    record['datalad_version'] = datalad.__version__

    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        extractors,
        # None indicates to honor a datasets per-extractor configuration and
        # to be on by default
        global_meta=None,
        content_meta=None,
        paths=extract_paths)

    # content metadata stays a lazy generator that is consumed by the dumper
    content_records = (
        dict(contentmeta[k], path=k) for k in sorted(contentmeta))
    payload = {'ds': dsmeta, 'cn': content_records}

    # inject the info which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    core = dsmeta.get('datalad_core', {})
    described = aggfrom_ds.repo.describe(commitish=refcommit)
    if described:
        core['version'] = described
    core['refcommit'] = refcommit
    dsmeta['datalad_core'] = core

    # for both types of metadata
    for label, props in metasources.items():
        target = props['targetds']
        # only write to disk if there is something
        # NOTE(review): the 'cn' payload is a generator object, which is
        # always truthy, so this check can only ever skip the 'ds' payload
        if payload[label]:
            location = _get_obj_location(objid, label, props['dumper'])
            # place metadata object into the target dataset
            objfile = op.join(target.path, agg_base_path, location)

            # make room for the new object file
            if op.exists(objfile):
                target.unlock(objfile)
            elif op.lexists(objfile):
                # if it gets here, we have a symlink that is pointing nowhere
                # kill it, to be replaced with the newly aggregated content
                target.repo.remove(objfile)
            # TODO actually dump a compressed file when annexing is possible
            # to speed up on-demand access
            props['dumper'](payload[label], objfile)
            # stage for dataset.save()
            to_save.append(dict(path=objfile, type='file'))

            # important to use abspath here, needs to be rewritten relative
            # to all receiving datasets
            record['{}_info'.format(props['type'])] = objfile

    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = record
    return errored
def __call__(
        path=None,
        initopts=None,
        *,
        force=False,
        description=None,
        dataset=None,
        annex=True,
        fake_dates=False,
        cfg_proc=None):
    """Create a new dataset, optionally as a subdataset of `dataset`.

    This is a generator of result records. On any detected conflict a single
    ``error`` record is yielded and nothing is created. On success the
    results of the requested configuration procedures and of registering
    the new dataset in the reference dataset (if any) are yielded, followed
    by an ``ok`` record for the created dataset.

    Parameters
    ----------
    path
      Location of the new dataset. When not given, the reference dataset
      path is used, or the current directory if there is no `dataset`.
    initopts : list, tuple, dict or None
      Repository initialization options. A list is wrapped into
      ``{'_from_cmdline_': initopts}`` before it is passed on.
    force : bool
      Create even in a non-empty directory, and skip the check for a
      collision with a known subdataset of the parent dataset.
    description
      Passed to the annex repository setup; incompatible with
      ``annex=False``.
    dataset
      Reference dataset (``Dataset`` instance or something ``Dataset()``
      accepts).
    annex : bool
      Whether to set up an annex repository, or a plain Git repository.
    fake_dates
      Passed on to the repository setup helpers.
    cfg_proc : str or list or None
      Names of configuration procedures (without the ``cfg_`` prefix) to
      run on the new dataset.

    Raises
    ------
    ValueError
      For a `description` combined with ``annex=False``, for a request to
      create a bare repository, and for a `cfg_proc` entry that cannot be
      discovered.
    """
    # we only perform negative tests below
    no_annex = not annex

    if dataset:
        if isinstance(dataset, Dataset):
            ds = dataset
        else:
            ds = Dataset(dataset)
        refds_path = ds.path
    else:
        ds = refds_path = None

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")

    if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
            isinstance(initopts, dict) and 'bare' in initopts):
        raise ValueError(
            "Creation of bare repositories is not supported. Consider "
            "one of the create-sibling commands, or use "
            "Git to init a bare repository and push an existing dataset "
            "into it.")

    if path:
        path = resolve_path(path, dataset)

    path = path if path \
        else getpwd() if ds is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert (path is not None)

    # assure cfg_proc is a list (relevant if used via Python API)
    cfg_proc = ensure_list(cfg_proc)

    # prep for yield
    res = dict(action='create', path=str(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != str(path):
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='create a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    ds, str(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = get_dataset_root(
        op.normpath(op.join(str(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if (not pstatus.get(check_path, {}).get("type") == "dataset" and
                any(check_path == p or check_path in p.parents
                    for p in pstatus)):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents
            ]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    str(parentds_path),
                    [str(c) for c in conflict])
            })
            yield res
            return
        if not force:
            # another set of check to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in pstatus.items()
                if v.get('type', None) == 'dataset'
            }
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        str(conflict[0]),
                        str(parentds_path))
                })
                yield res
                return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    # Compare as strings: `path` is wrapped in str()/Path() throughout this
    # function, i.e. it is not necessarily a str here, and a str never
    # compares equal to a path object -- which would defeat the reuse.
    tbds = ds if isinstance(ds, Dataset) and \
        ds.path == str(path) else Dataset(str(path))

    # don't create in non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`--force` option to ignore'
        })
        yield res
        return

    # Check if specified cfg_proc(s) can be discovered, storing
    # the results so they can be used when the time comes to run
    # the procedure. If a procedure cannot be found, raise an
    # error to prevent creating the dataset.
    cfg_proc_specs = []
    if cfg_proc:
        # Materialize the discovery results: they are scanned from the start
        # once per requested procedure. A generator would be partially
        # consumed by the first lookup (which stops at its match), hiding
        # earlier records from any subsequent lookup.
        discovered_procs = list(tbds.run_procedure(
            discover=True,
            result_renderer='disabled',
            return_type='generator',
        ))
        for cfg_proc_ in cfg_proc:
            for discovered_proc in discovered_procs:
                if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                    cfg_proc_specs.append(discovered_proc)
                    break
            else:
                raise ValueError("Cannot find procedure with name "
                                 "'%s'" % cfg_proc_)

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # Note for the code below:
    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    # Re-use tbrepo instance, do not use tbds.repo

    # create and configure desired repository
    # also provides initial set of content to be tracked with git (not annex)
    if no_annex:
        tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
    else:
        tbrepo, add_to_git = _setup_annex_repo(
            path, initopts, fake_dates, description)

    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    # Note, must not happen earlier (before if) since "smart" it would not be
    tbds_config = tbds.config

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note, that Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds_config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds_config.unset(id_var, scope='branch')

    if _seed is None:
        # just the standard way
        # use a fully random identifier (i.e. UUID version 4)
        uuid_id = str(uuid.uuid4())
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    # an ID that was already present is kept, the new one is only a fallback
    tbds_config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        scope='branch',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequence ds.config.add() call)
    for k, v in tbds_config.overrides.items():
        tbds_config.add(k, v, scope='local', reload=False)

    # all config manipulation is done -> full reload
    tbds_config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbrepo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'
    }

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbrepo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and the is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    for cfg_proc_spec in cfg_proc_specs:
        yield from tbds.run_procedure(
            cfg_proc_spec,
            result_renderer='disabled',
            return_type='generator',
        )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(refds, Dataset) and refds.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        yield from refds.save(
            path=tbds.path,
            return_type='generator',
            result_renderer='disabled',
        )

    res.update({'status': 'ok'})
    yield res
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None):
    """Unlock annexed files that have content present.

    This is a generator of result records. Explicitly given paths that do
    not exist are reported as ``impossible``. Explicitly given non-directory
    paths that cannot be unlocked get a dedicated ``impossible`` or
    ``notneeded`` record. Each file handed to ``annex unlock`` is reported
    as ``ok`` or ``error``, depending on the ``success`` flag of its annex
    record.

    Parameters
    ----------
    path : str or list or None
      Paths to unlock. Without any path the status query is not limited.
    dataset
      Reference dataset specification. It is passed to ``status()`` as
      given.
    recursive, recursion_limit
      Passed on to ``status()``.
    """
    refds = require_dataset(dataset, check_installed=True, purpose="unlock")

    # Before passing the results to status()
    #   * record explicitly specified non-directory paths so that we can
    #     decide whether to yield a result for reported paths
    #   * filter out and yield results for paths that don't exist
    nondir_resolved = set()
    existing_inputs = None
    resolved = list()
    if path:
        # Note, that we need unresolved versions of the path input to be
        # passed on to status. See gh-5456 for example.
        path = ensure_list(path)
        resolved = resolve_path(path, ds=dataset)
        existing_inputs = []
        existing_resolved = []
        for given, full in zip(path, resolved):
            if full.exists() or full.is_symlink():
                existing_inputs.append(given)
                existing_resolved.append(full)
            if not full.is_dir():
                nondir_resolved.add(full)

    common = dict(action='unlock', logger=lgr, refds=refds.path)
    if resolved:
        missing = set(resolved).difference(set(existing_resolved))
        for gone in missing:
            yield get_status_dict(
                status="impossible",
                path=str(gone),
                type="file",
                message="path does not exist",
                **common)
    # paths were given, but none of them exists -> nothing left to do
    if existing_inputs is not None and not existing_inputs:
        return

    # Collect information on the paths to unlock.
    per_dataset = defaultdict(list)  # ds => paths (relative to ds)
    untracked_mode = "normal" if nondir_resolved else "no"
    status_results = Status()(
        # ATTN: it is vital to pass the `dataset` argument as it,
        # and not a dataset instance in order to maintain the path
        # semantics between here and the status() call
        dataset=dataset,
        path=existing_inputs,
        untracked=untracked_mode,
        report_filetype=False,
        annex="availability",
        recursive=recursive,
        recursion_limit=recursion_limit,
        result_renderer='disabled',
        on_failure="ignore")
    for res in status_results:
        if not (res["action"] == "status" and res["status"] == "ok"):
            # not a usable status report, pass it on untouched
            yield res
            continue
        has_content = res.get("has_content")
        if has_content:
            parent = res["parentds"]
            per_dataset[parent].append(op.relpath(res["path"], parent))
            continue
        if not nondir_resolved or Path(res["path"]) not in nondir_resolved:
            # only explicitly given non-directory paths get a report
            continue
        if has_content is False:
            reason, status = "no content present", "impossible"
        elif res["state"] == "untracked":
            reason, status = "untracked", "impossible"
        else:
            # This is either a regular git file or an unlocked annex
            # file.
            reason, status = "non-annex file", "notneeded"
        yield get_status_dict(
            status=status,
            path=res["path"],
            type="file",
            message="{}; cannot unlock".format(reason),
            **common)

    # Do the actual unlocking.
    for ds_path, relpaths in per_dataset.items():
        ds = Dataset(ds_path)
        records = ds.repo._call_annex_records(["unlock"], files=relpaths)
        for rec in records:
            yield get_status_dict(
                path=op.join(ds.path, rec['file']),
                status='ok' if rec['success'] else 'error',
                type='file',
                **common)
def __call__(dataset, filename='README.md', existing='skip'):
    """Write a README with dataset information and usage hints, and save it.

    This is a generator of result records. When the target file is already
    present and `existing` is ``'skip'`` a single ``notneeded`` record is
    yielded. Otherwise an ``ok`` record for the written file is yielded,
    followed by everything ``dataset.save()`` reports.

    When the file does not exist yet, the dataset has an annex repository,
    and no ``annex.largefiles`` attribute is set for `filename`, the
    attribute is set to ``nothing`` and ``.gitattributes`` is saved before
    the README is written.

    Parameters
    ----------
    dataset
      Whatever ``require_dataset()`` accepts; it is called with
      ``check_installed=True``.
    filename : str
      README location; it is joined onto the dataset path.
    existing : str
      ``'skip'`` leaves an existing file untouched, ``'append'`` opens the
      file in append mode, any other value opens it for (over)writing.
    """
    import logging
    from io import open
    from os.path import join as opj
    from os.path import lexists

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import ensure_list

    lgr = logging.getLogger('datalad.local.add_readme')

    ds = require_dataset(dataset, check_installed=True, purpose='add README')
    fpath = opj(ds.path, filename)

    already_there = lexists(fpath)
    if already_there and existing == 'skip':
        yield dict(
            action='add_readme',
            path=fpath,
            status='notneeded',
            message='file already exists, and not appending content')
        return

    if already_there:
        # the file could be annexed, hence unlock before writing to it
        ds.unlock(fpath)
    if not lexists(fpath):
        # if we have an annex repo, shall the README go to Git or annex?
        repo = ds.repo
        attrs_known = isinstance(repo, AnnexRepo) and \
            'annex.largefiles' in \
            repo.get_gitattributes(filename).get(filename, {})
        if isinstance(repo, AnnexRepo) and not attrs_known:
            # configure the README to go into Git
            repo.set_gitattributes(
                [(filename, {'annex.largefiles': 'nothing'})])
            ds.save(
                path='.gitattributes',
                message="[DATALAD] Configure README to be in Git",
                to_git=True
            )

    # query metadata on the dataset itself
    report = ds.metadata(
        '.', reporton='datasets', return_type='item-or-list',
        on_failure='ignore')
    meta = {}
    if isinstance(report, dict) and report.get('status', None) == 'ok':
        # merge all metadata sources into one flat dict;
        # keys starting with '@' are not sources
        sources = report['metadata']
        for name in sources:
            if name.startswith('@'):
                continue
            meta.update(sources[name])
    else:
        lgr.warning("Could not obtain dataset metadata, proceeding without")

    author_label = \
        'Authors' if isinstance(meta.get('author', None), list) else 'Author'
    author_lines = [
        u'- {}'.format(a) for a in ensure_list(meta.get('author', []))]
    keyword_items = [
        u'`{}`'.format(k) for k in ensure_list(meta.get('tag', []))]
    sections = (
        ('', meta.get('description', meta.get('shortdescription', ''))),
        (author_label, u'\n'.join(author_lines)),
        ('Homepage', meta.get('homepage', '')),
        ('Reference', meta.get('citation', '')),
        ('License', meta.get('license', '')),
        ('Keywords', u', '.join(keyword_items)),
        ('Funding', meta.get('fundedby', '')),
    )
    chunks = []
    for label, content in sections:
        if not content:
            # nothing to say for this section
            continue
        if label:
            chunks.append(u'\n\n### {}\n\n{}'.format(label, content))
        else:
            chunks.append(u'\n\n{}'.format(content))
    metainfo = ''.join(chunks)

    # pick the heading: an explicit title wins, then name, then the
    # short description, else a generic heading
    heading = 'About this dataset'
    for key in ('title', 'name', 'shortdescription'):
        if key in meta:
            heading = 'Dataset "{}"'.format(meta[key])
            break

    ds_id = ds.id
    id_note = u' (id: {})'.format(ds_id) if ds_id else ''

    template = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

## DataLad datasets and how to use them

This repository is a [DataLad](https://www.datalad.org/) dataset. It provides
fine-grained data access down to the level of individual files, and allows for
tracking future updates. In order to use this repository for data retrieval,
[DataLad](https://www.datalad.org/) is required. It is a free and open source
command line tool, available for all major operating systems, and builds up on
Git and [git-annex](https://git-annex.branchable.com/) to allow sharing,
synchronizing, and version controlling collections of large files.

More information on how to install DataLad and [how to install](http://handbook.datalad.org/en/latest/intro/installation.html)
it can be found in the [DataLad Handbook](https://handbook.datalad.org/en/latest/index.html).

### Get the dataset

A DataLad dataset can be `cloned` by running

```
datalad clone <url>
```

Once a dataset is cloned, it is a light-weight directory on your local machine.
At this point, it contains only small metadata and information on the identity
of the files in the dataset, but not actual *content* of the (sometimes large)
data files.

### Retrieve dataset content

After cloning a dataset, you can retrieve file contents by running

```
datalad get <path/to/directory/or/file>
```

This command will trigger a download of the files, directories, or subdatasets
you have specified.

DataLad datasets can contain other datasets, so called *subdatasets*. If you
clone the top-level dataset, subdatasets do not yet contain metadata and
information on the identity of files, but appear to be empty directories. In
order to retrieve file availability metadata in subdatasets, run

```
datalad get -n <path/to/subdataset>
```

Afterwards, you can browse the retrieved metadata to find out about subdataset
contents, and retrieve individual files with `datalad get`. If you use
`datalad get <path/to/subdataset>`, all contents of the subdataset will be
downloaded at once.

### Stay up-to-date

DataLad datasets can be updated. The command `datalad update` will *fetch*
updates and store them on a different branch (by default
`remotes/origin/master`). Running

```
datalad update --merge
```

will *pull* available updates and integrate them in one go.

### Find out what has been done

DataLad datasets contain their history in the ``git log``. By running
``git log`` (or a tool that displays Git history) in the dataset or on specific
files, you can find out what has been done to the dataset or to individual
files by whom, and when.
"""
    default_content = template.format(
        title=heading,
        metainfo=metainfo,
        id=id_note,
    )

    mode = 'a' if existing == 'append' else 'w'
    with open(fpath, mode, encoding='utf-8') as fp:
        fp.write(default_content)
    yield dict(
        status='ok',
        path=fpath,
        type='file',
        action='add_readme')

    yield from ds.save(
        fpath,
        message='[DATALAD] added README',
        result_filter=None,
        result_xfm=None)
def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
        jobs=None,
        amend=False,
        ):
    """Save the state of a dataset (hierarchy).

    This is a generator of result records. ``status()`` is used to discover
    what needs saving, grouped by containing dataset. Each dataset is then
    saved by the nested ``save_ds()`` helper, which yields the per-path
    results of the repository save, followed by one record on the dataset
    itself.

    Parameters
    ----------
    path : str or list or None
      Paths to limit the save to.
    message : str or None
      Commit message.
    dataset
      Reference dataset specification. It is passed to ``status()`` as
      given. When given, the reference dataset is always included in the
      set of datasets to process.
    version_tag
      When set, every saved dataset is tagged with ``str(version_tag)``.
    recursive, recursion_limit
      Passed on to ``status()``.
    updated : bool
      When true, untracked content is not considered.
    message_file
      File to read the commit message from.
    to_git
      Passed on as ``git`` to the repository save, but only for
      repositories that have an ``annexstatus`` attribute.
    jobs
      Passed on to ``ProducerConsumerProgressLog``.
    amend : bool
      Passed on to the repository save.

    Raises
    ------
    ValueError
      If both `message` and `message_file` are given, or if `amend` is
      combined with `recursive`.
    """
    if message and message_file:
        raise ValueError(
            "Both a message and message file were specified for save()")

    if amend and recursive:
        raise ValueError("Cannot amend a commit recursively.")

    path = ensure_list(path)

    if message_file:
        with open(message_file) as mfh:
            message = mfh.read()

    # we want 'normal' to achieve the most compact argument list
    # for git calls
    # untracked_mode = 'no' if updated else 'normal'
    # TODO however, Repo.add() would refuse to add any dotfiles
    # in a directory that is itself untracked, hence the only
    # choice is to go with potentially crazy long lists
    # until https://github.com/datalad/datalad/issues/1454
    # has a resolution
    untracked_mode = 'no' if updated else 'all'

    # there are three basic scenarios:
    # 1. save modifications to any already tracked content
    # 2. save any content (including removal of deleted content)
    #    to bring things to a clean state
    # 3. like (2), but only operate on a given subset of content
    #    identified by paths
    # - all three have to work in conjunction with --recursive
    # - the difference between (1) and (2) should be no more
    #   that a switch from --untracked=no to --untracked=all
    #   in Repo.save()

    # we do not support
    # - simultaneous operations on multiple datasets from disjoint
    #   dataset hierarchies, hence a single reference dataset must be
    #   identifiable from the either
    #   - curdir or
    #   - the `dataset` argument.
    #   This avoids complex annotation loops and hierarchy tracking.
    # - any modification upwards from the root dataset

    ds = require_dataset(dataset, check_installed=True, purpose='saving')

    # use status() to do all discovery and annotation of paths
    paths_by_ds = {}
    for s in Status()(
            # ATTN: it is vital to pass the `dataset` argument as it,
            # and not a dataset instance in order to maintain the path
            # semantics between here and the status() call
            dataset=dataset,
            path=path,
            untracked=untracked_mode,
            report_filetype=False,
            recursive=recursive,
            recursion_limit=recursion_limit,
            on_failure='ignore',
            # for save without recursion only commit matters
            eval_subdataset_state='full' if recursive else 'commit',
            result_renderer='disabled'):
        if s['status'] == 'error':
            # Downstream code can't do anything with these. Let the caller
            # decide their fate.
            yield s
            continue

        # fish out status dict for this parent dataset
        ds_status = paths_by_ds.get(s['parentds'], {})
        # reassemble path status info as repo.status() would have made it
        ds_status[ut.Path(s['path'])] = \
            {k: v for k, v in s.items()
             if k not in (
                 'path', 'parentds', 'refds', 'status', 'action',
                 'logger')}
        paths_by_ds[s['parentds']] = ds_status

    lgr.debug('Determined %i datasets for saving from input arguments',
              len(paths_by_ds))
    # figure out what datasets to process, start with the ones containing
    # the paths that were given as arguments
    discovered_datasets = list(paths_by_ds.keys())
    if dataset:
        # if a reference dataset was given we want to save all the way up
        # to it, so let's throw it into the mix
        discovered_datasets.append(ds.path)
    # sort the datasets into (potentially) disjoint hierarchies,
    # or a single one, if a reference dataset was given
    dataset_hierarchies = get_tree_roots(discovered_datasets)
    for rootds, children in dataset_hierarchies.items():
        edges = {}
        discover_dataset_trace_to_targets(
            rootds, children, [], edges, includeds=children)
        for superds, subdss in edges.items():
            superds_status = paths_by_ds.get(superds, {})
            for subds in subdss:
                subds_path = ut.Path(subds)
                sub_status = superds_status.get(subds_path, {})
                if not (sub_status.get("state") == "clean" and
                        sub_status.get("type") == "dataset"):
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[subds_path] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
            paths_by_ds[superds] = superds_status

    def save_ds(args, version_tag=None):
        """Save a single dataset and report on it.

        `args` is a ``(dataset path, status dict)`` pair, the status dict
        maps paths to their status properties. Yields the results of the
        repository save, followed by one record on the dataset itself (two
        records, if the save worked, but tagging failed).
        """
        pdspath, paths = args

        pds = Dataset(pdspath)
        pds_repo = pds.repo
        # pop status for this dataset, we are not coming back to it
        pds_status = {
            # for handing over to the low-level code, we recode any
            # path relative to the real repo location, this avoid
            # cumbersome symlink handling without context in the
            # lower levels
            pds_repo.pathobj / p.relative_to(pdspath): props
            for p, props in paths.items()}
        start_commit = pds_repo.get_hexsha()
        if not all(p['state'] == 'clean' for p in pds_status.values()) or \
                (amend and message):
            for res in pds_repo.save_(
                    message=message,
                    # make sure to have the `path` arg be None, as we want
                    # to prevent and bypass any additional repo.status()
                    # calls
                    paths=None,
                    # prevent whining of GitRepo
                    # The decision must be made for the repository that is
                    # actually being saved, not for the one of the reference
                    # dataset: in a hierarchy the two need not be of the
                    # same kind.
                    git=True if not hasattr(pds_repo, 'annexstatus')
                    else to_git,
                    # we are supplying the full status already, do not
                    # detect anything else
                    untracked='no',
                    _status=pds_status,
                    amend=amend):
                # TODO remove stringification when datalad-core can handle
                # path objects, or when PY3.6 is the lowest supported
                # version
                for k in ('path', 'refds'):
                    if k in res:
                        res[k] = str(
                            # recode path back to dataset path anchor
                            pds.pathobj / res[k].relative_to(
                                pds_repo.pathobj)
                        )
                yield res
        # report on the dataset itself
        dsres = dict(
            action='save',
            type='dataset',
            path=pds.path,
            refds=ds.path,
            # a changed HEAD is the indicator that something was saved
            status='ok'
            if start_commit != pds_repo.get_hexsha()
            else 'notneeded',
            logger=lgr,
        )
        if not version_tag:
            yield dsres
            return
        try:
            # method requires str
            version_tag = str(version_tag)
            pds_repo.tag(version_tag)
            dsres.update(
                status='ok',
                version_tag=version_tag)
            yield dsres
        except CommandError as e:
            if dsres['status'] == 'ok':
                # first we yield the result for the actual save
                # TODO: we will get duplicate dataset/save record obscuring
                # progress reporting.  yoh thought to decouple "tag" from "save"
                # messages but was worrying that original authors would disagree
                yield dsres.copy()
            # and now complain that tagging didn't work
            dsres.update(
                status='error',
                message=('cannot tag this version: %s', e.stderr.strip()))
            yield dsres

    if not paths_by_ds:
        # Special case: empty repo. There's either an empty commit only or
        # none at all. An empty one we can amend otherwise there's nothing
        # to do.
        if amend and ds.repo.get_hexsha():
            yield from save_ds((ds.pathobj, dict()), version_tag=version_tag)
        else:
            yield dict(action='save', type='dataset', path=ds.path,
                       refds=ds.path, status='notneeded', logger=lgr)
        return

    # TODO: in principle logging could be improved to go not by a dataset
    # but by path(s) within subdatasets. That should provide a bit better ETA
    # and more "dynamic" feedback than jumpy datasets count.
    # See addurls where it is implemented that way by providing agg and another
    # log_filter
    yield from ProducerConsumerProgressLog(
        # reverse-sorted by path, so that a subdataset comes before its
        # superdataset
        sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True),
        partial(save_ds, version_tag=version_tag),
        safe_to_consume=no_subds_in_futures,
        producer_future_key=lambda ds_items: ds_items[0],
        jobs=jobs,
        log_filter=_log_filter_save_dataset,
        unit="datasets",
        lgr=lgr,
    )
def only_matching_paths(res, **kwargs):
    """Tell whether the path of a result is among the given `path` argument.

    Parameters
    ----------
    res : dict
      Result record; its ``'path'`` value (``None`` when missing) is
      tested.
    **kwargs
      Only ``'path'`` is considered (a single value or a list, an empty
      list when missing).

    Returns
    -------
    bool
    """
    # TODO handle relative paths by using a contained 'refds' value
    candidates = ensure_list(kwargs.get('path', []))
    return res.get('path', None) in candidates
def __call__(
        path=None,
        source=None,
        dataset=None,
        get_data=False,
        description=None,
        recursive=False,
        recursion_limit=None,
        reckless=None,
        jobs="auto"):
    # Generator implementing `install`; it yields result records (dicts).
    #
    # Two modes, selected by whether `source` is given:
    #
    # * `source` is None: every item in `path` is parsed with RI(). Items
    #   that are not a PathRI are treated as source URLs and each one is
    #   handed to a nested Install.__call__(source=...). Items that are a
    #   PathRI are collected and handed to Get.__call__ in a single call.
    #   Records from both are re-yielded, then the generator returns.
    # * `source` is given: at most one `path` is accepted. The source is
    #   cloned via Clone.__call__ and, if `recursive` or `get_data` is set,
    #   `get` is run on the resulting dataset with the shared options.
    #
    # Every record yielded from here gets its 'refds' key overwritten with
    # the reference dataset path derived from `dataset`.
    #
    # Raises InsufficientArgumentsError when neither `source` nor `path` is
    # given (or `source` is empty), and ValueError when more than one path
    # accompanies a `source` or when `path` cannot be parsed/resolved.

    # normalize path argument to be equal when called from cmdline and
    # python and nothing was passed into `path`
    path = ensure_list(path)

    if not source and not path:
        raise InsufficientArgumentsError(
            "Please provide at least a source or a path")

    # Common kwargs to pass to underlying git/install calls.
    # They might need adjustments (e.g. for recursion_limit), but
    # otherwise would be applicable throughout
    #
    # There should have been more of common options!
    # since underneath get could do similar installs
    common_kwargs = dict(
        get_data=get_data,
        recursive=recursive,
        recursion_limit=recursion_limit,
        # git_opts=git_opts,
        # annex_opts=annex_opts,
        reckless=reckless,
        jobs=jobs,
    )

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = None
    if dataset is not None:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='installation')
        # note: the value as given by the caller is passed on, not `ds`
        common_kwargs['dataset'] = dataset
    # pre-compute for results below
    refds_path = Interface.get_refds_path(ds)

    # switch into the two scenarios without --source:
    # 1. list of URLs
    # 2. list of (sub)dataset content
    if source is None:
        # we need to collect URLs and paths
        to_install = []
        to_get = []
        # TODO: this approach is problematic, it disrupts the order of input args.
        # consequently results will be returned in an unexpected order when a
        # mixture of source URL and paths is given. Reordering is only possible when
        # everything in here is fully processed before any results can be yielded.
        # moreover, I think the semantics of the status quo implementation are a
        # bit complicated: in a mixture list a source URL will lead to a new dataset
        # at a generated default location, but a path will lead to a subdataset
        # at that exact location
        for urlpath in path:
            ri = RI(urlpath)
            # local paths go to `get`, anything else is taken as a source URL
            (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

        # 1. multiple source URLs
        for s in to_install:
            lgr.debug("Install passes into install source=%s", s)
            for r in Install.__call__(
                    source=s,
                    description=description,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of the installed content on disk
                # should be necessary here, all done by code further
                # down that deals with an install from an actual `source`
                # any necessary fixes should go there too!
                r['refds'] = refds_path
                yield r

        # 2. one or more dataset content paths
        if to_get:
            lgr.debug("Install passes into get %d items", len(to_get))
            # all commented out hint on inability to pass those options
            # into underlying install-related calls.
            # Also need to pass from get:
            #  annex_get_opts

            for r in Get.__call__(
                    to_get,
                    # TODO should pass-through description, not sure why disabled
                    # description=description,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of get'ed content on disk should be
                # necessary here, this is the responsibility of `get`
                # (incl. adjusting parent's gitmodules when submodules end
                # up in an "updated" state (done in get helpers)
                # any required fixes should go there!
                r['refds'] = refds_path
                yield r

        # we are done here
        # the rest is about install from a `source`
        return

    # an actual `source` was given
    if source and path and len(path) > 1:
        # exception is ok here, if this fails it is either direct user error
        # or one of our internal calls is broken
        raise ValueError(
            "install needs a single PATH when source is provided. "
            "Was given mutliple PATHs: %s" % str(path))

    # parameter constraints:
    if not source:
        # exception is ok here, if this fails it is either direct user error
        # or one of our internal calls is broken
        raise InsufficientArgumentsError(
            "a `source` is required for installation")

    # code below deals with a single path only
    path = path[0] if path else None

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        yield get_status_dict(
            'install', path=path, status='impossible', logger=lgr,
            source_url=source, refds=refds_path,
            message=
            "installation `source` and destination `path` are identical. "
            "If you are trying to add a subdataset simply use the `save` command"
        )
        return

    # resolve the target location (if local) against the provided dataset
    # or CWD:
    if path is not None:
        # MIH everything in here is highly similar to what common
        # interface helpers do (or should/could do), but at the same
        # is very much tailored to just apply to `install` -- I guess
        # it has to stay special

        # Should work out just fine for regular paths, so no additional
        # conditioning is necessary
        try:
            path_ri = RI(path)
        except Exception as e:
            # any failure to parse `path` is reported as a ValueError
            raise ValueError("invalid path argument {}: ({})".format(
                path, exc_str(e)))
        try:
            # Wouldn't work for SSHRI ATM, see TODO within SSHRI
            # yoh: path should be a local path, and mapping note within
            #      SSHRI about mapping localhost:path to path is kinda
            #      a peculiar use-case IMHO
            # TODO Stringification can be removed once PY35 is no longer
            # supported
            path = str(resolve_path(path_ri.localpath, dataset))
            # any `path` argument that point to something local now
            # resolved and is no longer a URL
        except ValueError:
            # `path` is neither a valid source nor a local path.
            # TODO: The only thing left is a known subdataset with a
            # name, that is not a path; Once we correctly distinguish
            # between path and name of a submodule, we need to consider
            # this.
            # For now: Just raise
            raise ValueError("Invalid path argument {0}".format(path))
    # `path` resolved, if there was any.

    # clone dataset, will also take care of adding to superdataset, if one
    # is given
    res = Clone.__call__(
        source, path, dataset=ds, description=description,
        reckless=reckless,
        # we need to disable error handling in order to have it done at
        # the very top, otherwise we are not able to order a global
        # "ignore-and-keep-going"
        result_xfm=None,
        return_type='generator',
        result_filter=None,
        on_failure='ignore')
    # helper that turns a result record into a dataset object
    as_ds = YieldDatasets()
    destination_dataset = None
    for r in res:
        if r['action'] == 'install' and r['type'] == 'dataset':
            # make sure logic below is valid, only one dataset result is
            # coming back
            assert (destination_dataset is None)
            destination_dataset = as_ds(r)
        # every clone record is passed on, not only the dataset one
        r['refds'] = refds_path
        yield r
    assert (destination_dataset)

    # Now, recursive calls:
    if recursive or get_data:
        # dataset argument must not be passed inside since we use bound .get
        # It is ok to do "inplace" as long as we still return right
        # after the loop ends
        common_kwargs.pop('dataset', '')
        for r in destination_dataset.get(
                curdir,
                description=description,
                # we need to disable error handling in order to have it done at
                # the very top, otherwise we are not able to order a global
                # "ignore-and-keep-going"
                on_failure='ignore',
                return_type='generator',
                result_xfm=None,
                **common_kwargs):
            r['refds'] = refds_path
            yield r
    # at this point no further post-processing should be necessary,
    # `clone` and `get` must have done that (incl. parent handling)
    # if not, bugs should be fixed in those commands
    return
def diff_dataset(
        dataset,
        fr,
        to,
        constant_refs,
        path=None,
        annex=None,
        untracked='normal',
        recursive=False,
        recursion_limit=None,
        eval_file_type=True,
        reporting_order='depth-first'):
    """Internal helper to diff a dataset

    This is a generator: nothing (including argument validation) happens
    before the first record is requested.

    Parameters
    ----------
    dataset : Dataset
      Dataset to perform the diff on. `fr` and `to` parameters are interpreted
      in the context of this dataset.
    fr : str
      Commit-ish to compare from.
    to : str
      Commit-ish to compare to.
    constant_refs : bool
      If True, `fr` and `to` will be passed on unmodified to diff operations
      on subdatasets. This can be useful with symbolic references like tags
      to report subdataset changes independent of superdataset changes.
      If False, `fr` and `to` will be translated to the subdataset commit-ish
      that match the given commit-ish in the superdataset.
    path : Path-like, optional
      Paths to constrain the diff to (see main diff() command).
    annex : str, optional
      Reporting mode for annex properties (see main diff() command).
    untracked : str, optional
      Reporting mode for untracked content (see main diff() command).
    recursive : bool, optional
      Flag to enable recursive operation (see main diff() command).
    recursion_limit : int, optional
      Recursion limit (see main diff() command). Only honored when
      `recursive` is enabled.
    eval_file_type : bool, optional
      Whether to perform file type discrimination between real symlinks
      and symlinks representing annex'ed files. This can be expensive
      in datasets with many files.
    reporting_order : {'depth-first', 'breadth-first'}, optional
      By default, subdataset content records are reported after the record
      on the subdataset's submodule in a superdataset (depth-first).
      Alternatively, report all superdataset records first, before reporting
      any subdataset content records (breadth-first).

    Yields
    ------
    dict
      DataLad result records. All of them carry `action='diff'` and the
      path of the reference dataset in `refds`. Paths that cannot be
      handled are reported as `status='error'` records and otherwise
      ignored.

    Raises
    ------
    ValueError
      If `reporting_order` is not one of the supported values.
    """
    if reporting_order not in ('depth-first', 'breadth-first'):
        raise ValueError('Unknown reporting order: {}'.format(reporting_order))

    ds = require_dataset(
        dataset, check_installed=True, purpose='difference reporting')

    # we cannot really perform any sorting of paths into subdatasets
    # or rejecting paths based on the state of the filesystem, as
    # we need to be able to compare with states that are not represented
    # in the worktree (anymore)
    if path:
        # (resolved path, report-content flag) pairs of all accepted paths
        accepted = []
        # sort any path argument into the respective subdatasets
        for given in sorted(ensure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # distinguish rsync-link syntax to identify
            # a dataset as whole (e.g. 'ds') vs its
            # content (e.g. 'ds/')
            # special case is the root dataset, always report its content
            # changes
            orig_path = str(given)
            resolved_path = resolve_path(given, dataset)
            report_content = \
                orig_path.endswith(op.sep) or resolved_path == ds.pathobj
            str_path = str(resolved_path)
            root = get_dataset_root(str_path)
            if root is None:
                # no root, not possibly underneath the refds
                # labelled as a `diff` record, like every other record
                # produced by this helper
                yield dict(
                    action='diff',
                    path=str_path,
                    refds=ds.path,
                    status='error',
                    message='path not underneath this dataset',
                    logger=lgr)
                continue
            if path_under_rev_dataset(ds, str_path) is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=str_path,
                    refds=ds.path,
                    action='diff',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        ds, str_path),
                    logger=lgr,
                )
                continue

            accepted.append((resolved_path, report_content))
        path = accepted

    # TODO we might want to move away from the single-pass+immediate-yield
    # paradigm for this command. If we gather all information first, we
    # could do post-processing and detect when a file (same gitsha, or same
    # key) was copied/moved from another dataset. Another command (e.g.
    # save) could act on this information and also move/copy
    # availability information or at least enhance the respective commit
    # message with cross-dataset provenance info

    # an explicit limit only applies in recursive mode; otherwise
    # -1 requests unlimited recursion and 0 disables it
    if recursion_limit is not None and recursive:
        effective_limit = recursion_limit
    elif recursive:
        effective_limit = -1
    else:
        effective_limit = 0

    # cache to help avoid duplicate status queries
    content_info_cache = {}
    for res in _diff_ds(
            ds,
            fr,
            to,
            constant_refs,
            effective_limit,
            # TODO recode paths to repo path reference
            origpaths=None if not path else OrderedDict(path),
            untracked=untracked,
            annexinfo=annex,
            eval_file_type=eval_file_type,
            cache=content_info_cache,
            order=reporting_order):
        res.update(
            refds=ds.path,
            logger=lgr,
            action='diff',
        )
        yield res