def _remove_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info, **res_kwargs):
    """Yield result records for removing sibling `name` from `ds`.

    Only `ds`, `name` and `res_kwargs` are consulted here; the remaining
    parameters keep the signature uniform with the other sibling actions.
    """
    if not name:
        # TODO we could do ALL instead, but that sounds dangerous
        raise InsufficientArgumentsError("no sibling name given")
    props = dict(
        action='remove-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)
    try:
        # failure can happen and is OK
        ds.repo.remove_remote(name)
    except RemoteNotAvailableError:
        # result-oriented! the given remote is absent already
        yield get_status_dict(status='notneeded', **props)
        return
    yield get_status_dict(status='ok', **props)
def _remove_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info, **res_kwargs):
    """Yield result records for removing sibling `name` from `ds`.

    A missing remote is reported as 'notneeded' rather than an error;
    any other `git remote remove` failure is re-raised.
    """
    if not name:
        # TODO we could do ALL instead, but that sounds dangerous
        raise InsufficientArgumentsError("no sibling name given")
    result_props = dict(
        action='remove-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)
    try:
        # failure can happen and is OK
        with swallow_logs():
            ds.repo.remove_remote(name)
    except CommandError as e:
        if 'fatal: No such remote' in e.stderr:
            # result-oriented! the given remote is absent already
            yield get_status_dict(status='notneeded', **result_props)
            return
        # fix: bare `raise` preserves the original traceback, whereas
        # `raise e` restarted it from this frame
        raise
    yield get_status_dict(status='ok', **result_props)
def __call__(types, files=None, dataset=None):
    """Extract metadata of the given `types` and yield result records.

    Without explicit `files`, all metadata-relevant paths of the
    (installed) dataset are considered, excluding subdataset content.
    """
    dataset = require_dataset(
        dataset or curdir,
        purpose="extract metadata",
        check_installed=not files)
    if not files:
        ds = require_dataset(dataset, check_installed=True)
        subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
        files = list(_get_metadatarelevant_paths(ds, subds))
    dsmeta, contentmeta, error = _get_metadata(
        dataset,
        types,
        global_meta=True,
        content_meta=bool(files),
        paths=files)
    # one status shared by every record of this extraction run
    status = 'error' if error else 'ok'
    if dataset is not None and dataset.is_installed():
        yield get_status_dict(
            action='metadata',
            ds=dataset,
            refds=dataset.path,
            metadata=dsmeta,
            status=status)
    for relp in contentmeta:
        rec = get_status_dict(
            action='metadata',
            path=opj(dataset.path, relp) if dataset else relp,
            refds=dataset.path,
            metadata=contentmeta[relp],
            type='file',
            status=status)
        if dataset:
            rec['parentds'] = dataset.path
        yield rec
def _uninstall_dataset(ds, check, has_super, **kwargs):
    """Uninstall dataset `ds`, yielding result records.

    With `check` enabled, annex availability checks are run on the
    content first. `has_super` requests recreation of an empty
    mountpoint for the superdataset after removal.
    """
    if check and ds.is_installed():
        for rec in _drop_files(
                ds, curdir, check=True, noannex_iserror=False, **kwargs):
            yield rec
    # TODO: uninstall of a subdataset that has a local URL
    # (e.g. ./anything) implies cannot be undone, decide how, and
    # if to check for that
    # TODO check that the relevant branched are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=(
                'to be uninstalled dataset %s has present subdatasets, '
                'forgot --recursive?', ds),
            **kwargs)
        return
    # Close any possibly associated process etc with underlying repo.
    # Otherwise - rmtree could fail to remove e.g. under NFS which would
    # still have some files opened by them (thus having .nfs00000xxxx
    # files) forbidding rmdir to work in rmtree
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
def _install_necessary_subdatasets(
        ds, path, reckless, refds_path, description=None):
    """Install the chain of subdatasets of `ds` needed to reach `path`.

    Starting from the deepest known subdataset on the trajectory to
    `path`, installs every missing dataset in between and yields a
    result record per attempt.

    Note: `ds` itself has to be installed.

    Parameters
    ----------
    ds: Dataset
    path: str
    reckless: bool
    """
    # --contains limits --recursive to subdatasets on the trajectory
    # to the target path only
    trail = ds.subdatasets(contains=path, recursive=True)
    if not trail:
        # not a single known subdataset (installed or not) for this
        # path -- job done
        return
    # start with the deepest one
    current = trail[-1]
    while not GitRepo.is_valid_repo(current['path']):
        try:
            # helper gives flexibility regarding where to get the
            # module from
            installed = _install_subds_from_flexible_source(
                Dataset(current['parentds']),
                relpath(current['path'], start=current['parentds']),
                current['gitmodule_url'],
                reckless,
                description=description)
        except Exception as e:
            # installation failed: report and do not descend further
            yield get_status_dict(
                'install',
                path=current['path'],
                type='dataset',
                status='error',
                logger=lgr,
                refds=refds_path,
                message=(
                    "Installation of subdatasets %s failed with exception: %s",
                    current['path'], exc_str(e)))
            return
        # report installation, whether it helped or not
        yield get_status_dict(
            'install',
            ds=installed,
            status='ok',
            logger=lgr,
            refds=refds_path,
            message=("Installed subdataset in order to get %s", path))
        # did the fresh install bring us any closer to the target?
        trail = installed.subdatasets(contains=path, recursive=False)
        if not trail:
            # no (newly available) subdataset gets us any closer
            return
        # next round
        current = trail[-1]
def __call__(title, name="osf", dataset=None, mode="annex"):
    """Create an OSF project and register it as an annex special remote."""
    ds = require_dataset(
        dataset, purpose="create OSF remote", check_installed=True)
    # an annex is required for the special remote
    if not isinstance(ds.repo, AnnexRepo):
        yield get_status_dict(
            action="create-sibling-osf",
            type="dataset",
            status="impossible",
            message="dataset has no annex")
        return
    # NOTES:
    # - we prob. should check osf-special-remote availability upfront to
    #   fail early
    # - publish-depends option?
    # - (try to) detect github/gitlab/bitbucket to suggest linking it on
    #   OSF and configure publish dependency
    #   -> prob. overkill; just make it clear in the doc
    # - add --recursive option
    #   - recursive won't work easily; need a naming scheme for
    #     subdatasets (flat on OSF or a tree?), and detection of what is
    #     there already to skip rather than duplicate
    #   - adapt to conclusions in issue #30
    # - results need to report URL for created projects suitable for
    #   datalad output formatting (result_renderer; returned by
    #   create_project)
    # - option: Make public!
    cred = get_credentials(allow_interactive=True)
    osf = OSF(**cred)
    proj_id, proj_url = create_project(osf_session=osf.session, title=title)
    yield get_status_dict(
        action="create-project-osf",
        type="dataset",
        url=proj_url,
        id=proj_id,
        status="ok")
    init_opts = [
        "encryption=none",
        "type=external",
        "externaltype=osf",
        "autoenable=true",
        "project={}".format(proj_id),
    ]
    if mode == "export":
        init_opts += ["exporttree=yes"]
    ds.repo.init_remote(name, options=init_opts)
    # TODO: add special remote name to result?
    #       need to check w/ datalad-siblings conventions
    yield get_status_dict(
        action="add-sibling-osf", type="dataset", status="ok")
def _install_necessary_subdatasets(
        ds, path, reckless, refds_path, description=None):
    """Install every subdataset of `ds` required to gain access to `path`.

    Walks from the deepest known subdataset on the trajectory toward
    `path`, installing all missing intermediate datasets, and yields a
    result record for each installation attempt.

    Note: `ds` itself has to be installed.

    Parameters
    ----------
    ds: Dataset
    path: str
    reckless: bool
    """
    # --contains restricts the recursive listing to subdatasets on the
    # trajectory to the target path
    subds_trail = ds.subdatasets(contains=path, recursive=True)
    if not subds_trail:
        # no known subdataset (installed or not) for this path
        return
    # begin with the deepest one
    cur_subds = subds_trail[-1]
    while True:
        if GitRepo.is_valid_repo(cur_subds['path']):
            # target of this round already present -- done
            break
        parent = Dataset(cur_subds['parentds'])
        sub_relpath = relpath(
            cur_subds['path'], start=cur_subds['parentds'])
        try:
            # helper gives flexibility regarding the module source
            sd = _install_subds_from_flexible_source(
                parent,
                sub_relpath,
                cur_subds['gitmodule_url'],
                reckless,
                description=description)
        except Exception as e:
            # could not install this subdataset: report and stop
            yield get_status_dict(
                'install',
                path=cur_subds['path'],
                type='dataset',
                status='error',
                logger=lgr,
                refds=refds_path,
                message=(
                    "Installation of subdatasets %s failed with exception: %s",
                    cur_subds['path'], exc_str(e)))
            return
        # report installation, whether it helped or not
        yield get_status_dict(
            'install',
            ds=sd,
            status='ok',
            logger=lgr,
            refds=refds_path,
            message=("Installed subdataset in order to get %s", path))
        # check whether the fresh install brought us closer to `path`
        subds_trail = sd.subdatasets(contains=path, recursive=False)
        if not subds_trail:
            # no (newly available) subdataset gets us any closer
            return
        cur_subds = subds_trail[-1]
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    """Remove temporary caches and indices from a dataset tree."""
    ds = require_dataset(dataset, purpose='clean-up')
    res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
    for ap in AnnotatePaths.__call__(
            dataset=ds.path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='clean',
            unavailable_path_status='impossible',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # annotation already carries a verdict -- pass it on
            yield ap
            continue
        if ap.get('type', None) != 'dataset':
            ap.update(
                status='impossible',
                message='only datasets can be cleaned')
            yield ap
            continue
        dpath = ap['path']
        gitdir = get_git_dir(dpath)
        for subdir, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives",
                 "temporary archive", ("directory", "directories")),
                (ANNEX_TEMP_DIR, "annex-tmp",
                 "temporary annex", ("file", "files")),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", ("file", "files")),
        ]:
            topdir = opj(dpath, subdir)
            lgr.debug("Considering to clean %s:%s", dpath, subdir)
            if what is not None and flag not in what:
                # this category was not requested
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            entries = glob(opj(topdir, '*'))
            if not entries:
                # nothing to remove
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            many = len(entries) > 1
            message = (
                "Removed %d %s %s: %s",
                len(entries), msg, sing_pl[int(many)],
                ", ".join(sorted(x[len(topdir) + 1:] for x in entries)))
            rmtree(topdir)
            yield get_status_dict(
                path=topdir, status='ok', type='dir', message=message,
                **res_kwargs)
def _uninstall_dataset(ds, check, has_super, **kwargs):
    """Uninstall dataset `ds`, yielding result records.

    Refuses to act on a dataset at or above the current working
    directory. With `check` enabled, annex availability checks are run
    first, and any drop failure aborts the removal.
    """
    cwd = Path.cwd()
    if ds.pathobj == cwd or ds.pathobj in cwd.parents:
        yield get_status_dict(
            status='error',
            ds=ds,
            message='refusing to uninstall a dataset at or above the '
                    'current working directory',
            **kwargs)
        return
    if check and ds.is_installed():
        # with checks enabled we must stop whenever any drop failed,
        # because the error record alone cannot stop upstairs code
        drop_failed = False
        for rec in _drop_files(
                ds, op.curdir, check=True, noannex_iserror=False, **kwargs):
            yield rec
            if rec['action'] == 'drop' and \
                    rec.get('status', None) not in ('ok', 'notneeded'):
                drop_failed = True
        if drop_failed:
            # error reporting already happened, just stop here
            return
    # TODO: uninstall of a subdataset that has a local URL
    # (e.g. ./anything) implies cannot be undone, decide how, and
    # if to check for that
    # TODO check that the relevant branched are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=(
                'to be uninstalled dataset %s has present subdatasets, '
                'forgot --recursive?', ds),
            **kwargs)
        return
    # Close any possibly associated process etc with underlying repo.
    # Otherwise - rmtree could fail to remove e.g. under NFS which would
    # still have some files opened by them (thus having .nfs00000xxxx
    # files) forbidding rmdir to work in rmtree
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not op.exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    """Remove temporary caches/indices from a dataset tree."""
    ds = require_dataset(dataset, purpose='clean-up')
    res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
    for ap in AnnotatePaths.__call__(
            dataset=ds.path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='clean',
            unavailable_path_status='impossible',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # annotation already carries a verdict -- pass it through
            yield ap
            continue
        if ap.get('type', None) != 'dataset':
            ap.update(
                status='impossible',
                message='only datasets can be cleaned')
            yield ap
            continue
        dpath = ap['path']
        gitdir = GitRepo.get_git_dir(dpath)
        plural_dirs = ("directory", "directories")
        plural_files = ("file", "files")
        for subdir, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives",
                 "temporary archive", plural_dirs),
                (ANNEX_TEMP_DIR, "annex-tmp",
                 "temporary annex", plural_files),
                (ANNEX_TRANSFER_DIR, "annex-transfer",
                 "annex temporary transfer", plural_dirs),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", plural_files),
        ]:
            topdir = opj(dpath, subdir)
            lgr.debug("Considering to clean %s:%s", dpath, subdir)
            if what is not None and flag not in what:
                # this category was not requested
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            entries = glob(opj(topdir, '*'))
            if not entries:
                # nothing present that could be cleaned
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            many = len(entries) > 1
            message = (
                "Removed %d %s %s: %s",
                len(entries), msg, sing_pl[int(many)],
                ", ".join(sorted(x[len(topdir) + 1:] for x in entries)))
            rmtree(topdir)
            yield get_status_dict(
                path=topdir, status='ok', type='dir', message=message,
                **res_kwargs)
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    """Remove temporary caches/indices from `ds` and, optionally, its
    installed subdatasets."""
    ds = require_dataset(dataset, purpose='clean-up')
    res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
    if recursive:
        subdatasets = ds.subdatasets(
            fulfilled=True,
            recursive=recursive,
            recursion_limit=recursion_limit,
            return_type='generator',
            result_renderer='disabled',
            result_xfm='datasets')
    else:
        subdatasets = []
    for wds in itertools.chain([ds], subdatasets):
        dpath = wds.path
        gitdir = GitRepo.get_git_dir(dpath)
        plural_dirs = ("directory", "directories")
        plural_files = ("file", "files")
        for subdir, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives",
                 "temporary archive", plural_dirs),
                (ANNEX_TEMP_DIR, "annex-tmp",
                 "temporary annex", plural_files),
                (ANNEX_TRANSFER_DIR, "annex-transfer",
                 "annex temporary transfer", plural_dirs),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", plural_files),
        ]:
            topdir = opj(dpath, subdir)
            lgr.debug("Considering to clean %s:%s", dpath, subdir)
            if what is not None and flag not in what:
                # this category was not requested
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            entries = glob(opj(topdir, '*'))
            if not entries:
                # nothing present that could be cleaned
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            many = len(entries) > 1
            message = (
                "Removed %d %s %s: %s",
                len(entries), msg, sing_pl[int(many)],
                ", ".join(sorted(x[len(topdir) + 1:] for x in entries)))
            rmtree(topdir)
            yield get_status_dict(
                path=topdir, status='ok', type='dir', message=message,
                **res_kwargs)
def _recursive_install_subds_underneath(ds, recursion_limit, reckless,
                                        start=None, refds_path=None,
                                        description=None):
    """Recursively install subdatasets underneath `ds`, yielding results.

    `start` restricts recursion to subdatasets underneath that path;
    `recursion_limit` bounds the depth when it is an int.
    """
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        # recursion budget exhausted
        return
    # helper below gives flexibility regarding where to get the module
    # from
    for subinfo in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(subinfo['path'])
        if subinfo.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                subinfo['path'])
            continue
        if start is not None and not path_is_subpath(subds.path, start):
            # not underneath the start path -- ignore
            continue
        if subinfo.get('state', None) != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # even though an intermediate dataset exists, everything
            # below it may not -- hence keep recursing
        else:
            # attempt installation of this one
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(subinfo['path'], start=ds.path),
                    subinfo['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr,
                    refds=refds_path,
                    message=("Installed subdataset %s", subds),
                    parentds=ds.path)
            except Exception as e:
                # installation failed: report and skip everything below
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr,
                    refds=refds_path,
                    message=("Installation of subdatasets %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # recurse; the start constraint is already satisfied here
        next_limit = recursion_limit - 1 \
            if isinstance(recursion_limit, int) else recursion_limit
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=next_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
def _recursive_install_subds_underneath(ds, recursion_limit, reckless,
                                        start=None, refds_path=None,
                                        description=None):
    """Install all subdatasets underneath `ds` recursively.

    Yields result records. `start` limits recursion to datasets under
    that path; an integer `recursion_limit` bounds the depth.
    """
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        # depth budget spent
        return
    for sub in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not path_is_subpath(subds.path, start):
            # outside the requested subtree
            continue
        already_there = sub.get('state', None) != 'absent'
        if already_there:
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # presence of an intermediate dataset does not imply
            # presence of everything below it -- recurse regardless
        else:
            # try to obtain this dataset; the helper is flexible about
            # where the module is fetched from
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
            except Exception as e:
                # failed -- report and skip everything underneath
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr,
                    refds=refds_path,
                    message=("Installation of subdatasets %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
            yield get_status_dict(
                'install', ds=subds, status='ok', logger=lgr,
                refds=refds_path,
                message=("Installed subdataset %s", subds),
                parentds=ds.path)
        # recurse into the (now present) subdataset; the start
        # constraint is known to hold
        yield from _recursive_install_subds_underneath(
            subds,
            recursion_limit=recursion_limit - 1
            if isinstance(recursion_limit, int) else recursion_limit,
            reckless=reckless,
            refds_path=refds_path)
def __call__(paths, *, reference_date="@1514764800", revs=None,
             annex="all", no_tags=False, older=False):
    """Search repository histories for dates newer (or older) than a
    reference date, yielding one report per repository."""
    from datalad.support.repodates import check_dates

    which = "older" if older else "newer"
    try:
        ref_ts = _parse_date(reference_date)
    except ValueError as exc:
        lgr.error("Could not parse '%s' as a date", reference_date)
        ce = CapturedException(exc)
        yield get_status_dict(
            "check_dates", status="error", message=str(ce), exception=ce)
        return
    lgr.info(
        "Searching for dates %s than %s",
        which,
        time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))
    for repo in _git_repos(paths or ["."]):
        fullpath = os.path.abspath(repo)
        lgr.debug("Checking %s", fullpath)
        try:
            report = check_dates(
                repo,
                ref_ts,
                which=which,
                revs=revs or ["--all"],
                annex={"all": True,
                       "none": False,
                       "tree": "tree"}[annex],
                tags=not no_tags)
        except InvalidGitRepositoryError:
            lgr.warning("Skipping invalid Git repo: %s", repo)
            continue
        yield get_status_dict(
            "check_dates",
            status="ok",
            path=fullpath,
            message=("Found {} dates" if report["objects"]
                     else "No {} dates found").format(which),
            report=report)
def get_dataset_reponame_mapping(self, ds, name, reponame, existing,
                                 recursive, recursion_limit, res_kwargs):
    """Discover all relevant datasets locally, and build remote repo names
    """
    datasets = _get_present_datasets(ds, recursive, recursion_limit)
    # split datasets into those that still need a sibling and those
    # that already carry one of the requested name
    toprocess = []
    toyield = []
    for d in datasets:
        if existing not in ('reconfigure', 'replace') \
                and name in d.repo.get_remotes():
            toyield.append(get_status_dict(
                ds=d,
                status='error' if existing == 'error' else 'notneeded',
                message=('already has a configured sibling "%s"', name),
                **res_kwargs))
            continue
        if d == ds:
            gh_reponame = reponame
        else:
            # subdatasets get a suffix derived from their relative path
            gh_reponame = '{}-{}'.format(
                reponame,
                self.normalize_reponame(
                    str(d.pathobj.relative_to(ds.pathobj))))
        toprocess.append((d, gh_reponame))
    return toprocess, toyield
def __call__():
    """Yield a result record carrying a bash/zsh completion script.

    The script text is emitted verbatim (see the usage note inside it:
    ``source <(datalad shell-completion)``).
    """
    # NOTE(review): the exact whitespace of this heredoc is the command's
    # runtime output -- reconstructed here from a collapsed source line;
    # confirm against the original file before relying on it byte-for-byte.
    content = """\
# Universal completion script for DataLad with the core autogenerated by
# python-argcomplete and only slightly improved to work for ZSH if sourced under ZSH.
#
# Instead of just running this command and seeing this output, do
#
#     source <(datalad shell-completion)
#
# in your bash or zsh session.
if [ "${ZSH_VERSION:-}" != "" ]; then
    autoload -U compinit && compinit
    autoload -U bashcompinit && bashcompinit
fi

_python_argcomplete() {
    local IFS=' '
    COMPREPLY=( $(IFS="$IFS" COMP_LINE="$COMP_LINE" COMP_POINT="$COMP_POINT" _ARGCOMPLETE_COMP_WORDBREAKS="$COMP_WORDBREAKS" _ARGCOMPLETE=1 "$1" 8>&1 9>&2 1>/dev/null 2>/dev/null) )
    if [[ $? != 0 ]]; then
        unset COMPREPLY
    fi
}

complete -o nospace -o default -F _python_argcomplete "datalad"
"""
    yield get_status_dict(
        action='shell_completion',
        status='ok',
        content=content)
def prepare_inputs(dset_path, inputs, extra_inputs=None):
    """Prepare `inputs` for running a command.

    This consists of installing required subdatasets and getting the
    input files.

    Parameters
    ----------
    dset_path : str
    inputs : GlobbedPaths object
    extra_inputs : GlobbedPaths object, optional

    Returns
    -------
    Generator with the result records.
    """
    dset_path = _dset_arg_kludge(dset_path)
    # only keep the glob collections that were actually provided
    globbed = [gp for gp in (inputs, extra_inputs) if gp]
    if globbed:
        lgr.info('Making sure inputs are available (this may take some time)')
        get = Get()
        for gp in globbed:
            for res in _install_and_reglob(dset_path, gp):
                yield res
            if gp.misses:
                ds = Dataset(dset_path)
                for miss in gp.misses:
                    yield get_status_dict(
                        action="run",
                        ds=ds,
                        status="error",
                        message=("Input did not match existing file: %s",
                                 miss))
            yield from get(
                dataset=dset_path,
                path=gp.expand_strict(),
                on_failure="ignore")
def consumer(ds_path__sub__limit):
    """Process one (parent path, submodule record, recursion limit) item
    from the producer queue."""
    parent_path, subinfo, limit = ds_path__sub__limit
    subds = Dataset(subinfo['path'])
    if subinfo.get('state', None) != 'absent':
        rec = get_status_dict(
            'install', ds=subds, status='notneeded', logger=lgr,
            refds=refds_path)
        subs_notneeded.append(rec)
        yield rec
        # an existing intermediate dataset does not imply that
        # everything below it exists too -- still recurse below
    else:
        # TODO: here we need another "ds"! is it within "sub"?
        yield from _install_subds_from_flexible_source(
            Dataset(parent_path), subinfo,
            reckless=reckless, description=description)
    if not subds.is_installed():
        # an error result was emitted, and the external consumer can
        # decide what to do with it; no point in recursing into
        # something that should be there but isn't
        lgr.debug('Subdataset %s could not be installed, skipped', subds)
        return
    # recurse; the start constraint is known to be satisfied here
    next_limit = limit - 1 if isinstance(limit, int) else limit
    for res in _recursive_install_subds_underneath(
            subds,
            recursion_limit=next_limit,
            reckless=reckless,
            refds_path=refds_path,
            jobs=jobs,
            producer_only=True  # we will be adding to producer queue
            ):
        producer_consumer.add_to_producer_queue(res)
def _revrange_as_results(dset, revrange):
    """Yield one 'run' result record per commit in `revrange`, oldest
    first, attaching re-executable run info where present."""
    ds_repo = dset.repo
    rev_lines = ds_repo.get_revisions(
        revrange, fmt="%H %P", options=["--reverse", "--topo-order"])
    if not rev_lines:
        return
    for rev_line in rev_lines:
        # strip() is needed because, with the format above, a commit
        # without any parent has a trailing space (a custom
        # `rev-list --parents ...` call could avoid this)
        hexsha, *parents = rev_line.strip().split(" ")
        res = get_status_dict(
            "run", ds=dset, commit=hexsha, parents=parents)
        full_msg = ds_repo.format_commit("%B", hexsha)
        try:
            msg, info = get_run_info(dset, full_msg)
        except ValueError as exc:
            # recast so the message names the offending revision
            raise ValueError(
                "Error on {}'s message".format(hexsha)) from exc
        if info is not None:
            if len(parents) != 1:
                # merge/root commits with run info are not re-executable
                lgr.warning(
                    "%s has run information but is a %s commit; "
                    "it will not be re-executed",
                    hexsha,
                    "merge" if len(parents) > 1 else "root")
                continue
            res["run_info"] = info
            res["run_message"] = msg
        yield dict(res, status="ok")
def __call__(paths, reference_date="@1514764800", revs=None, annex="all",
             no_tags=False, older=False):
    """Search repository histories for dates newer (or older) than a
    reference date, yielding one report per repository."""
    from datalad.support.repodates import check_dates

    which = "older" if older else "newer"
    try:
        ref_ts = _parse_date(reference_date)
    except ValueError as exc:
        lgr.error("Could not parse '%s' as a date", reference_date)
        yield get_status_dict(
            "check_dates", status="error", message=exc_str(exc))
        return
    lgr.info(
        "Searching for dates %s than %s",
        which,
        time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))
    for repo in _git_repos(paths or ["."]):
        fullpath = os.path.abspath(repo)
        lgr.debug("Checking %s", fullpath)
        try:
            report = check_dates(
                repo,
                ref_ts,
                which=which,
                revs=revs or ["--all"],
                annex={"all": True,
                       "none": False,
                       "tree": "tree"}[annex],
                tags=not no_tags)
        except InvalidGitRepositoryError as exc:
            lgr.warning("Skipping invalid Git repo: %s", repo)
            continue
        yield get_status_dict(
            "check_dates",
            status="ok",
            path=fullpath,
            message=("Found {} dates" if report["objects"]
                     else "No {} dates found").format(which),
            report=report)
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    # expensive, access only once
    ds_repo = ds.repo
    kwargs.setdefault('action', 'drop')
    # always operate on a list; the `normalize_paths` decorator would
    # otherwise break the logic below
    paths = ensure_list(paths)
    if not hasattr(ds_repo, 'drop'):
        # plain Git repo -- nothing annexed to drop
        for p in paths:
            rec = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            rec['action'] = 'drop'
            yield rec
        return
    cmd = ['drop']
    if not check:
        cmd.append('--force')
    respath_by_status = {}
    try:
        for res in ds_repo._call_annex_records(cmd, files=paths):
            yield _postproc_result(res, respath_by_status, ds)
    except CommandError as e:
        # pick up the results captured so far and yield them;
        # the error will be amongst them
        for res in e.kwargs.get('stdout_json', []):
            yield _postproc_result(res, respath_by_status, ds)
    # report on things requested that annex was silent about
    for rec in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        rec['action'] = 'drop'
        yield rec
def __call__(dataset=None, sensitive=None, clipboard=None):
    """Assemble a 'wtf' diagnostic report and yield it as one result."""
    from datalad.distribution.dataset import require_dataset
    from datalad.support.exceptions import NoDatasetArgumentFound
    from datalad.interface.results import get_status_dict

    ds = None
    try:
        ds = require_dataset(
            dataset, check_installed=False, purpose='reporting')
    except NoDatasetArgumentFound:
        # failure is already logged
        pass
    if ds and not ds.is_installed():
        # we don't deal with absent datasets
        ds = None
    # pick the configuration to report on (only when sensitive output
    # was requested)
    if not sensitive:
        cfg = None
    elif ds is None:
        from datalad import cfg
    else:
        cfg = ds.config
    from datalad.ui import ui
    from datalad.support.external_versions import external_versions

    infos = {}
    res = get_status_dict(
        action='wtf',
        path=ds.path if ds else op.abspath(op.curdir),
        type='dataset' if ds else 'directory',
        status='ok',
        logger=lgr,
        infos=infos,
    )
    infos['datalad'] = _describe_datalad()
    infos['git-annex'] = _describe_annex()
    infos['system'] = _describe_system()
    infos['environment'] = _describe_environment()
    infos['configuration'] = _describe_configuration(cfg, sensitive)
    # NOTE(review): 'extentions' is misspelled, but it is an emitted
    # result key -- renaming could break report consumers; confirm
    # before fixing
    infos['extentions'] = _describe_extensions()
    infos['metadata_extractors'] = _describe_metadata_extractors()
    infos['dependencies'] = _describe_dependencies()
    if ds:
        try:
            infos['dataset'] = _describe_dataset(ds, sensitive)
        except InvalidGitRepositoryError as e:
            infos['dataset'] = {"invalid": exc_str(e)}
    if clipboard:
        external_versions.check(
            'pyperclip', msg="It is needed to be able to use clipboard")
        import pyperclip
        report = _render_report(res)
        pyperclip.copy(report)
        ui.message(
            "WTF information of length %s copied to clipboard"
            % len(report))
    yield res
    return
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    kwargs.setdefault('action', 'drop')
    # always operate on a list; the `normalize_paths` decorator would
    # otherwise break the logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        # plain Git repo -- nothing annexed to drop
        for p in paths:
            rec = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            rec['action'] = 'drop'
            yield rec
        return
    opts = [] if check else ['--force']
    respath_by_status = {}
    for json_rec in ds.repo.drop(paths, options=opts):
        # annex reports are always about files
        res = annexjson2result(json_rec, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status.setdefault(success, []).append(res['path'])
        yield res
    # report on things requested that annex was silent about
    for rec in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        rec['action'] = 'drop'
        yield rec
def __call__(dataset=None, recursive=False, contains=None):
    """List containers configured for a dataset, optionally recursing
    into installed subdatasets."""
    ds = require_dataset(
        dataset, check_installed=True, purpose='list containers')
    refds = ds.path
    if recursive:
        for sub in ds.subdatasets(
                contains=contains,
                on_failure='ignore',
                return_type='generator',
                result_renderer='disabled'):
            subds = Dataset(sub['path'])
            if not subds.is_installed():
                continue
            for rec in subds.containers_list(
                    recursive=recursive,
                    return_type='generator',
                    on_failure='ignore',
                    result_filter=None,
                    result_renderer=None,
                    result_xfm=None):
                # qualify the container name with the submodule name
                rec['name'] = sub['gitmodule_name'] + '/' + rec['name']
                rec['refds'] = refds
                yield rec
    # all info is in the dataset config!
    var_prefix = 'datalad.containers.'
    containers = {}
    for var, value in ds.config.items():
        if not var.startswith(var_prefix):
            # not an interesting variable
            continue
        var_comps = var[len(var_prefix):].split('.')
        cname = var_comps[0]
        ccfgname = '.'.join(var_comps[1:])
        if not ccfgname:
            continue
        containers.setdefault(cname, {})[ccfgname] = value
    for cname, cfgvals in containers.items():
        if 'image' not in cfgvals:
            # there is no container location configured
            continue
        yield get_status_dict(
            status='ok',
            action='containers',
            name=cname,
            type='file',
            path=op.join(ds.path, cfgvals.pop('image')),
            refds=refds,
            parentds=ds.path,
            # TODO
            # state='absent' if ... else 'present'
            **cfgvals)
def fn(dset, results):
    """Write a shell script reproducing the given run `results`.

    Closure: relies on enclosing-scope names `ofh` (output stream, may
    be ``sys.stdout``), `script`, `since`, and `revision`.  Yields the
    first non-'ok' result unchanged and stops, otherwise yields a
    single final status record (or ``None`` when writing to stdout).
    """
    ds_repo = dset.repo
    header = """\
#!/bin/sh
#
# This file was generated by running (the equivalent of)
#
# datalad rerun --script={script}{since} {revision}
#
# in {ds}{path}\n"""
    ofh.write(header.format(
        script=script,
        since="" if since is None else " --since=" + since,
        revision=ds_repo.get_hexsha(revision),
        ds='dataset {} at '.format(dset.id) if dset.id else '',
        path=dset.path))

    for res in results:
        if res["status"] != "ok":
            # propagate the failure and abort script generation
            yield res
            return

        if "run_info" not in res:
            continue

        run_info = res["run_info"]
        cmd = run_info["cmd"]
        expanded_cmd = format_command(
            dset, cmd,
            **dict(run_info,
                   dspath=dset.path,
                   pwd=op.join(dset.path, run_info["pwd"])))

        msg = res["run_message"]
        if msg == _format_cmd_shorty(expanded_cmd):
            # message is just the auto-generated short form; omit it
            msg = ''

        ofh.write(
            "\n" + "".join("# " + ln
                           for ln in msg.splitlines(True)) +
            "\n")
        commit_descr = ds_repo.describe(res["commit"])
        ofh.write('# (record: {})\n'.format(
            commit_descr if commit_descr else res["commit"]))

        ofh.write(expanded_cmd + "\n")
    if ofh is not sys.stdout:
        ofh.close()

    if ofh is sys.stdout:
        yield None
    else:
        yield get_status_dict(
            "run", ds=dset, status="ok",
            path=script,
            message=("Script written to %s", script))
def add_urls(rows, ifexists=None, options=None):
    """Call `git annex addurl` using information in `rows`.

    Parameters
    ----------
    rows : iterable of dict
      Each row provides 'ds', 'ds_filename', 'filename_abs', and 'url'
      for one file to register.
    ifexists : {None, "skip", "overwrite"}, optional
      What to do when the target file already exists: yield a
      'notneeded' result ("skip"), delete the file first ("overwrite"),
      or proceed unchanged (None).
    options : list of str, optional
      Extra options to pass to `git annex addurl`.

    Yields
    ------
    dict
      One result record per row.
    """
    for row in rows:
        filename_abs = row["filename_abs"]
        ds, filename = row["ds"], row["ds_filename"]
        # fix: this helper adds URLs, not metadata -- the previous debug
        # message was a copy-paste from the metadata helper
        lgr.debug("Adding URL to %s in %s", filename, ds.path)
        if os.path.exists(filename_abs) or os.path.islink(filename_abs):
            if ifexists == "skip":
                yield get_status_dict(action="addurls",
                                      ds=ds,
                                      type="file",
                                      path=filename_abs,
                                      status="notneeded")
                continue
            elif ifexists == "overwrite":
                lgr.debug("Removing %s", filename_abs)
                unlink(filename_abs)
            else:
                lgr.debug("File %s already exists", filename_abs)
        try:
            # batch mode keeps one long-running annex process for all rows
            out_json = ds.repo.add_url_to_file(filename, row["url"],
                                               batch=True, options=options)
        except AnnexBatchCommandError as exc:
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  type="file",
                                  path=filename_abs,
                                  message=exc_str(exc),
                                  status="error")
            continue

        # In the case of an error, the json object has file=None.
        if out_json["file"] is None:
            out_json["file"] = filename_abs
        yield annexjson2result(out_json, ds, action="addurls",
                               type="file", logger=lgr)
def _uninstall_dataset(ds, check, has_super, **kwargs):
    """Uninstall dataset `ds`: optionally verify-drop content, then remove.

    Parameters
    ----------
    ds : Dataset
    check : bool
      If True, have annex verify content availability before dropping,
      and abort when any drop fails.
    has_super : bool
      Whether `ds` is a subdataset; if so, an empty mountpoint directory
      is recreated after removal.
    **kwargs
      Additional payload for the result dicts.
    """
    if check and ds.is_installed():
        # if the checks are on we need to make sure to exit this function
        # whenever any drop failed, because we cannot rely on the error
        # to actually cause a stop in upstairs code
        bad_things_happened = False
        for r in _drop_files(
                ds, curdir, check=True, noannex_iserror=False, **kwargs):
            yield r
            if r['action'] == 'drop' and \
                    not r.get('status', None) in ('ok', 'notneeded'):
                bad_things_happened = True
        if bad_things_happened:
            # error reporting already happened, we can just stop here
            return

    # we want to use the bound dataset method
    from datalad.distribution.subdatasets import Subdatasets
    # TODO: uninstall of a subdataset that has a local URL
    #       (e.g. ./anything) implies cannot be undone, decide how, and
    #       if to check for that
    # TODO check that the relevant branched are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=(
                'to be uninstalled dataset %s has present subdatasets, forgot --recursive?',
                ds),
            **kwargs)
        return
    # Close any possibly associated process etc with underlying repo.
    # Otherwise - rmtree could fail to remove e.g. under NFS which would
    # still have some files opened by them (thus having .nfs00000xxxx
    # files) forbidding rmdir to work in rmtree
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
def fn(dset, results):
    """Generate a shell script that replays the commands behind `results`.

    Closure: depends on enclosing-scope names `ofh` (output stream,
    possibly ``sys.stdout``), `script`, `since`, and `revision`.  The
    first non-'ok' result is yielded as-is and generation stops;
    otherwise one final status record is yielded (``None`` for stdout).
    """
    header = """\
#!/bin/sh
#
# This file was generated by running (the equivalent of)
#
# datalad rerun --script={script}{since} {revision}
#
# in {ds}{path}\n"""
    ofh.write(header.format(
        script=script,
        since="" if since is None else " --since=" + since,
        revision=dset.repo.get_hexsha(revision),
        ds='dataset {} at '.format(dset.id) if dset.id else '',
        path=dset.path))

    for res in results:
        if res["status"] != "ok":
            # propagate the failure and abort script generation
            yield res
            return

        if "run_info" not in res:
            continue

        run_info = res["run_info"]
        cmd = run_info["cmd"]
        expanded_cmd = format_command(
            dset, cmd,
            **dict(run_info,
                   dspath=dset.path,
                   pwd=op.join(dset.path, run_info["pwd"])))

        msg = res["run_message"]
        if msg == _format_cmd_shorty(expanded_cmd):
            # message is merely the auto-generated short form; drop it
            msg = ''

        ofh.write(
            "\n" + "".join("# " + ln
                           for ln in msg.splitlines(True)) +
            "\n")
        commit_descr = dset.repo.describe(res["commit"])
        ofh.write('# (record: {})\n'.format(
            commit_descr if commit_descr else res["commit"]))

        ofh.write(expanded_cmd + "\n")
    if ofh is not sys.stdout:
        ofh.close()

    if ofh is sys.stdout:
        yield None
    else:
        yield get_status_dict(
            "run", ds=dset, status="ok",
            path=script,
            message=("Script written to %s", script))
def configuration(action, scope, specs, res_kwargs, ds=None):
    """Perform a configuration `action` and yield one result per spec.

    Parameters
    ----------
    action : str
      One of `config_actions` ('dump', 'get', 'set', 'unset'; 'add' and
      'get-all' are present but disabled below).
    scope : str or None
      Configuration scope; 'global' selects the global datalad config.
    specs : list of tuple
      (name,) or (name, value) tuples to operate on.
    res_kwargs : dict
      Common properties for all yielded result records.
    ds : Dataset, optional
      Dataset whose config to use (unless dumping without a dataset).
    """
    # pick the configuration manager: global vs dataset-bound
    if scope == 'global' or (action == 'dump' and ds is None):
        cfg = dlcfg
    else:
        cfg = ds.config

    if action not in config_actions:
        raise ValueError("Unsupported action '{}'".format(action))

    if action == 'dump':
        if not specs:
            # dumping is querying for all known keys
            specs = [
                (n,) for n in sorted(
                    set(cfg_defs.keys()).union(cfg.keys()))]
        scope = None

    for spec in specs:
        if '.' not in spec[0]:
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    "Configuration key without a section: '%s'",
                    spec[0],
                ),
                **res_kwargs)
            continue
        # TODO without get-all there is little sense in having add
        #if action == 'add':
        #    res = _add(cfg, scope, spec)
        # NOTE(review): if `config_actions` ever admits an action without
        # a branch below, `res` would be unbound here -- confirm upstream
        if action == 'get':
            res = _get(cfg, scope, spec[0])
        elif action == 'dump':
            res = _dump(cfg, spec[0])
        # TODO this should be there, if we want to be comprehensive
        # however, we turned this off by default in the config manager
        # because we hardly use it, and the handling in ConfigManager
        # is not really well done.
        #elif action == 'get-all':
        #    res = _get_all(cfg, scope, spec)
        elif action == 'set':
            res = _set(cfg, scope, *spec)
        elif action == 'unset':
            res = _unset(cfg, scope, spec[0])

        if ds:
            res['path'] = ds.path

        if 'status' not in res:
            res['status'] = 'ok'
        yield dict(res_kwargs, **res)

    if action in ('add', 'set', 'unset'):
        # we perform a single reload, rather than one for each modification
        # TODO: can we detect a call from cmdline? We could skip the reload.
        cfg.reload(force=True)
def __call__(dataset=None):
    """Demo command: validate the dataset argument and yield one 'ok' result.

    Coerces `dataset` via EnsureDataset, asserts the revolution-style
    dataset/repo API, and emits a single status record for the CWD.
    """
    dset = EnsureDataset()(dataset)
    assert isinstance(dset, RevolutionDataset)
    from datalad.tests.utils import assert_raises
    assert_raises(NotImplementedError, dset.repo.dirty)
    result = get_status_dict(
        action='demo',
        path=op.abspath(op.curdir),
        status='ok',
    )
    yield result
def _add_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info,
        **res_kwargs):
    """Add a new sibling (git remote) `name` to `ds`, then configure it.

    Errors (as a result record) if the sibling is already known; raises
    InsufficientArgumentsError when neither URL nor a derivable name is
    available.  All configuration is delegated to `_configure_remote`
    with the identical signature; its results are relabeled as
    'add-sibling'.
    """
    # TODO: allow for no url if 'inherit' and deduce from the super ds
    #       create-sibling already does it -- generalize/use
    #  Actually we could even inherit/deduce name from the super by checking
    #  which remote it is actively tracking in current branch... but may be
    #  would be too much magic

    # it seems that the only difference is that `add` should fail if a remote
    # already exists
    if (url is None and pushurl is None):
        raise InsufficientArgumentsError(
            """insufficient information to add a sibling (needs at least a dataset, and any URL).""")
    if url is None:
        url = pushurl

    if not name:
        urlri = RI(url)
        # use the hostname as default remote name
        name = urlri.hostname
        lgr.debug(
            "No sibling name given, use URL hostname '%s' as sibling name",
            name)

    if not name:
        raise InsufficientArgumentsError("no sibling name given")
    if name in known_remotes:
        yield get_status_dict(
            action='add-sibling',
            status='error',
            path=ds.path,
            type='sibling',
            name=name,
            message=("sibling is already known: %s, use `configure` instead?", name),
            **res_kwargs)
        return
    # this remote is fresh: make it known
    # just minimalistic name and URL, the rest is coming from `configure`
    ds.repo.add_remote(name, url)
    known_remotes.append(name)
    # always copy signature from above to avoid bugs
    for r in _configure_remote(
            ds, name, known_remotes, url, pushurl, fetch, description,
            as_common_datasrc, publish_depends, publish_by_default,
            annex_wanted, annex_required, annex_group, annex_groupwanted,
            inherit, get_annex_info,
            **res_kwargs):
        if r['action'] == 'configure-sibling':
            r['action'] = 'add-sibling'
        yield r
def __call__(dataset=None, sensitive=None, clipboard=None):
    """Report diagnostic ("wtf") information about the environment.

    Yields a single result record whose 'infos' payload describes
    datalad, git-annex, system, environment, configuration, etc.
    With `clipboard`, the rendered report is also copied to the
    system clipboard via pyperclip.
    """
    from datalad.distribution.dataset import require_dataset
    from datalad.support.exceptions import NoDatasetArgumentFound
    from datalad.interface.results import get_status_dict

    ds = None
    try:
        ds = require_dataset(dataset, check_installed=False, purpose='reporting')
    except NoDatasetArgumentFound:
        # failure is already logged
        pass
    if ds and not ds.is_installed():
        # we don't deal with absent datasets
        ds = None
    if sensitive:
        # only query configuration when sensitive info may be reported
        if ds is None:
            from datalad import cfg
        else:
            cfg = ds.config
    else:
        cfg = None

    from datalad.ui import ui
    from datalad.support.external_versions import external_versions

    infos = {}
    res = get_status_dict(
        action='wtf',
        path=ds.path if ds else op.abspath(op.curdir),
        type='dataset' if ds else 'directory',
        status='ok',
        logger=lgr,
        infos=infos,
    )

    infos['datalad'] = _describe_datalad()
    infos['git-annex'] = _describe_annex()
    infos['system'] = _describe_system()
    infos['environment'] = _describe_environment()
    infos['configuration'] = _describe_configuration(cfg, sensitive)
    # NOTE(review): 'extentions' is misspelled, but it is a result key
    # that consumers may already rely on -- left as-is on purpose
    infos['extentions'] = _describe_extensions()
    infos['metadata_extractors'] = _describe_metadata_extractors()
    infos['dependencies'] = _describe_dependencies()

    if ds:
        infos['dataset'] = _describe_dataset(ds, sensitive)

    if clipboard:
        external_versions.check(
            'pyperclip', msg="It is needed to be able to use clipboard")
        import pyperclip
        report = _render_report(res)
        pyperclip.copy(report)
        ui.message("WTF information of length %s copied to clipboard" % len(report))
    yield res
    return
def _remove_remote(ds, repo, name, res_kwargs, **unused_kwargs):
    """Remove the git remote `name` from `repo`, yielding one result record.

    A 'notneeded' record is yielded when the remote is already absent,
    an 'ok' record otherwise.  Raises InsufficientArgumentsError when
    no sibling name is given.
    """
    if not name:
        # TODO we could do ALL instead, but that sounds dangerous
        raise InsufficientArgumentsError("no sibling name given")
    shared_props = dict(
        action='remove-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)
    try:
        # failure can happen and is OK
        repo.remove_remote(name)
    except RemoteNotAvailableError:
        # result-oriented! given remote is absent already
        yield get_status_dict(status='notneeded', **shared_props)
    else:
        yield get_status_dict(status='ok', **shared_props)
def _add_remote(ds, name, known_remotes, url, pushurl, fetch, description,
                as_common_datasrc, publish_depends, publish_by_default,
                annex_wanted, annex_required, annex_group, annex_groupwanted,
                inherit, get_annex_info,
                **res_kwargs):
    """Register a fresh sibling (git remote) on `ds` and configure it.

    Yields an 'error' record when the sibling already exists; raises
    InsufficientArgumentsError when no URL (or derivable name) is
    available.  Configuration is handed off to `_configure_remote`
    (same signature), relabeling its results as 'add-sibling'.
    """
    # TODO: allow for no url if 'inherit' and deduce from the super ds
    #       create-sibling already does it -- generalize/use
    #  Actually we could even inherit/deduce name from the super by checking
    #  which remote it is actively tracking in current branch... but may be
    #  would be too much magic

    # it seems that the only difference is that `add` should fail if a remote
    # already exists
    if (url is None and pushurl is None):
        raise InsufficientArgumentsError(
            """insufficient information to add a sibling (needs at least a dataset, and any URL).""")
    if url is None:
        url = pushurl

    if not name:
        urlri = RI(url)
        # use the hostname as default remote name
        name = urlri.hostname
        lgr.debug(
            "No sibling name given, use URL hostname '%s' as sibling name",
            name)

    if not name:
        raise InsufficientArgumentsError("no sibling name given")
    if name in known_remotes:
        yield get_status_dict(
            action='add-sibling',
            status='error',
            path=ds.path,
            type='sibling',
            name=name,
            message=("sibling is already known: %s, use `configure` instead?", name),
            **res_kwargs)
        return
    # this remote is fresh: make it known
    # just minimalistic name and URL, the rest is coming from `configure`
    ds.repo.add_remote(name, url)
    known_remotes.append(name)
    # always copy signature from above to avoid bugs
    for r in _configure_remote(ds, name, known_remotes, url, pushurl, fetch,
                               description, as_common_datasrc,
                               publish_depends, publish_by_default,
                               annex_wanted, annex_required, annex_group,
                               annex_groupwanted, inherit, get_annex_info,
                               **res_kwargs):
        if r['action'] == 'configure-sibling':
            r['action'] = 'add-sibling'
        yield r
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts

    Yields
    ------
    dict
      Result records, one per affected path.
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        # plain Git repo (no annex): there is nothing that could be dropped
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    # without `check` annex is told to skip availability verification
    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
def add_urls(rows, ifexists=None, options=None):
    """Call `git annex addurl` using information in `rows`.

    Parameters
    ----------
    rows : iterable of dict
      Each row provides 'ds', 'ds_filename', 'filename_abs', and 'url'
      for one file to register.
    ifexists : {None, "skip", "overwrite"}, optional
      What to do when the target file already exists: yield a
      'notneeded' result ("skip"), delete the file first ("overwrite"),
      or proceed unchanged (None).
    options : list of str, optional
      Extra options to pass to `git annex addurl`.

    Yields
    ------
    dict
      One result record per row.
    """
    for row in rows:
        filename_abs = row["filename_abs"]
        ds, filename = row["ds"], row["ds_filename"]
        # fix: this helper adds URLs, not metadata -- the previous debug
        # message was a copy-paste from the metadata helper
        lgr.debug("Adding URL to %s in %s", filename, ds.path)
        if os.path.exists(filename_abs) or os.path.islink(filename_abs):
            if ifexists == "skip":
                yield get_status_dict(action="addurls",
                                      ds=ds,
                                      type="file",
                                      path=filename_abs,
                                      status="notneeded")
                continue
            elif ifexists == "overwrite":
                lgr.debug("Removing %s", filename_abs)
                unlink(filename_abs)
            else:
                lgr.debug("File %s already exists", filename_abs)
        try:
            # batch mode keeps one long-running annex process for all rows
            out_json = ds.repo.add_url_to_file(filename, row["url"],
                                               batch=True, options=options)
        except AnnexBatchCommandError as exc:
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  type="file",
                                  path=filename_abs,
                                  message=exc_str(exc),
                                  status="error")
            continue

        # In the case of an error, the json object has file=None.
        if out_json["file"] is None:
            out_json["file"] = filename_abs
        yield annexjson2result(out_json, ds, action="addurls",
                               type="file", logger=lgr)
def _recursive_install_subds_underneath(ds, recursion_limit, reckless,
                                        start=None, refds_path=None,
                                        description=None):
    """Recursively install all subdatasets of `ds` (below `start`, if given).

    Recursion stops when `recursion_limit` (an int countdown) reaches
    zero; a non-int limit means unlimited recursion.  Yields result
    records from the installation helpers so callers decide how to
    handle errors.
    """
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using helper that give some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            path=start,
            return_type='generator',
            result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if sub.get('state', None) != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            for res in _install_subds_from_flexible_source(
                    ds, sub, reckless=reckless, description=description):
                # yield everything to let the caller decide how to deal with
                # errors
                yield res
        if not subds.is_installed():
            # an error result was emitted, and the external consumer can decide
            # what to do with it, but there is no point in recursing into
            # something that should be there, but isn't
            lgr.debug('Subdataset %s could not be installed, skipped', subds)
            continue
        # recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1
                if isinstance(recursion_limit, int)
                else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
def _run_extractor(extractor_cls, name, ds, refcommit, status, process_type):
    """Helper to control extractor using the right API

    Central switch to deal with alternative/future APIs is inside.
    Any exception from an extractor is converted into an 'error'
    result record, unless `datalad.runtime.raiseonerror` is set, in
    which case it is logged and re-raised.
    """
    try:
        # detect supported API and interface as needed
        if issubclass(extractor_cls, MetadataExtractor):
            # new-style, command-like extractors
            extractor = extractor_cls()
            for r in extractor(
                    dataset=ds,
                    refcommit=refcommit,
                    status=status,
                    process_type=process_type):
                yield r
        elif hasattr(extractor_cls, 'get_metadata'):  # pragma: no cover
            # old-style, keep around for a while, but don't sweat over it much
            for res in _yield_res_from_pre2019_extractor(
                    ds,
                    name,
                    extractor_cls,
                    process_type,
                    # old extractors only take a list of relative paths
                    # and cannot benefit from outside knowledge
                    # TODO avoid is_installed() call
                    [text_type(Path(p['path']).relative_to(ds.pathobj))
                     if ds.is_installed()
                     else p['path']
                     for p in status]):
                yield res
        else:  # pragma: no cover
            raise RuntimeError(
                '{} does not have a recognised extractor API'.format(
                    extractor_cls))
    except Exception as e:  # pragma: no cover
        if cfg.get('datalad.runtime.raiseonerror'):
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', name, ds,
            )
            raise
        yield get_status_dict(
            ds=ds,
            # any errors will have been reported before
            status='error',
            message=('Failed to get %s metadata (%s): %s',
                     ds, name, exc_str(e)),
        )
def __call__(types, files=None, dataset=None):
    """Extract metadata of the given `types` from `dataset` and/or `files`.

    Yields one record with dataset-level metadata (when the dataset is
    installed), then one record per file with content metadata.  When no
    `files` are given, all metadata-relevant paths of the dataset are
    used.
    """
    dataset = require_dataset(dataset or curdir,
                              purpose="extract metadata",
                              check_installed=not files)
    if not files:
        ds = require_dataset(dataset, check_installed=True)
        subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
        files = list(_get_metadatarelevant_paths(ds, subds))

    dsmeta, contentmeta, error = _get_metadata(
        dataset,
        types,
        global_meta=True,
        content_meta=bool(files),
        paths=files)

    if dataset is not None and dataset.is_installed():
        res = get_status_dict(
            action='metadata',
            ds=dataset,
            refds=dataset.path,
            metadata=dsmeta,
            status='error' if error else 'ok')
        yield res

    for p in contentmeta:
        res = get_status_dict(
            action='metadata',
            path=opj(dataset.path, p) if dataset else p,
            refds=dataset.path,
            metadata=contentmeta[p],
            type='file',
            status='error' if error else 'ok')
        if dataset:
            res['parentds'] = dataset.path
        yield res
def _revs_as_results(dset, revs):
    """Yield one 'ok' result record per revision, annotated with run info.

    Revisions whose commit message parses as a run record get
    'run_info' and 'run_message' entries; a malformed record raises
    ValueError naming the offending revision.
    """
    for revision in revs:
        record = get_status_dict("run", ds=dset, commit=revision)
        commit_msg = dset.repo.repo.git.show(
            revision, "--format=%B", "--no-patch")
        try:
            run_msg, run_info = get_run_info(dset, commit_msg)
        except ValueError as exc:
            # Recast the error so the message includes the revision.
            raise ValueError("Error on {}'s message: {}".format(
                revision, exc_str(exc)))
        if run_info is not None:
            record["run_info"] = run_info
            record["run_message"] = run_msg
        yield dict(record, status="ok")
def _revs_as_results(dset, revs):
    """Turn each revision into a 'run' result record with status 'ok'.

    When a revision's message is a parsable run record, 'run_info'
    and 'run_message' are attached; unparsable records raise a
    ValueError that names the revision.
    """
    for revision in revs:
        record = get_status_dict("run", ds=dset, commit=revision)
        commit_msg = dset.repo.format_commit("%B", revision)
        try:
            run_msg, run_info = get_run_info(dset, commit_msg)
        except ValueError as exc:
            # Recast the error so the message includes the revision.
            raise ValueError(
                "Error on {}'s message: {}".format(revision, exc_str(exc)))
        if run_info is not None:
            record["run_info"] = run_info
            record["run_message"] = run_msg
        yield dict(record, status="ok")
def __call__():
    """Yield a single 'ok' result record for the current directory.

    Commands are implemented as generators and report any results by
    yielding status dictionaries.  Relies on `msg` from the enclosing
    scope as the result message.
    """
    result = get_status_dict(
        # an action label must be defined; the command name makes a
        # good default
        action='fusefs',
        # reported paths MUST be absolute
        path=abspath(curdir),
        # status labels identify how a result will be reported and can
        # be used for filtering
        status='ok',
        # arbitrary result message: str or tuple -- for a tuple, string
        # expansion with arguments is delayed until the message is
        # actually rendered (analog to exception messages)
        message=msg)
    yield result
def __call__(name, dataset=None, remove_image=False):
    """Remove the container configuration `name` (and optionally its image).

    Yields result records from the image removal and the save of the
    updated configuration, followed by one summary record ('ok' when
    the config section existed, 'notneeded' otherwise).
    """
    ds = require_dataset(dataset, check_installed=True,
                         purpose='remove a container')

    res = get_status_dict(
        ds=ds,
        action='containers_remove',
        logger=lgr)

    section = 'datalad.containers.{}'.format(name)
    imagecfg = '{}.image'.format(section)

    to_save = []
    if remove_image and imagecfg in ds.config:
        imagepath = ds.config.get(imagecfg)
        if op.lexists(op.join(ds.path, imagepath)):
            for r in ds.remove(
                    path=imagepath,
                    # XXX shortcomming: this is the only way to say:
                    # don't drop
                    check=False,
                    # config setting might be outdated and image no longer
                    # there -> no reason to fail, just report
                    on_failure='ignore',
                    save=False):
                yield r
            to_save.append(imagepath)

    if section in ds.config.sections():
        ds.config.remove_section(
            section,
            where='dataset',
            reload=True)
        res['status'] = 'ok'
        to_save.append(op.join('.datalad', 'config'))
    else:
        res['status'] = 'notneeded'

    if to_save:
        for r in ds.save(
                path=to_save,
                message='[DATALAD] Remove container {}'.format(name)):
            yield r

    yield res
def _query_aggregated_metadata_singlepath(
        ds, agginfos, agg_base_path, qap, reporton, cache, dsmeta,
        contentinfo_objloc):
    """This is the workhorse of query_aggregated_metadata() for querying for a
    single path

    Yields at most one dataset-level result (when the query path matches
    the metadata-providing dataset itself) followed by one result per
    matching file with content metadata.
    """
    rpath = qap['rpath']
    containing_ds = qap['metaprovider']
    qtype = qap.get('type', None)
    if (rpath == op.curdir or rpath == containing_ds) and \
            ((reporton is None and qtype == 'dataset') or \
             reporton in ('datasets', 'all')):
        # this is a direct match for a dataset (we only have agginfos for
        # datasets) -> prep result
        res = get_status_dict(
            status='ok',
            metadata=dsmeta,
            # normpath to avoid trailing dot
            path=op.normpath(op.join(ds.path, rpath)),
            type='dataset')
        # all info on the dataset is gathered -> eject
        yield res

    # bail out unless file-level reporting is requested/implied
    if (reporton is None and qtype != 'file') or reporton not in (None, 'files', 'all'):
        return

    #
    # everything that follows is about content metadata
    #
    # content info dicts have metadata stored under paths that are relative
    # to the dataset they were aggregated from
    rparentpath = op.relpath(rpath, start=containing_ds)

    # so we have some files to query, and we also have some content metadata
    contentmeta = _load_xz_json_stream(
        op.join(agg_base_path, contentinfo_objloc),
        cache=cache['objcache']) if contentinfo_objloc else {}

    for fpath in [f for f in contentmeta.keys()
                  if rparentpath == op.curdir or
                  path_startswith(f, rparentpath)]:
        # we might be onto something here, prepare result
        metadata = contentmeta.get(fpath, {})
        # we have to pull out the context for each extractor from the dataset
        # metadata
        for tlk in metadata:
            if tlk.startswith('@'):
                continue
            context = dsmeta.get(tlk, {}).get('@context', None)
            if context is None:
                continue
            metadata[tlk]['@context'] = context
        if '@context' in dsmeta:
            metadata['@context'] = dsmeta['@context']

        res = get_status_dict(
            status='ok',
            # the specific match within the containing dataset
            # normpath() because containing_ds could be `op.curdir`
            path=op.normpath(op.join(ds.path, containing_ds, fpath)),
            # we can only match files
            type='file',
            metadata=metadata)
        yield res
def __call__(
        path=None,
        dataset=None,
        get_aggregates=False,
        reporton='all',
        recursive=False):
    """Query (aggregated) metadata for `path`/`dataset`.

    With `get_aggregates`, yields one record per dataset for which
    aggregated metadata is available; otherwise annotates the given
    paths and reports metadata from the aggregated store.
    """
    # prep results
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr)
    if refds_path:
        res_kwargs['refds'] = refds_path

    if get_aggregates:
        # yield all datasets for which we have aggregated metadata as results
        # the get actual dataset results, so we can turn them into dataset
        # instances using generic top-level code if desired
        ds = require_dataset(
            refds_path,
            check_installed=True,
            purpose='aggregate metadata query')
        agginfos = load_ds_aggregate_db(
            ds,
            version=str(aggregate_layout_version),
            abspath=True
        )
        if not agginfos:
            # if there has ever been an aggregation run, this file would
            # exist, hence there has not been and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                action='metadata',
                logger=lgr,
                message='metadata aggregation has never been performed in this dataset')
            return
        parentds = []
        for dspath in sorted(agginfos):
            info = agginfos[dspath]
            if parentds and not path_is_subpath(dspath, parentds[-1]):
                parentds.pop()
            info.update(
                path=dspath,
                type='dataset',
                status='ok',
            )
            if dspath == ds.path:
                info['layout_version'] = aggregate_layout_version
            if parentds:
                info['parentds'] = parentds[-1]
            yield dict(
                info,
                **res_kwargs
            )
            parentds.append(dspath)
        return

    if not dataset and not path:
        # makes no sense to have no dataset, go with "here"
        # error generation happens during annotation
        path = op.curdir

    content_by_ds = OrderedDict()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            # MIH: we are querying the aggregated metadata anyways, and that
            # mechanism has its own, faster way to go down the hierarchy
            #recursive=recursive,
            #recursion_limit=recursion_limit,
            action='metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            # we need to know when to look into aggregated data
            force_subds_discovery=True,
            force_parentds_discovery=True,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
            ap['process_content'] = True
        to_query = None
        if ap.get('state', None) == 'absent' or \
                ap.get('type', 'dataset') != 'dataset':
            # this is a lonely absent dataset/file or content in a present dataset
            # -> query through parent
            # there must be a parent, otherwise this would be a non-dataset path
            # and would have errored during annotation
            to_query = ap['parentds']
        else:
            to_query = ap['path']
        if to_query:
            pcontent = content_by_ds.get(to_query, [])
            pcontent.append(ap)
            content_by_ds[to_query] = pcontent

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        query_agg = [ap for ap in content_by_ds[ds_path]
                     # this is an available subdataset, will be processed in another
                     # iteration
                     if ap.get('state', None) == 'absent' or
                     not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
        if not query_agg:
            continue
        # report from aggregated metadata
        for r in query_aggregated_metadata(
                reporton,
                # by default query the reference dataset, only if there is none
                # try our luck in the dataset that contains the queried path
                # this is consistent with e.g. `get_aggregates` reporting the
                # situation in the reference dataset only
                Dataset(refds_path) if refds_path else ds,
                query_agg,
                # recursion above could only recurse into datasets
                # on the filesystem, but there might be any number of
                # uninstalled datasets underneath the last installed one
                # for which we might have metadata
                recursive=recursive,
                **res_kwargs):
            yield r
    return
def __call__(
        spec=None,
        dataset=None,
        discover=False,
        help_proc=False):
    """Run (or discover, or show help for) a dataset procedure.

    With `discover`, yields one record per known procedure.  With
    `help_proc`, yields the procedure's help text.  Otherwise resolves
    `spec` (name plus arguments) to a procedure implementation and
    executes it via `Run`, passing its results through.
    """
    if not spec and not discover:
        raise InsufficientArgumentsError('requires at least a procedure name')
    if help_proc and not spec:
        raise InsufficientArgumentsError('requires a procedure name')
    try:
        ds = require_dataset(
            dataset, check_installed=False,
            purpose='run a procedure')
    except NoDatasetArgumentFound:
        ds = None

    if discover:
        # de-duplicate: first match per path wins
        reported = set()
        for m, cmd_name, cmd_tmpl, cmd_help in \
                _get_procedure_implementation('*', ds=ds):
            if m in reported:
                continue
            ex = _guess_exec(m)
            # configured template (call-format string) takes precedence:
            if cmd_tmpl:
                ex['template'] = cmd_tmpl
            if ex['type'] is None and ex['template'] is None:
                # doesn't seem like a match
                lgr.debug("Neither type nor execution template found for "
                          "%s. Ignored.", m)
                continue
            message = ex['type'] if ex['type'] else 'unknown type'
            message += ' (missing)' if ex['state'] == 'absent' else ''
            res = get_status_dict(
                action='discover_procedure',
                path=m,
                type='file',
                logger=lgr,
                refds=ds.path if ds else None,
                status='ok',
                state=ex['state'],
                procedure_name=cmd_name,
                procedure_type=ex['type'],
                procedure_callfmt=ex['template'],
                procedure_help=cmd_help,
                message=message)
            reported.add(m)
            yield res
        return

    if not isinstance(spec, (tuple, list)):
        # maybe coming from config
        import shlex
        spec = shlex.split(spec)
    name = spec[0]
    args = spec[1:]

    try:
        # get the first match an run with it
        procedure_file, cmd_name, cmd_tmpl, cmd_help = \
            next(_get_procedure_implementation(name, ds=ds))
    except StopIteration:
        res = get_status_dict(
            action='run_procedure',
            # TODO: Default renderer requires a key "path" to exist.
            # Doesn't make a lot of sense in this case
            path=name,
            logger=lgr,
            refds=ds.path if ds else None,
            status='impossible',
            message="Cannot find procedure with name '%s'" % name)
        yield res
        return

    ex = _guess_exec(procedure_file)
    # configured template (call-format string) takes precedence:
    if cmd_tmpl:
        ex['template'] = cmd_tmpl

    if help_proc:
        if cmd_help:
            res = get_status_dict(
                action='procedure_help',
                path=procedure_file,
                type='file',
                logger=lgr,
                refds=ds.path if ds else None,
                status='ok',
                state=ex['state'],
                procedure_name=cmd_name,
                procedure_type=ex['type'],
                procedure_callfmt=ex['template'],
                message=cmd_help)
        else:
            res = get_status_dict(
                action='procedure_help',
                path=procedure_file,
                type='file',
                logger=lgr,
                refds=ds.path if ds else None,
                status='impossible',
                state=ex['state'],
                procedure_name=cmd_name,
                procedure_type=ex['type'],
                procedure_callfmt=ex['template'],
                message="No help available for '%s'" % name)
        yield res
        return

    if not ex['template']:
        raise ValueError("No idea how to execute procedure %s. "
                         "Missing 'execute' permissions?" % procedure_file)

    cmd = ex['template'].format(
        script=procedure_file,
        ds=ds.path if ds else '',
        args=u' '.join(u'"{}"'.format(a) for a in args) if args else '')
    lgr.info("Running procedure %s", name)
    lgr.debug('Full procedure command: %r', cmd)
    for r in Run.__call__(
            cmd=cmd,
            dataset=ds,
            explicit=True,
            inputs=None,
            outputs=None,
            # pass through here
            on_failure='ignore',
            return_type='generator'
    ):
        yield r
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        action=None,
        unavailable_path_status='',
        unavailable_path_msg=None,
        nondataset_path_status='error',
        force_parentds_discovery=True,
        force_subds_discovery=True,
        force_no_revision_change_discovery=True,
        force_untracked_discovery=True,
        modified=None):
    """Annotate the given paths with dataset-related properties.

    Generator of result records ("annotated paths"): one dict per input
    path, enriched with properties such as ``type``, ``parentds``,
    ``state``, and ``registered_subds``. Records with an empty ``status``
    indicate that further processing by the caller is expected.

    Fix relative to the previous revision: the collection of non-existing
    paths used a list comprehension purely for its side effect
    (``[preserved_paths.append(r) for r in ...]``); this is now a plain
    loop with identical behavior.
    """
    # upfront check for the fastest possible response
    if not path and dataset is None:
        # nothing given, try "here", but do not use `require_dataset`, as
        # it will determine the root dataset of `curdir` and further down
        # lead to path annotation of upstairs directories
        dataset = curdir

    if force_subds_discovery and not force_parentds_discovery:
        raise ValueError(
            'subdataset discovery requires parent dataset discovery')

    # CONCEPT: yield with no status to indicate further processing

    # everything in one big loop to be able too yield as fast a possible
    # without any precomputing for all paths
    refds_path = Interface.get_refds_path(dataset)
    if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
        raise ValueError(
            "modification detection only works with a base dataset (non-given or found)")

    # prep common result props
    res_kwargs = dict(
        action=action if action else 'annotate_path',
        refds=refds_path,
        logger=lgr)

    # handle the case of recursion into a single dataset without any
    # extra fancy processing first -- full recursion can be done
    # faster than manual recursion, hence we gain quite some speed
    # from these few lines of extra code
    if not modified and not path and refds_path:
        if not GitRepo.is_valid_repo(refds_path):
            yield get_status_dict(
                # doesn't matter if the path is in another dataset
                # it was given as reference dataset
                status=nondataset_path_status,
                message='given reference dataset is not a dataset',
                path=refds_path,
                **res_kwargs)
            return

        refds = Dataset(refds_path)
        path = []
        # yield the dataset itself
        r = get_status_dict(ds=refds, status='', **res_kwargs)
        yield r

        if recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path itself
            for r in yield_recursive(
                    refds,
                    refds_path,
                    action,
                    recursion_limit):
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                yield r
        return

    # goal: structure in a way that makes most information on any path
    # available in a single pass, at the cheapest possible cost
    reported_paths = {}
    requested_paths = assure_list(path)

    if modified is not None:
        # modification detection would silently kill all nondataset paths
        # but we have to complain about them, hence doing it here
        if requested_paths and refds_path:
            for r in requested_paths:
                p = r['path'] if isinstance(r, dict) else r
                p = resolve_path(p, ds=refds_path)
                if path_startswith(p, refds_path):
                    # all good
                    continue
                # not the refds
                path_props = r if isinstance(r, dict) else {}
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with reference dataset'
                reported_paths[r] = res
                yield res

        # preserve non-existing paths to be silently killed by modification
        # detection and append them to requested_paths again after detection.
        # TODO: This might be melted in with treatment of non dataset paths
        # above. Re-appending those paths seems to be better than yielding
        # directly to avoid code duplication, since both cases later on are
        # dealt with again.
        preserved_paths = []
        if requested_paths:
            for r in requested_paths:
                if not lexists(r['path'] if isinstance(r, dict) else r):
                    preserved_paths.append(r)

        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # do not loop over unique(), this could be a list of dicts
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue
        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled to a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
            # this could contain all types of additional content
            containing_dir = path if not islink(path) else normpath(opj(path, pardir))
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything else
                # it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or (
                        refds_path and _with_sep(oneupdir).startswith(
                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset, and
                    # only if we stay within this refds when searching for the
                    # parent
                    parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                # NOTE the `and refds_path` is critical, as it will determine
                # whether a top-level dataset that was discovered gets the
                # parent property or not, it won't get it without a common
                # base dataset, and that is how we always rolled
                if parent and refds_path:
                    path_props['parentds'] = parent
                    # don't check whether this is actually a true subdataset of the
                    # parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether there is
        # ANY dataset, not which one is the true parent, logic below relies on
        # the fact that we end here, if there is no dataset at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(
                **dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not path_startswith(dspath, refds_path):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, or is labeled a directory, or a dataset even
            # a dataset (without this info) -> record whether this is a known subdataset
            # to its parent
            containing_ds = Dataset(parent)
            subdss = containing_ds.subdatasets(
                fulfilled=None, recursive=False,
                result_xfm=None, result_filter=None, return_type='list')
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a status
            # already
            res['status'] = path_props.get(
                'status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself, if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(parent) if containing_ds is None else containing_ds
            for r in yield_recursive(containing_ds, path, action, recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r
        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return
def _publish_data(ds, remote, paths, annex_copy_options, force, transfer_data,
                  **kwargs):
    """Copy annexed file content of `ds` to `remote` via ``annex copy``.

    Generator of result records, one per transferred file. Silently
    returns without yielding when the repo is not an annex, when the
    remote is configured with ``annex-ignore``, or when there is nothing
    that could be told to git-annex.

    Parameters
    ----------
    ds : Dataset
        Dataset whose content is to be published.
    remote : str
        Name of the target sibling/remote.
    paths : list
        Annotated path records (dicts) selected for publication.
    annex_copy_options : str or None
        Extra options passed through to ``annex copy``.
    force : bool
        If True, do not trust local availability knowledge (omits
        ``--fast``).
    transfer_data : {'all', 'auto'}
        'all' transfers everything ('.'); 'auto' restricts to explicitly
        requested paths and otherwise lets annex consult "wanted".
    **kwargs
        Common result-record properties merged into each yielded record.
    """
    # paths are annotated paths for now, changes below
    if not isinstance(ds.repo, AnnexRepo):
        # impossible to publish annex'ed data
        return
    if ds.config.getbool('remote.{}'.format(remote), 'annex-ignore', False):
        # configuration says: don't do it
        return
    if not ds.config.get('.'.join(('remote', remote, 'annex-uuid')), None):
        # this remote either isn't an annex, or hasn't been properly initialized
        for ap in paths:
            # this is only a problem if this path
            ap['status'] = 'impossible' \
                if transfer_data == 'all' or ap.get('raw_input', False) \
                else 'notneeded'
            ap['message'] = \
                ("annex for remote '%s' not available, or not properly configured",
                 remote)
            yield ap
        return

    # what data to transfer?
    if transfer_data == 'all':
        paths = ['.']
    elif transfer_data == 'auto':
        # keep only paths that were requested and are not the base path of the
        # dataset if the resulting list is empty, the "auto" mode of
        # _publish_data() will kick in and consult "wanted"
        paths = [p['path'] for p in paths
                 if p.get('raw_input', False) and
                 not p['path'] == ds.path]
    else:
        raise ValueError(
            "unknown label '{}' for `transfer_data` option".format(
                transfer_data))

    # TODO do we really have to call annex for that, or can we take it from
    # the config instead?
    remote_wanted = ds.repo.get_preferred_content('wanted', remote)
    if not (paths or annex_copy_options or remote_wanted):
        # nothing that we could tell git annex
        return

    # we should now know what needs doing
    lgr.info("Publishing {0} data to {1}".format(ds, remote))
    # overwrite URL with pushurl if any, reason:
    # https://git-annex.branchable.com/bugs/annex_ignores_pushurl_and_uses_only_url_upon___34__copy_--to__34__/
    # Note: This shouldn't happen anymore with newly added siblings.
    #       But for now check for it, until we agree on how to fix existing
    #       ones.
    pushurl = ds.config.get('remote.{}.pushurl'.format(remote), None)
    annexurl = ds.config.get('remote.{}.annexurl'.format(remote), None)
    annex_copy_options_ = annex_copy_options or ''
    if pushurl and not annexurl:
        # inject the pushurl as annexurl via a one-shot git config override
        annex_copy_options_ += ' -c "remote.{}.annexurl={}"'.format(remote, pushurl)
    if not paths and remote_wanted:
        lgr.debug("Invoking copy --auto")
        annex_copy_options_ += ' --auto'
    # TODO: we might need additional logic comparing the state of git-annex
    # branch locally and on remote to see if information about the 'copy'
    # was also reflected on the remote end
    #git_annex_hexsha = ds.repo.get_hexsha('git-annex')
    # TODO: must be the same if we merged/pushed before, if not -- skip
    # special logic may be with a warning
    if not force:
        # if we force, we do not trust local knowledge and do the checks
        annex_copy_options_ += ' --fast'
    # TODO this things needs to return JSON
    ncopied = 0
    for r in ds.repo.copy_to(
            files=[p for p in paths
                   # TODO we may have to check for any file in Git, but this
                   # one can easily happen with --since
                   if not p == opj(ds.path, '.gitmodules')],
            remote=remote,
            options=annex_copy_options_):
        ncopied += 1
        # TODO RF to have copy_to() yield JSON and convert that one
        # at present only the "good" results come out
        yield get_status_dict(status='ok', path=opj(ds.path, r),
                              type='file', parentds=ds.path, **kwargs)

    if ncopied:
        # something was transferred -- make sure server-side info is current
        _check_and_update_remote_server_info(ds, remote)
def __call__(
        source,
        path=None,
        dataset=None,
        description=None,
        reckless=False,
        alt_sources=None):
    """Clone a dataset from `source` into `path`.

    Generator of result records. Tries a list of candidate URLs derived
    from `source` (and `alt_sources`) in order, wiping out failed clone
    attempts, until one succeeds. If a `dataset` is given, the fresh
    clone is saved as a subdataset of it.

    Parameters
    ----------
    source : str or Dataset
        URL/path to clone from; a Dataset instance is reduced to its path.
    path : str, optional
        Clone destination; derived from `source` when not given.
    dataset : Dataset, optional
        Parent dataset to register the clone in (must be installed).
    description : str, optional
        Passed on to annex initialization of the clone.
    reckless : bool
        Passed on to annex initialization of the clone.
    alt_sources : list, optional
        Additional candidate sources to try.
    """
    # TODO next ones should be there, but cannot go anywhere
    # git_opts=None,
    # git_clone_opts=None,
    # annex_opts=None,
    # annex_init_opts=None

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    dataset = require_dataset(
        dataset, check_installed=True, purpose='cloning') \
        if dataset is not None else dataset
    refds_path = dataset.path if dataset else None

    if isinstance(source, Dataset):
        source = source.path

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "clone `source` and destination `path` are identical [{}]. "
            "If you are trying to add a subdataset simply use `add`".format(
                path))

    if path is not None:
        path = resolve_path(path, dataset)

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source_url = source
    source_ = _get_git_url_from_source(source)
    lgr.debug("Resolved clone source from '%s' to '%s'",
              source, source_)
    source = source_

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)
        lgr.debug("Determined clone target path from source")
    lgr.debug("Resolved clone target path to: '%s'", path)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    destination_dataset = Dataset(path)
    dest_path = path

    # common properties for all result records yielded below
    status_kwargs = dict(
        action='install', ds=destination_dataset, logger=lgr,
        refds=refds_path, source_url=source_url)

    # important test! based on this `rmtree` will happen below after failed clone
    if exists(dest_path) and listdir(dest_path):
        if destination_dataset.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            guessed_sources = _get_flexible_source_candidates(
                source, dest_path)
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in guessed_sources or \
                    get_local_file_url(track_url) in guessed_sources:
                yield get_status_dict(
                    status='notneeded',
                    message=("dataset %s was already cloned from '%s'",
                             destination_dataset,
                             source),
                    **status_kwargs)
                return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to clone into target path',
            **status_kwargs)
        return

    if dataset is not None and relpath(path, start=dataset.path).startswith(pardir):
        # destination escapes the parent dataset -- refuse
        yield get_status_dict(
            status='error',
            message=("clone target path '%s' not in specified target dataset '%s'",
                     path, dataset),
            **status_kwargs)
        return

    # generate candidate URLs from source argument to overcome a few corner cases
    # and hopefully be more robust than git clone
    candidate_sources = []
    # combine all given sources (incl. alternatives), maintain order
    for s in [source] + assure_list(alt_sources):
        candidate_sources.extend(_get_flexible_source_candidates(s))
    candidates_str = \
        " [%d other candidates]" % (len(candidate_sources) - 1) \
        if len(candidate_sources) > 1 \
        else ''
    lgr.info("Cloning %s%s into '%s'",
             source, candidates_str, dest_path)
    # remember pre-existence to decide how aggressively to clean up below
    dest_path_existed = exists(dest_path)
    error_msgs = OrderedDict()  # accumulate all error messages formatted per each url
    for isource_, source_ in enumerate(candidate_sources):
        try:
            lgr.debug("Attempting to clone %s (%d out of %d candidates) to '%s'",
                      source_, isource_ + 1, len(candidate_sources), dest_path)
            GitRepo.clone(path=dest_path, url=source_, create=True)
            break  # do not bother with other sources if succeeded
        except GitCommandError as e:
            error_msgs[source_] = exc_str_ = exc_str(e)
            lgr.debug("Failed to clone from URL: %s (%s)",
                      source_, exc_str_)
            if exists(dest_path):
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                rmtree(dest_path, children_only=dest_path_existed)
            # Whenever progress reporting is enabled, as it is now,
            # we end up without e.stderr since it is "processed" out by
            # GitPython/our progress handler.
            e_stderr = e.stderr
            from datalad.support.gitrepo import GitPythonProgressBar
            if not e_stderr and GitPythonProgressBar._last_error_lines:
                e_stderr = os.linesep.join(GitPythonProgressBar._last_error_lines)
            if 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1) if re_match else "stderr: " + e_stderr,
                    **status_kwargs)
                return

    if not destination_dataset.is_installed():
        # every candidate failed (or none produced a usable install)
        if len(error_msgs):
            error_msg = "Failed to clone from any candidate source URL. " \
                        "Encountered errors per each url were: %s"
            error_args = (error_msgs, )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            # be a case when this might happen and original error would
            # not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'succesful' source was: %s"
            error_args = (destination_dataset.path, source_)
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **status_kwargs)
        return

    if dataset is not None:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.save(
                dest_path,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    _handle_possible_annex_dataset(
        destination_dataset,
        reckless,
        description=description)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **status_kwargs)
def _get_submodules(dspath, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    """Yield result records for the submodules of the repo at `dspath`.

    Optionally modifies ``.gitmodules`` properties (`set_property` /
    `delete_property`) before reporting, and recurses into submodules.

    Parameters
    ----------
    dspath : str
        Path of the (parent) repository to inspect.
    fulfilled : bool or None
        If not None, only report submodules whose installed-state matches.
    recursive : bool
        Recurse into submodules.
    recursion_limit : int or None or 'existing'
        Recursion depth control; decremented per level when an int.
    contains : str or None
        Only report the submodule containing this path.
    bottomup : bool
        If True, yield children before the containing submodule.
    set_property : list
        (name, value) pairs to set in .gitmodules; values of the form
        ``<...{...}...>`` are expanded as templates against the record.
    delete_property : list
        Property names to remove from .gitmodules.
    refds_path : str
        Reference dataset path used for template expansion.
    """
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    # TODO bring back in more global scope from below once segfaults are
    # figured out
    #if set_property or delete_property:
    #    gitmodule_path = opj(dspath, ".gitmodules")
    #    parser = GitConfigParser(
    #        gitmodule_path, read_only=False, merge_includes=False)
    #    parser.read()
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath):
        if contains and not path_startswith(contains, sm['path']):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            # NOTE(review): parser is (re)created per submodule here as a
            # workaround for the segfaults mentioned above
            gitmodule_path = opj(dspath, ".gitmodules")
            parser = GitConfigParser(
                gitmodule_path, read_only=False, merge_includes=False)
            parser.read()
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=relpath(sm['path'], refds_path),
                            refds_relname=relpath(
                                sm['path'], refds_path).replace(os.sep, '-')))
                parser.set_value(
                    submodule_section,
                    prop,
                    val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).add(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')
            # let go of resources, locks, ...
            parser.release()

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            # top-down mode: report this submodule before its children
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'],
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            # bottom-up mode: report this submodule after its children
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
def __call__( revision="HEAD", since=None, dataset=None, branch=None, message=None, onto=None, script=None, report=False): ds = require_dataset( dataset, check_installed=True, purpose='rerunning a command') lgr.debug('rerunning command output underneath %s', ds) if script is None and not report and ds.repo.dirty: yield get_status_dict( 'run', ds=ds, status='impossible', message=( 'clean dataset required to detect changes from command; ' 'use `datalad status` to inspect unsaved changes')) return if not ds.repo.get_hexsha(): yield get_status_dict( 'run', ds=ds, status='impossible', message='cannot rerun command, nothing recorded') return if branch and branch in ds.repo.get_branches(): yield get_status_dict( "run", ds=ds, status="error", message="branch '{}' already exists".format(branch)) return if not ds.repo.commit_exists(revision + "^"): # Only a single commit is reachable from `revision`. In # this case, --since has no effect on the range construction. revrange = revision elif since is None: revrange = "{rev}^..{rev}".format(rev=revision) elif since.strip() == "": revrange = revision else: revrange = "{}..{}".format(since, revision) if ds.repo.repo.git.rev_list("--merges", revrange, "--"): yield get_status_dict( "run", ds=ds, status="error", message="cannot rerun history with merge commits") return results = _rerun_as_results(ds, revrange, since, branch, onto, message) if script: handler = _get_script_handler(script, since, revision) elif report: handler = _report else: handler = _rerun for res in handler(ds, results): yield res
def _rerun_as_results(dset, revrange, since, branch, onto, message):
    """Represent the rerun as result records.

    In the standard case, the information in these results will be used to
    actually re-execute the commands.

    Parameters
    ----------
    dset : Dataset
        Dataset whose history is being rerun.
    revrange : str
        Git revision range to inspect.
    since : str or None
        The `--since` value; an empty string triggers dropping of leading
        non-run commits.
    branch : str or None
        Target branch name (triggers an initial "checkout" record).
    onto : str or None
        Start point; an empty string means "parent of the first revision".
    message : str or None
        Overriding commit message attached to each "run" record.

    Yields
    ------
    dict
        Result records, each tagged with a ``rerun_action`` of
        "checkout", "run", "pick", or "skip".
    """
    revs = dset.repo.repo.git.rev_list("--reverse", revrange, "--").split()
    try:
        results = _revs_as_results(dset, revs)
    except ValueError as exc:
        yield get_status_dict("run", status="error", message=exc_str(exc))
        return

    if since is not None and since.strip() == "":
        # For --since='', drop any leading commits that don't have
        # a run command.
        results = list(dropwhile(lambda r: "run_info" not in r, results))
        if not results:
            yield get_status_dict(
                "run", status="impossible", ds=dset,
                message=("No run commits found in history of %s", revrange))
            return
    else:
        results = list(results)
        if not results:
            yield get_status_dict(
                "run", status="impossible", ds=dset,
                message=("No commits found in %s", revrange))
            return

    if onto is not None and onto.strip() == "":
        # Special case: --onto='' is the value of --since. Because we're
        # currently aborting if the revision list contains merges, we know
        # that, regardless of if and how --since is specified, the effective
        # value for --since is the parent of the first revision.
        onto = results[0]["commit"] + "^"

    if onto and not dset.repo.commit_exists(onto):
        # This happens either because the user specifies a value that doesn't
        # exists or the results first parent doesn't exist. The latter is
        # unlikely to happen in the wild because it means that the first commit
        # is a datalad run commit. Just abort rather than trying to checkout an
        # orphan branch or something like that.
        yield get_status_dict(
            "run", ds=dset, status="error",
            message=("Revision specified for --onto (%s) does not exist.",
                     onto))
        return

    start_point = onto or "HEAD"
    if branch or onto:
        # instruct the consumer to check out the start point first
        yield get_status_dict(
            "run",
            ds=dset,
            commit=start_point,
            branch=branch,
            rerun_action="checkout",
            status="ok")

    def rev_is_ancestor(rev):
        # True for revs already reachable from the start point
        return dset.repo.is_ancestor(rev, start_point)

    # We want to skip revs before the starting point and pick those after.
    to_pick = set(dropwhile(rev_is_ancestor, [r["commit"] for r in results]))

    def skip_or_pick(hexsha, result, msg):
        # annotate `result` as either a cherry-pick or a skip, with a
        # human-readable message
        pick = hexsha in to_pick
        result["rerun_action"] = "pick" if pick else "skip"
        shortrev = dset.repo.get_hexsha(hexsha, short=True)
        result["message"] = (
            "%s %s; %s",
            shortrev, msg, "cherry picking" if pick else "skipping")

    for res in results:
        hexsha = res["commit"]
        if "run_info" in res:
            rerun_dsid = res["run_info"].get("dsid")
            if rerun_dsid is not None and rerun_dsid != dset.id:
                # recorded in a different dataset -- cannot rerun here
                skip_or_pick(hexsha, res, "was ran from a different dataset")
                res["status"] = "impossible"
            else:
                res["rerun_action"] = "run"
                res["diff"] = diff_revision(dset, hexsha)
                # This is the overriding message, if any, passed to this rerun.
                res["rerun_message"] = message
        else:
            skip_or_pick(hexsha, res, "does not have a command")
        yield res
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None):
    """Unlock annexed file content so it can be modified.

    Generator of result records. Annotates the requested paths, groups
    them by containing dataset, and calls ``annex unlock`` on those files
    that are annexed and have their content present. Paths in plain git,
    without content, or in direct-mode repos are reported as
    'notneeded'/'impossible' without touching annex.

    Fix relative to the previous revision: ``from os.path import isdir``
    was executed inside the per-file loop; the import is now done once at
    function entry with identical behavior.

    Raises
    ------
    InsufficientArgumentsError
        If neither `path` nor `dataset` is given.
    """
    # previously imported inside the per-file loop below -- hoisted here
    from os.path import isdir

    if path is None and dataset is None:
        raise InsufficientArgumentsError(
            "insufficient arguments for unlocking: needs at least "
            "a dataset or a path to unlock.")

    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='unlock',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist",
            nondataset_path_status='impossible',
            modified=None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', 'dataset') == 'dataset':
            # this is a dataset
            ap['process_content'] = True
        to_process.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path)
    assert(not completed)

    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        content = content_by_ds[ds_path]

        # no annex, no unlock:
        if not isinstance(ds.repo, AnnexRepo):
            for ap in content:
                ap['status'] = 'notneeded'
                ap['message'] = "not annex'ed, nothing to unlock"
                ap.update(res_kwargs)
                yield ap
            continue

        # direct mode, no unlock:
        elif ds.repo.is_direct_mode():
            for ap in content:
                ap['status'] = 'notneeded'
                ap['message'] = "direct mode, nothing to unlock"
                ap.update(res_kwargs)
                yield ap
            continue

        # only files in annex with their content present:
        files = [ap['path'] for ap in content]
        to_unlock = []
        for ap, under_annex, has_content in \
            zip(content,
                ds.repo.is_under_annex(files),
                ds.repo.file_has_content(files)):

            # TODO: what about directories? Make sure, there is no
            # situation like no file beneath with content or everything in
            # git, that leads to a CommandError
            # For now pass to annex:
            if isdir(ap['path']):
                to_unlock.append(ap)
                continue

            # Note, that `file_has_content` is (planned to report) True on
            # files in git. Therefore order matters: First check for annex!
            if under_annex:
                if has_content:
                    to_unlock.append(ap)
                # no content, no unlock:
                else:
                    ap['status'] = 'impossible'
                    ap['message'] = "no content present, can't unlock"
                    ap.update(res_kwargs)
                    yield ap
            # file in git, no unlock:
            else:
                ap['status'] = 'notneeded'
                ap['message'] = "not controlled by annex, nothing to unlock"
                ap.update(res_kwargs)
                yield ap

        # don't call annex-unlock with no path, if this is this case because
        # nothing survived the filtering above
        if content and not to_unlock:
            continue

        for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
            yield get_status_dict(
                path=opj(ds.path, r),
                status='ok',
                type='file',
                **res_kwargs)
def __call__(
        path=None,
        dataset=None,
        to=None,
        since=None,
        missing='fail',
        force=False,
        transfer_data='auto',
        recursive=False,
        recursion_limit=None,
        git_opts=None,
        annex_opts=None,
        annex_copy_opts=None,
        jobs=None
):
    """Publish dataset content to a sibling.

    Generator of result records. Plan (see inline comments): determine
    the target remote per dataset, annotate the requested paths, group
    them by dataset, sort datasets depth-first, and hand each dataset
    with its selected paths to ``_publish_dataset``.

    Parameters
    ----------
    path : str or list, optional
        Path(s) constraining what to publish.
    dataset : Dataset, optional
        Base dataset; discovered from PWD when neither path nor dataset
        is given.
    to : str, optional
        Name of the sibling to publish to.
    since : str, optional
        Only publish changes since this revision; '' means "since the
        last update of the target/tracked remote".
    missing : str
        Policy for datasets without a configured target (handled in
        ``_get_remote_info``).
    force, transfer_data, recursive, recursion_limit, git_opts,
    annex_opts, annex_copy_opts, jobs
        Passed through to annotation and ``_publish_dataset``.

    Raises
    ------
    InsufficientArgumentsError
        If --since is used without a base dataset.
    """
    # if ever we get a mode, for "with-data" we would need this
    #if dataset and not path:
    #    # act on the whole dataset if nothing else was specified
    #    path = dataset.path if isinstance(dataset, Dataset) else dataset

    if not dataset and not path:
        # try to find a dataset in PWD
        dataset = require_dataset(
            None, check_installed=True, purpose='publishing')

    if since and not dataset:
        raise InsufficientArgumentsError(
            'Modification detection (--since) without a base dataset '
            'is not supported')

    if dataset and since == '':
        # only update since last update so we figure out what was the last update
        active_branch = dataset.repo.get_active_branch()
        if to:
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            since = '%s/%s' % (to, active_branch)
        else:
            # take tracking remote for the active branch
            tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch()
            if tracked_remote:
                if tracked_refspec.startswith('refs/heads/'):
                    # strip the refs prefix to get a plain branch name
                    tracked_refspec = tracked_refspec[len('refs/heads/'):]
                #to = tracked_remote
                since = '%s/%s' % (tracked_remote, tracked_refspec)
            else:
                lgr.info(
                    "No tracked remote for %s. since option is of no effect",
                    active_branch
                )
                since = None

    # here is the plan
    # 1. figure out remote to publish to
    # 2. figure out which content needs to be published to this remote
    # 3. look for any pre-publication dependencies of that remote
    #    (i.e. remotes that need to be published to before)
    # 4. publish the content needed to go to the primary remote to
    #    the dependencies first, and to the primary afterwards
    ds_remote_info = {}

    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(refds=refds_path, logger=lgr, action='publish')

    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='publish',
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            modified=since,
            return_type='generator',
            on_failure='ignore',
            force_no_revision_change_discovery=False,  # we cannot publish what was not committed
            force_untracked_discovery=False  # we cannot publish untracked
    ):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        remote_info_result = None
        if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset':
            # for everything that is not a dataset get the remote info
            # for the parent
            parentds = ap.get('parentds', None)
            if parentds and parentds not in ds_remote_info:
                remote_info_result = _get_remote_info(
                    parentds, ds_remote_info, to, missing)
        else:
            # this is a dataset
            if ap.get('state', None) == 'absent':
                continue
            # get the remote info for itself
            remote_info_result = _get_remote_info(
                ap['path'], ds_remote_info, to, missing)
            ap['process_content'] = True
        if remote_info_result is not None:
            # remote detection failed in some way -- report and move on
            ap['status'] = remote_info_result[0]
            ap['message'] = remote_info_result[1]
            yield ap
            continue
        to_process.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path)
    assert(not completed)

    lgr.debug(
        "Evaluating %i dataset publication candidate(s)",
        len(content_by_ds))
    # TODO: fancier sorting, so we still follow somewhat the hierarchy
    #       in sorted order, e.g.
    #  d1/sub1/sub1
    #  d1/sub1
    #  d1
    #  d2/sub1
    #  d2
    content_by_ds = OrderedDict(
        (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True)
    )

    lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
    for ds_path in content_by_ds:
        remote_info = ds_remote_info.get(ds_path, None)
        if remote_info is None:
            # maybe this dataset wasn't annotated above, try to get info
            # MIH: I think this entire if-branch is practically impossible
            # to reach. It is certainly untested, but I think this is due
            # to mutually exclusive conditions during remote_info detection
            remote_info_result = _get_remote_info(
                ds_path, ds_remote_info, to, missing)
            if remote_info_result is not None:
                yield get_status_dict(
                    type='dataset',
                    path=ds_path,
                    status=remote_info_result[0],
                    message=remote_info_result[1],
                    **res_kwargs)
                continue
            # continue with freshly obtained info
            remote_info = ds_remote_info[ds_path]
            # condition above must catch all other cases
            assert remote_info
        # and publish
        ds = Dataset(ds_path)
        for r in _publish_dataset(
                ds,
                remote=remote_info['remote'],
                refspec=remote_info.get('refspec', None),
                # only send paths that were explicitly requested
                paths=[p for p in content_by_ds[ds_path]
                       # do not feed (sub)dataset paths into the beast
                       # makes no sense to try to annex copy them
                       # for the base dataset itself let `transfer_data`
                       # decide
                       if p.get('type', None) != 'dataset'],
                annex_copy_options=annex_copy_opts,
                force=force,
                jobs=jobs,
                transfer_data=transfer_data,
                **res_kwargs):
            yield r
def _publish_dataset(ds, remote, refspec, paths, annex_copy_options, force=False,
                     jobs=None, transfer_data='auto', **kwargs):
    """Publish a single dataset to `remote`, honoring publication dependencies.

    Generator of result dictionaries. First (optionally) copies annexed data
    to dependency remotes and the target remote, then pushes the relevant
    branches (and the git-annex branch) via git.

    Parameters
    ----------
    ds : Dataset
        Dataset to publish.
    remote : str
        Name of the sibling/remote to publish to.
    refspec : str or None
        State of a tracking branch, when one exists (see TODO below).
    paths : list
        Annotated paths whose content should be transferred.
    annex_copy_options : str or None
        Extra options passed to `git annex copy`.
    force : bool
        If True, push regardless of detected differences.
    jobs : int, optional
        Parallel jobs for data transfer.
    transfer_data : {'auto', 'none', 'all'}
        Data transfer mode; 'none' skips annex data transfer entirely.
    **kwargs
        Passed into every result dictionary.
    """
    # TODO: this setup is now quite ugly. The only way `refspec` can come
    # in, is when there is a tracking branch, and we get its state via
    # `refspec`

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(remote)
    # list of remotes that are publication dependencies for the
    # target remote
    publish_depends = assure_list(ds.config.get(depvar, []))

    # remote might be set to be ignored by annex, or we might not even know yet its uuid
    # make sure we are up-to-date on this topic on all affected remotes, before
    # we start making decisions
    for r in publish_depends + [remote]:
        # FIX: check the uuid of the remote we are iterating over (`r`), not
        # the constant target `remote` -- otherwise dependency remotes would
        # never be fetched once the target's uuid is known
        if not ds.config.get('.'.join(('remote', r, 'annex-uuid')), None):
            lgr.debug("Obtain remote annex info from '%s'", r)
            ds.repo.fetch(remote=r)
            # in order to be able to use git's config to determine what to push,
            # we need to annex merge first. Otherwise a git push might be
            # rejected if involving all matching branches for example.
            # NOTE we should not use a precomputed 'is_annex' test here, as
            # each fetch could give evidence that there is an annex
            # somewhere and replace the repo class...
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.merge_annex(r)
    ds.config.reload()

    # anything that follows will not change the repo type anymore, cache
    is_annex_repo = isinstance(ds.repo, AnnexRepo)

    # Plan:
    # 1. Check if there is anything to push, and if so
    # 2. process push dependencies
    # 3. fetch and merge annex branch
    # 4. push non-annex branch(es)
    # 5. copy data to the remote if paths are provided or it wants something generally

    # upstream refspec needed for update (merge) and subsequent push,
    # in case there is no.
    # no tracking refspec yet?

    # TODO: i think this whole modification detection could be done by path
    # annotation at the very beginning -- keeping it for now to not get too
    # dizzy in the forehead....

    # if forced -- we push regardless if there are differences or not
    diff = True if force else has_diff(ds, refspec, remote, paths)

    # We might have got new information in git-annex branch although no other
    # changes
    if not diff and is_annex_repo:
        try:
            git_annex_commit = next(ds.repo.get_branch_commits('git-annex'))
        except StopIteration:
            git_annex_commit = None
        #diff = _get_remote_diff(ds, [], git_annex_commit, remote, 'git-annex')
        diff = _get_remote_diff(ds, git_annex_commit, remote, 'git-annex')
        if diff:
            lgr.info("Will publish updated git-annex")

    #
    # publish data (annex copy --to)
    #
    # # remote might be set to be ignored by annex, or we might not even know yet its uuid
    # annex_ignore = ds.config.getbool('remote.{}.annex-ignore'.format(remote), None)
    # annex_uuid = ds.config.get('remote.{}.annex-uuid'.format(remote), None)
    # if not annex_ignore:
    #     if annex_uuid is None:
    #         # most probably not yet 'known' and might require some annex

    copied_data = False
    # skip right away if data transfer is not desired
    if transfer_data != 'none' and isinstance(ds.repo, AnnexRepo):
        # publishing of `remote` might depend on publishing other
        # remote(s) first, so they need to receive the data first:
        for d in publish_depends:
            lgr.info("Transferring data to configured publication dependency: '%s'" % d)
            # properly initialized remote annex -> publish data
            for r in _publish_data(
                    ds, d, paths, annex_copy_options, force, transfer_data,
                    **kwargs):
                # note if we published any data, notify to sync annex branch below
                if r['status'] == 'ok' and r['action'] == 'publish' and \
                        r.get('type', None) == 'file':
                    copied_data = True
                yield r
        # and for the main target
        for r in _publish_data(
                ds, remote, paths, annex_copy_options, force, transfer_data,
                **kwargs):
            # note if we published any data, notify to sync annex branch below
            if r['status'] == 'ok' and r['action'] == 'publish' and \
                    r.get('type', None) == 'file':
                copied_data = True
            yield r

    #
    # publish dataset (git push)
    #
    if not diff and not copied_data:
        lgr.debug("No changes detected with respect to state of '%s'", remote)
        yield get_status_dict(ds=ds, status='notneeded', **kwargs)
    else:
        # publishing of `remote` might depend on publishing other
        # remote(s) first:
        for d in publish_depends:
            lgr.info("Publishing to configured dependency: '%s'" % d)
            # call this again to take care of the dependency first,
            # but keep the paths the same, as the goal is to publish those
            # to the primary remote, and not anything elase to a dependency
            for r in _publish_dataset(
                    ds,
                    d,
                    # should get the same as the base dataset
                    refspec,
                    paths,
                    annex_copy_options,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **kwargs):
                yield r

        if is_annex_repo and \
                ds.repo.is_special_annex_remote(remote):
            # There is nothing else to "publish"
            lgr.debug(
                "{0} is a special annex remote, no git push is needed".format(remote)
            )
            return

        lgr.info("Publishing {0} to {1}".format(ds, remote))
        # in order to be able to use git's config to determine what to push,
        # we need to annex merge first. Otherwise a git push might be
        # rejected if involving all matching branches for example
        # even if we already fetched above we need to do it again
        if is_annex_repo:
            lgr.debug("Obtain remote annex info from '%s'", remote)
            ds.repo.fetch(remote=remote)
            ds.repo.merge_annex(remote)

        # Note: git's push.default is 'matching', which doesn't work for first
        # time publication (a branch, that doesn't exist on remote yet)
        # But if we want to respect remote.*.push entries, etc. we need to
        # not pass a specific refspec (like active branch) to `git push`
        # by default.
        # hence we amend any existing config on the fly
        # TODO: what else to push by default?
        # consider also: --follow-tags, --tags, --atomic
        # make sure we push
        things2push = []
        current_branch = ds.repo.get_active_branch()
        if current_branch:  # possibly make this conditional on a switch
            # TODO: this should become it own helper
            if is_annex_repo:
                # annex could manage this branch
                if current_branch.startswith('annex/direct') \
                        and ds.config.getbool('annex', 'direct', default=False):
                    # this is a "fake" annex direct mode branch
                    # we want to publish the underlying branch
                    current_branch = current_branch[12:]
                # raw string to avoid invalid-escape warnings; pattern unchanged
                match_adjusted = re.match(
                    r'adjusted/(.*)\([a-z]*\)',
                    current_branch)
                if match_adjusted:
                    # adjusted/master(...)
                    # TODO: this code is not tested
                    # see https://codecov.io/gh/datalad/datalad/src/17e67045a088ae0372b38aa4d8d46ecf7c821cb7/datalad/distribution/publish.py#L156
                    # and thus probably broken -- test me!
                    current_branch = match_adjusted.group(1)
            things2push.append(current_branch)
        if is_annex_repo:
            things2push.append('git-annex')
        # check that all our magic found valid branches
        things2push = [t for t in things2push if t in ds.repo.get_branches()]
        # check that we don't ask to push things that are already configured
        # -> would cause error
        # TODO need to find a way to properly do this, when wildcards are used
        # in the push configuration variable
        things2push = [t for t in things2push
                       if t not in ds.config.get('remote.{}.push'.format(remote), [])]
        # now we know what to push where
        status, msg = _push(ds, remote, things2push, force)
        yield get_status_dict(ds=ds, status=status, message=msg, **kwargs)
def query_aggregated_metadata(reporton, ds, aps, recursive=False,
                              **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.

    Also this function doesn't cache anything, hence the caller must
    make sure to only call this once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    agginfos, agg_base_path = load_ds_aggregate_db(ds)

    # cache once loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    # relative paths for which a result was already yielded; guards against
    # duplicate reports when query paths overlap (e.g. via recursion)
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = op.relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path. the latter will reflect the situation on disk, we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = op.curdir
        rap['metaprovider'] = containing_ds

        # build list of datasets and paths to be queried for this annotated path
        # in the simple case this is just the containing dataset and the actual
        # query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset underneath
            # the query path
            # NOTE(review): these synthesized records carry no 'path' key --
            # see the 'impossible' result below which reads qap['path']
            matching_subds = [{'metaprovider': sub, 'rpath': sub,
                               'type': 'dataset'}
                              for sub in sorted(agginfos)
                              # we already have the base dataset
                              if (rpath == op.curdir and sub != op.curdir) or
                              path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        # partition into providers we have aggregate records for, and those
        # we must report as impossible to query
        to_query_available = []
        for qap in to_query:
            if qap['metaprovider'] not in agginfos:
                # NOTE(review): qap['path'] may raise KeyError for records
                # synthesized in the recursion branch above -- confirm whether
                # that branch can produce unknown metaproviders
                res = get_status_dict(
                    status='impossible',
                    path=qap['path'],
                    message=(
                        'Dataset at %s contains no aggregated metadata on this path',
                        qap['metaprovider']),
                )
                # NOTE(review): merging `res` into itself is a no-op; this is
                # effectively res.update(**kwargs)
                res.update(res, **kwargs)
                if 'type' in qap:
                    res['type'] = qap['type']
                yield res
            else:
                to_query_available.append(qap)

        # one heck of a beast to get the set of filenames for all metadata objects that are
        # required to be present to fulfill this query
        # ('content_info' objects are only needed when file-level reporting
        # was requested, or is implied by the annotated path type)
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query_available
            for t in ('dataset_info',) + \
            (('content_info',)
                if ((reporton is None and qap.get('type', None) == 'file')
                    or reporton in ('files', 'all')) else tuple())
        )
        # in case there was no metadata provider, we do not want to start
        # downloading everything: see https://github.com/datalad/datalad/issues/2458
        objfiles.difference_update([None])
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            # fetch the metadata object files via `get`, with enough annotation
            # to avoid a separate discovery pass
            get(path=[dict(path=op.join(agg_base_path, of),
                           parentds=ds.path, type='file')
                      for of in objfiles if of],
                dataset=ds,
                result_renderer='disabled')
        for qap in to_query_available:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            # propagate selected provider properties into every result
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(
                    op.join(agg_base_path, dsobjloc),
                    cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this is being
                # reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = op.normpath(op.join(ds.path, qap['metaprovider']))
                yield r
                reported.add(qap['rpath'])
def _get_submodules(ds, paths, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    """Recursively yield result records for the submodules of `ds`.

    Optionally edits .gitmodules properties (`set_property`/`delete_property`)
    for reported submodules and saves the modification.

    Parameters
    ----------
    ds : Dataset
      Dataset whose submodules are inspected.
    paths : list or None
      Constrain reporting to these paths; `None` reports all submodules.
    fulfilled : bool or None
      If not None, only report submodules whose presence on disk matches
      this flag.
    recursive : bool
      Recurse into present submodules.
    recursion_limit : int or {None, 'existing'}
      Maximum recursion depth; decremented on each level when an int.
    contains : list or None
      Only report submodules that contain any of these paths.
    bottomup : bool
      If True, yield child results before the parent record; otherwise
      parent first.
    set_property : list of (name, value) or None
      .gitmodules properties to set; values in '<...>' with '{' are treated
      as templates and expanded from the submodule record.
    delete_property : list of str or None
      .gitmodules properties to unset.
    refds_path : str
      Path of the reference dataset, used for template expansion.
    """
    dspath = ds.path
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(ds)
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(ds, paths):
        # NOTE: sm['path'] appears to be a pathlib-like object here
        # (uses .parents/.relative_to) -- TODO confirm
        if contains and not any(
                sm['path'] == c or sm['path'] in c.parents
                for c in contains):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        # do we just need this to recurse into subdatasets, or is this a
        # real results?
        to_report = paths is None \
            or any(p == sm['path'] or p in sm['path'].parents
                   for p in paths)
        # enrich the record with any .gitmodules-declared properties
        sm.update(modinfo.get(sm['path'], {}))
        if to_report and (set_property or delete_property):
            # first deletions
            for dprop in assure_list(delete_property):
                try:
                    out, err = ds.repo._git_custom_command(
                        '', ['git', 'config', '--file', '.gitmodules',
                             '--unset-all',
                             'submodule.{}.{}'.format(sm['gitmodule_name'], dprop),
                             ]
                    )
                except CommandError:
                    yield get_status_dict(
                        'subdataset',
                        status='impossible',
                        message=(
                            "Deleting subdataset property '%s' failed for "
                            "subdataset '%s', possibly did "
                            "not exist",
                            dprop, sm['gitmodule_name']),
                        logger=lgr,
                        **sm)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=sm['path'].relative_to(refds_path),
                            refds_relname=text_type(
                                sm['path'].relative_to(refds_path)
                            ).replace(os.sep, '-')))
                try:
                    out, err = ds.repo._git_custom_command(
                        '', ['git', 'config', '--file', '.gitmodules',
                             '--replace-all',
                             'submodule.{}.{}'.format(sm['gitmodule_name'], prop),
                             text_type(val),
                             ]
                    )
                except CommandError as e:  # pragma: no cover
                    # this conditional may not be possible to reach, as
                    # variable name validity is checked before and Git
                    # replaces the file completely, resolving any permission
                    # issues, if the file could be read (already done above)
                    yield get_status_dict(
                        'subdataset',
                        status='error',
                        message=(
                            "Failed to set property '%s': %s",
                            prop, exc_str(e)),
                        type='dataset',
                        logger=lgr,
                        **sm)
                    # it is up to parent code to decide whether we would continue
                    # after this
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            # persist the .gitmodules edits right away
            Dataset(dspath).save(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        # top-down reporting: parent record before any children
        if to_report and (not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    Dataset(sm['path']),
                    paths,
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        # bottom-up reporting: parent record after all children
        if to_report and (bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres
def __call__(message=None, path=None, dataset=None,
             all_updated=True, version_tag=None,
             recursive=False, recursion_limit=None, super_datasets=False,
             message_file=None
             ):
    """Save the state of a dataset (and optionally its superdatasets).

    Generator of result dictionaries. Annotates the given paths, discovers
    all datasets that need saving (bottom-up), saves each via
    `save_dataset`, and optionally tags the resulting state.

    Raises
    ------
    ValueError
      If both `message` and `message_file` are given.
    """
    if not dataset and not path:
        # we got nothing at all -> save what is staged in the repo in "this" directory?
        # make sure we don't treat this as a user-provided '.' argument
        path = [{'path': abspath(curdir), 'raw_input': False}]

    refds_path = Interface.get_refds_path(dataset)

    if message and message_file:
        raise ValueError("Both a message and message file were specified")

    if message_file:
        with open(message_file, "rb") as mfh:
            message = assure_unicode(mfh.read())

    to_process = []
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='save',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            modified='HEAD' if not path and recursive else None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False):
            # this path was found untracked, but not explicitly given to save
            # we will silently ignore this
            continue
        got_nothing = False
        # next check should not be done during annotation, as it is possibly expensive
        # and not generally useful
        if ap.get('status', None) == 'impossible' and \
                ap.get('state', None) == 'absent' and \
                ap.get('parentds', None):
            # this is not here anymore, but it might actually have been a deleted
            # component
            if relpath(ap['path'], start=ap['parentds']) \
                    in Dataset(ap['parentds']).repo.get_deleted_files():
                # ok, this is a staged deletion that we want to save
                # clear the 'impossible' verdict so the path is processed below
                ap['status'] = ''
                del ap['message']
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # for things like: `ds.save()`
        # or recursively discovered datasets
        if ap['path'] == refds_path or \
                (ap.get('type', None) == 'dataset' and
                 not ap.get('raw_input', False) and
                 not ap.get('state', None) == 'absent'):
            ap['process_content'] = True
            ap['process_updated_only'] = all_updated
        to_process.append(ap)
    lgr.log(2, "save, to_process=%r", to_process)
    if got_nothing and recursive and refds_path:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'save',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if super_datasets:
        # search for the topmost superdatasets of any path
        dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process]
        superdss = [ds.get_superdataset(topmost=True)
                    for ds in dss]
        superdss = get_tree_roots(
            unique(ds.path for ds in dss + superdss if ds))
        if dataset:
            # need to adjust the reference to the new superds
            # if we had one ref before, we should still have exactly one
            assert len(superdss) <= 1
            dataset = list(superdss.keys())[0]
            refds_path = dataset
    elif refds_path:
        # there is a single superdataset
        superdss = {
            refds_path: unique([ap['parentds']
                                for ap in to_process if 'parentds' in ap])}
    else:
        # sort all datasets under their potential superdatasets
        # start from the top to get all subdatasets down the line
        # and collate them into as few superdatasets as possible
        # this is quick, just string operations
        superdss = get_tree_roots(
            unique([ap['parentds']
                    for ap in to_process if 'parentds' in ap]))
    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    # if this is called from e.g. `add` this is actually not necessary,
    # but in the general case we cannot avoid it
    # TODO maybe introduce a switch?
    discovered = {}
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        discover_dataset_trace_to_targets(
            # from here
            superds_path,
            # to all
            target_subs,
            [],
            discovered)
    # create a new minimally annotated path for each discovered dataset
    discovered_added = set()
    for parentds in discovered:
        for subds in discovered[parentds]:
            to_process.append(dict(
                path=subds,
                parentds=parentds,
                type='dataset'))
            discovered_added.add(subds)
    # make sure we have an entry for each dataset, including those
    # tha are just parents
    for parentds in discovered:
        if parentds not in discovered_added:
            to_process.append(dict(
                path=parentds,
                type='dataset',
                # make sure we save content of superds later on
                process_content=True,
                # but not do nasty things, like adding untracked content
                # just because we discovered this dataset
                process_updated_only=True))

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, deduplication happens here too
    annotated_paths = AnnotatePaths.__call__(
        path=to_process,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='save',
        unavailable_path_status='',
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    # sort annotated paths into their datasets for per-dataset processing
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert(not completed)

    # iterate over all datasets, starting at the bottom
    # (reverse path sort approximates bottom-up hierarchy order)
    for dspath in sorted(content_by_ds.keys(), reverse=True):
        ds = Dataset(dspath)
        res = get_status_dict('save', ds=ds, logger=lgr)
        if not ds.is_installed():
            # TODO This is likely impossible now
            res['status'] = 'impossible'
            res['message'] = ('dataset %s is not installed', ds)
            yield res
            continue
        saved_state = save_dataset(
            ds,
            content_by_ds[dspath],
            message=message)
        res['status'] = 'ok' if saved_state else 'notneeded'
        # MIH: let's tag even if there was nothing commit. I'd forget this
        # option too often...
        if version_tag:
            try:
                # TODO: check whether comment below is still true after
                # removing the log swallowing:
                # again cannot help but force-silence low-level code, because
                # it screams like a made man instead of allowing top-level
                # code an orderly error report
                ds.repo.tag(version_tag)
                # even if we haven't saved anything
                res['status'] = 'ok'
                yield res
            except CommandError as e:
                if saved_state:
                    # first we yield the result for the actual save
                    yield res
                # and now complain that tagging didn't work
                yield get_status_dict(
                    'save',
                    ds=ds,
                    logger=lgr,
                    status='error',
                    message=(
                        'cannot tag this version: %s',
                        e.stderr.strip()))
        else:
            yield res
def __call__(
        path=None,
        dataset=None,
        # support passing this through in a path by path basis
        to_git=None,
        save=True,
        message=None,
        message_file=None,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    """Add content (files and subdatasets) to a dataset.

    Generator of result dictionaries. Annotates the given paths, discovers
    unregistered subdatasets, registers them as submodules, performs the
    actual `git/annex add` per dataset, and finally (unless `save=False`)
    saves the resulting state via `Save.__call__`.

    Raises
    ------
    InsufficientArgumentsError
      If no `path` is given.
    ValueError
      If both `message` and `message_file` are given.
    """
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    refds_path = Interface.get_refds_path(dataset)
    common_report = dict(action='add', logger=lgr, refds=refds_path)

    if message and message_file:
        raise ValueError("Both a message and message file were specified")

    if message_file:
        with open(message_file, "rb") as mfh:
            message = assure_unicode(mfh.read())

    to_add = []
    subds_to_add = {}
    ds_to_annotate_from_recursion = {}
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=dataset,
            # never recursion, need to handle manually below to be able to
            # discover untracked content
            recursive=False,
            action='add',
            # speed things up by using Git's modification detection, if there
            # is a repo with at least one commit
            modified='HEAD' \
            if dataset and \
            GitRepo.is_valid_repo(refds_path) and \
            GitRepo(refds_path).get_hexsha() \
            else None,
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        got_nothing = False
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
            # FIX: removed stray leading '"' from the error message
            yield get_status_dict(
                status='impossible',
                message='there is no dataset to add this path to',
                **dict(common_report, **ap))
            continue
        if ap.get('type', None) == 'directory' and \
                ap.get('state', None) == 'untracked' and \
                GitRepo.is_valid_repo(ap['path']):
            # this is an untracked wannabe subdataset in disguise
            ap['type'] = 'dataset'
        if recursive and \
                (ap.get('raw_input', False) or
                 ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
            # this was an actually requested input path, or a path that was found
            # modified by path annotation, based on an input argument
            # we need to recurse into all subdirs to find potentially
            # unregistered subdatasets
            # but only if this path has a parent, or is itself a dataset
            # otherwise there is nothing to add to
            _discover_subdatasets_recursively(
                ds_to_annotate_from_recursion,
                ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']],
                recursion_limit)
            # get the file content of the root dataset of this search added too
            # but be careful with extreme recursion_limit settings
            if recursion_limit is None or recursion_limit > 0:
                ap['process_content'] = True
        # record for further processing
        if not ap['path'] in ds_to_annotate_from_recursion:
            # if it was somehow already discovered
            to_add.append(ap)
    if got_nothing:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'add',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    for subds in ds_to_annotate_from_recursion:
        if subds not in subds_to_add:
            # always prefer the already annotated path
            subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

    if dataset:
        # we have a base dataset, discover any intermediate datasets between
        # the base and any already discovered dataset
        discovered = {}
        discover_dataset_trace_to_targets(
            # from here
            dataset.path,
            # to any dataset we are aware of
            subds_to_add.keys(),
            [],
            discovered)
        for parentds in discovered:
            for subds in discovered[parentds]:
                subds_to_add[subds] = subds_to_add.get(
                    subds,
                    dict(path=subds, parentds=parentds, type='dataset'))

    # merge custom paths and discovered dataset records, paths needs to go first,
    # because we know most about then, and subsequent annotation call we skip the
    # later duplicate ones
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert(not completed)

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue
            subds = Dataset(ap['path'])
            subds_relpath = relpath(ap['path'], ds_path)
            # Register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=None, name=None)
            except (CommandError, InvalidGitRepositoryError) as e:
                yield get_status_dict(
                    ds=subds, status='error', message=e.stderr,
                    **dict(common_report, **ap))
                continue
            # queue for saving using the updated annotated path
            ap['registered_subds'] = True
            # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules was not staged,
            # whereas another submodule on the same level in the same
            # superdataset which also has one file in it was staged
            # disable to work correctly, while paying a little bit of
            # slow down
            #ap['staged'] = True
            to_save.append(ap)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
            # make sure that .gitmodules is added to the list of files
            gitmodules_path = opj(ds.path, '.gitmodules')
            # for git
            torepoadd[gitmodules_path] = dict(path=gitmodules_path)
            # and for save
            to_save.append(dict(
                path=gitmodules_path,
                parentds=ds_path,
                type='file'))
        # make sure any last minute additions make it to the saving stage
        # XXX? should content_by_ds become OrderedDict so that possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add_(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw
        )
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path
            if success:
                # this was successfully added, queue for saving this very path
                # in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to save
                # stuff that is marked state='untracked'
                to_save.append({k: v for k, v in res.items()
                                if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX Actually not sure if this can really happen
                # (depends on what our low-level code would do)
                # but worst case is that we loose a little bit of
                # coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                r = dict(r, **torepoadd[r['path']])
            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't need
                # to be added, but it should be saved/commited if so desired
                to_save.append({k: v for k, v in r.items()
                                if k not in ('status', 'state')})
            # XXX something is fishy with the next one, rethink when sober....
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly requested
                # make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append({k: v for k, v in r.items() if k != 'status'})
            yield r
        if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here when with `add('.')` and annex ignores any
            # dotfiles. In this case we end up not saving a dataset completely, because
            # we rely on accurate reporting. there is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have anything done to this
            # dataset -> queue dataset itself for saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`
    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do with
    # them -- this is costs something, but should be safer, and frankly is
    # more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None, extra_inputs=None, rerun_outputs=None,
                inject=False,
                saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try
        to use fairly specific key names and are encouraged to nest fields
        under a top-level "namespace" key (e.g., the project or extension
        name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a
        message string as arguments and must record any changes done to any
        content matching an entry in the path list. Must yield result
        dictionaries as a generator.

    Yields
    ------
    Result records for the run.
    """
    # An empty/missing command is treated as a no-op, not an error.
    if not cmd:
        lgr.warning("No command given")
        return

    # Determine the working directory for the command.  A rerun record may
    # carry a dataset-relative 'pwd'; in that case re-anchor it on the current
    # dataset location.  Otherwise derive pwd from the calling environment.
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    # Recording requires an installed dataset; fail early otherwise.
    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # A dirty tree would make it impossible to attribute changes to the
        # command, so refuse to run (reported as a result record, not raised).
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    # Wrap input/output path specs; glob expansion of the recorded paths is
    # governed by the `expand` mode ("inputs", "outputs", or "both").
    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        # Make sure declared inputs are present (content retrieved) ...
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        # ... and declared outputs are writable before the command runs.
        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these
            # are full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can
        # do so in extra_info.
        cmd_exitcode = 0
        exc = None

    # Substitute {inputs}/{outputs}/... placeholders in the command string.
    # An unrecognized placeholder surfaces as a KeyError and is reported as a
    # user-facing 'impossible' result rather than a traceback.
    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        # Actually execute the command; for a rerun, tolerate the exit code
        # recorded by the original run.  `exc` (if any) is re-raised further
        # below, after the commit message has been prepared.
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        # Caller-supplied keys override the standard ones (documented above).
        run_info.update(extra_info)

    # Serialized run record: either embedded verbatim in the commit message,
    # or stored in a sidecar file and referenced by checksum (see below).
    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        # No explicit choice by the caller -- consult configuration.
        use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much
            # difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # With explicit=True only the declared outputs are saved; otherwise the
    # whole tree ('.') is handed to the saver.
    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        # Command failed: do not save, but stash the prepared commit message
        # under .git so the user can still commit the results manually, then
        # re-raise the execution failure.
        if outputs_to_save:
            # NOTE(review): relpath() here is relative to the process cwd,
            # not the dataset -- presumably intentional for the hint message.
            msg_path = relpath(opj(ds.repo.path, ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad add -d . -r -F %s .'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        # Success path: record the changes via the injected saver.
        for r in saver(ds, outputs_to_save, msg):
            yield r