def generator_func(*_args, **_kwargs):
    # flag whether to raise an exception
    incomplete_results = []
    # track what actions were performed and how many times
    action_summary = {}

    if proc_pre and cmdline_name != 'run-procedure':
        from datalad.interface.run_procedure import RunProcedure
        for procspec in proc_pre:
            lgr.debug('Running configured pre-procedure %s', procspec)
            for r in _process_results(
                    RunProcedure.__call__(
                        procspec,
                        dataset=dataset_arg,
                        return_type='generator'),
                    _func_class, action_summary,
                    on_failure, incomplete_results,
                    result_renderer, result_xfm, result_filter,
                    **_kwargs):
                yield r

    # process main results
    for r in _process_results(
            wrapped(*_args, **_kwargs),
            _func_class, action_summary,
            on_failure, incomplete_results,
            result_renderer, result_xfm, _result_filter,
            **_kwargs):
        yield r

    if proc_post and cmdline_name != 'run-procedure':
        from datalad.interface.run_procedure import RunProcedure
        for procspec in proc_post:
            lgr.debug('Running configured post-procedure %s', procspec)
            for r in _process_results(
                    RunProcedure.__call__(
                        procspec,
                        dataset=dataset_arg,
                        return_type='generator'),
                    _func_class, action_summary,
                    on_failure, incomplete_results,
                    result_renderer, result_xfm, result_filter,
                    **_kwargs):
                yield r

    # result summary before a potential exception
    if result_renderer == 'default' and action_summary and \
            sum(sum(s.values()) for s in action_summary.values()) > 1:
        # give a summary in default mode, when there was more than one
        # action performed
        ui.message("action summary:\n {}".format(
            '\n '.join('{} ({})'.format(
                act,
                ', '.join('{}: {}'.format(status, action_summary[act][status])
                          for status in sorted(action_summary[act])))
                       for act in sorted(action_summary))))

    if incomplete_results:
        raise IncompleteResultsError(
            failed=incomplete_results,
            msg="Command did not complete successfully")
def _handle_and_return_installed_items(ds, installed_items, failed_items, save):
    if save and ds is not None:
        _save_installed_datasets(ds, installed_items)
    if failed_items:
        msg = ''
        for act, l in (("succeeded", installed_items), ("failed", failed_items)):
            if not l:
                continue
            if msg:
                msg += ', and '
            msg += "%s %s" % (
                single_or_plural("dataset", "datasets", len(l),
                                 include_count=True),
                act)
            if ds:
                paths = [relpath(i.path, ds.path)
                         if hasattr(i, 'path')
                         else i if not i.startswith(ds.path) else relpath(i, ds.path)
                         for i in l]
            else:
                paths = l
            msg += " (%s)" % (", ".join(map(str, paths)))
        msg += ' to install'

        # we were asked for multiple installations
        if installed_items or len(failed_items) > 1:
            raise IncompleteResultsError(
                results=installed_items, failed=failed_items, msg=msg)
        else:
            raise InstallFailedError(msg=msg)

    return installed_items[0] \
        if len(installed_items) == 1 else installed_items
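# A minimal usage sketch (not part of the snippet above) of how a caller could
# react to the two error types raised by _handle_and_return_installed_items.
# Assumptions: both exceptions are importable from datalad.support.exceptions,
# IncompleteResultsError exposes `results`/`failed` attributes mirroring the
# constructor keywords used above, and `install_with_report` is a hypothetical
# helper name introduced only for illustration.
from datalad.support.exceptions import IncompleteResultsError, InstallFailedError


def install_with_report(install_func, *args, **kwargs):
    """Call an install-like function and report partial failures."""
    try:
        return install_func(*args, **kwargs)
    except IncompleteResultsError as e:
        # some items were installed, some were not -- keep what we got
        print("partial success: %s" % e)
        print("failed items: %s" % (e.failed,))   # assumed attribute
        return e.results or []                    # assumed attribute
    except InstallFailedError as e:
        # nothing was installed at all
        print("installation failed: %s" % e)
        return []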
def generator_func(*_args, **_kwargs):
    # flag whether to raise an exception
    incomplete_results = []
    # track what actions were performed and how many times
    action_summary = {}

    # TODO: needs replacement, the plugin mechanism is gone
    #for pluginspec in run_before or []:
    #    lgr.debug('Running pre-proc plugin %s', pluginspec)
    #    for r in _process_results(
    #            Plugin.__call__(
    #                pluginspec,
    #                dataset=allkwargs.get('dataset', None),
    #                return_type='generator'),
    #            _func_class, action_summary,
    #            on_failure, incomplete_results,
    #            result_renderer, result_xfm, result_filter,
    #            **_kwargs):
    #        yield r

    # process main results
    for r in _process_results(
            wrapped(*_args, **_kwargs),
            _func_class, action_summary,
            on_failure, incomplete_results,
            result_renderer, result_xfm, _result_filter,
            **_kwargs):
        yield r

    # TODO: needs replacement, the plugin mechanism is gone
    #for pluginspec in run_after or []:
    #    lgr.debug('Running post-proc plugin %s', pluginspec)
    #    for r in _process_results(
    #            Plugin.__call__(
    #                pluginspec,
    #                dataset=allkwargs.get('dataset', None),
    #                return_type='generator'),
    #            _func_class, action_summary,
    #            on_failure, incomplete_results,
    #            result_renderer, result_xfm, result_filter,
    #            **_kwargs):
    #        yield r

    # result summary before a potential exception
    if result_renderer == 'default' and action_summary and \
            sum(sum(s.values()) for s in action_summary.values()) > 1:
        # give a summary in default mode, when there was more than one
        # action performed
        ui.message("action summary:\n {}".format(
            '\n '.join('{} ({})'.format(
                act,
                ', '.join('{}: {}'.format(status, action_summary[act][status])
                          for status in sorted(action_summary[act])))
                       for act in sorted(action_summary))))

    if incomplete_results:
        raise IncompleteResultsError(
            failed=incomplete_results,
            msg="Command did not complete successfully")
def try_data_download(file_fetch_list, test_data_dir):
    global lock
    dl_dset = datalad.Dataset(str(test_data_dir))
    attempt_count = 0
    lock.acquire()
    while attempt_count < 2:
        try:
            # fetch the data in a separate process so that each attempt can be
            # timed out, to deal with unpredictable stalls
            process_for_fetching_data = Process(
                target=dl_dset.get,
                kwargs={"path": [str(p) for p in file_fetch_list]})
            process_for_fetching_data.start()
            process_for_fetching_data.join(timeout=30)
            if process_for_fetching_data.is_alive():
                # terminate the process
                process_for_fetching_data.terminate()
                raise IncompleteResultsError(
                    msg=f"Data fetching timed out for {file_fetch_list}")
            elif process_for_fetching_data.exitcode != 0:
                raise ValueError(f"Data fetching failed for {file_fetch_list}")
            else:
                lock.release()
                return
        except (IncompleteResultsError, CommandError):
            # try another loop iteration
            attempt_count += 1
            # make sure the datalad repo wasn't updated to git-annex repository
            # version 8; not sure why this is happening
            git_config_file = Path(test_data_dir) / ".git" / "config"
            git_config_file.write_text(
                git_config_file.read_text().replace("version = 8", "version = 7"))
            continue

    # all datalad download attempts failed
    lock.release()
    pytest.exit(
        f"Datalad download failed {attempt_count} times, "
        "you may not be connected to the internet")
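# The timeout handling in try_data_download relies on a generic pattern: run
# the blocking call in a child process and terminate it if it does not finish
# in time. A standalone sketch of just that pattern, using only the standard
# library; `run_with_timeout` is an illustrative name, not part of the test
# suite above.
from multiprocessing import Process


def run_with_timeout(func, timeout, *args, **kwargs):
    """Run func(*args, **kwargs) in a child process; True if it finished in time."""
    worker = Process(target=func, args=args, kwargs=kwargs)
    worker.start()
    worker.join(timeout=timeout)
    if worker.is_alive():
        # still running after the deadline -- kill it and reap the process
        worker.terminate()
        worker.join()
        return False
    # exitcode 0 means the callable returned without raising
    return worker.exitcode == 0


if __name__ == "__main__":
    import time
    assert run_with_timeout(time.sleep, 2, 0.5) is True   # finishes in time
    assert run_with_timeout(time.sleep, 0.5, 5) is False  # gets terminated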
def generator_func(*_args, **_kwargs):
    # flag whether to raise an exception
    incomplete_results = []
    # track what actions were performed and how many times
    action_summary = {}

    # if a custom summary is to be provided, collect the results
    # of the command execution
    results = []
    do_custom_result_summary = result_renderer in ('tailored', 'default') \
        and hasattr(wrapped_class, 'custom_result_summary_renderer')
    pass_summary = do_custom_result_summary and \
        getattr(wrapped_class,
                'custom_result_summary_renderer_pass_summary', None)

    # process main results
    for r in _process_results(
            # execution
            wrapped(*_args, **_kwargs),
            wrapped_class,
            common_params['on_failure'],
            # bookkeeping
            action_summary,
            incomplete_results,
            # communication
            result_renderer,
            result_log_level,
            # let renderers get to see how a command was called
            allkwargs):
        for hook, spec in hooks.items():
            # run the hooks before we yield the result
            # this ensures that they are executed before
            # a potentially wrapping command gets to act
            # on them
            if match_jsonhook2result(hook, r, spec['match']):
                lgr.debug('Result %s matches hook %s', r, hook)
                # a hook is also a command that yields results,
                # so yield them outside too
                # users need to pay attention to avoid infinite
                # loops, i.e. when a hook yields a result that
                # triggers that same hook again
                for hr in run_jsonhook(hook, spec, r, dataset_arg):
                    # apply the same logic as for the main results, otherwise
                    # any filters would only tackle the primary results
                    # and a mixture of return values could happen
                    if not keep_result(hr, result_filter, **allkwargs):
                        continue
                    hr = xfm_result(hr, result_xfm)
                    # rationale for conditional is a few lines down
                    if hr:
                        yield hr
        if not keep_result(r, result_filter, **allkwargs):
            continue
        r = xfm_result(r, result_xfm)
        # in case the result_xfm decided to not give us anything
        # exclude it from the results. There is no particular reason
        # to do so other than that it was established behavior when
        # this comment was written. This will not affect any real
        # result record
        if r:
            yield r

        # collect if summary is desired
        if do_custom_result_summary:
            results.append(r)

    # result summary before a potential exception
    # custom first
    if do_custom_result_summary:
        if pass_summary:
            summary_args = (results, action_summary)
        else:
            summary_args = (results,)
        wrapped_class.custom_result_summary_renderer(*summary_args)
    elif result_renderer == 'default' and action_summary and \
            sum(sum(s.values()) for s in action_summary.values()) > 1:
        # give a summary in default mode, when there was more than one
        # action performed
        render_action_summary(action_summary)

    if incomplete_results:
        raise IncompleteResultsError(
            failed=incomplete_results,
            msg="Command did not complete successfully")
def __call__(
        path,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        reckless=False,
        git_opts=None,
        annex_opts=None,
        annex_get_opts=None,
        jobs=None,
        verbose=False,
        # internal -- instead of returning 'get'ed items, return final
        # content_by_ds, unavailable_paths. To be used by the call from
        # Install.__call__ and done so to avoid creating another reusable
        # function which would need to duplicate all this heavy list of
        # kwargs
        _return_datasets=False):

    # IMPLEMENTATION CONCEPT:
    #
    # 1. Turn all input paths into absolute paths
    # 2. Sort the world into existing handles and the rest
    # 3. Try to locate missing handles (obtain subdatasets along the way)
    # 4. Expand into subdatasets with recursion enabled (potentially
    #    obtain even more subdatasets)
    # 5. Shoot info of which handles to get in each subdataset to
    #    git-annex, once at the very end

    # TODO: consider allowing an empty `path` argument, as with other
    # commands, to indicate CWD
    resolved_paths, dataset_path = get_normalized_path_arguments(
        path, dataset, default=None)
    if not resolved_paths:
        raise InsufficientArgumentsError(
            "`get` needs at least one path as argument")

    # sort paths into the respective datasets
    dir_lookup = {}
    content_by_ds, unavailable_paths, nondataset_paths = \
        get_paths_by_dataset(resolved_paths,
                             recursive=recursive,
                             recursion_limit=recursion_limit,
                             dir_lookup=dir_lookup)
    lgr.debug(
        "Found %i existing dataset(s) to get content in "
        "and %d unavailable paths",
        len(content_by_ds), len(unavailable_paths))
    # IMPORTANT NOTE re `content_by_ds`
    # each key is a subdataset that we need to get something in
    # if the value[0] is the subdataset's path, we want all of it
    # if the value[0] == curdir, we just installed it as part of
    # resolving file handles and we did not say anything but "give
    # me the dataset handle"

    # explore the unknown
    for path in sorted(unavailable_paths):
        # how close can we get?
        dspath = GitRepo.get_toppath(path)
        if dspath is None:
            # nothing we can do for this path
            continue
        ds = Dataset(dspath)
        # must always yield a dataset -- we sorted out the ones outside
        # any dataset at the very top
        assert ds.is_installed()
        # now actually obtain whatever is necessary to get to this path
        containing_ds = install_necessary_subdatasets(ds, path, reckless)
        if containing_ds.path != ds.path:
            lgr.debug(
                "Installed %s to fulfill request for content for path %s",
                containing_ds, path)
            # mark resulting dataset as auto-installed
        if containing_ds.path == path:
            # we had to get the entire dataset, not something within
            # mark that it just appeared
            content_by_ds[path] = [curdir]
        else:
            # we need to get content within
            content_by_ds[path] = [path]

    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist,
        # unless we do not want recursion into not-yet-installed datasets
        for subdspath in sorted(content_by_ds.keys()):
            for content_path in content_by_ds[subdspath]:
                if not isdir(content_path):
                    # a non-directory cannot have content underneath
                    continue
                subds = Dataset(subdspath)
                lgr.info(
                    "Obtaining %s %s recursively",
                    subds,
                    ("underneath %s" % content_path
                     if subds.path != content_path
                     else ""))
                cbysubds = _recursive_install_subds_underneath(
                    subds,
                    # `content_path` was explicitly given as input;
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    # protect against magic marker misinterpretation
                    # only relevant for _get, hence replace here
                    start=content_path if content_path != curdir else None)
                # gets file content for all freshly installed subdatasets
                content_by_ds.update(cbysubds)

    ## we have now done everything we could to obtain whatever subdataset
    ## to get something on the file system for previously unavailable paths
    ## check and sort one last time
    content_by_ds, unavailable_paths, nondataset_paths2 = \
        get_paths_by_dataset(unavailable_paths,
                             recursive=recursive,
                             recursion_limit=recursion_limit,
                             out=content_by_ds,
                             dir_lookup=dir_lookup)
    nondataset_paths.extend(nondataset_paths2)
    if nondataset_paths:
        lgr.warning("ignored paths that do not belong to any dataset: %s",
                    nondataset_paths)
    if unavailable_paths:
        lgr.warning('ignored non-existing paths: %s', unavailable_paths)

    # hand over to git-annex
    results = list(chain.from_iterable(
        _get(content_by_ds,
             refpath=dataset_path,
             source=source,
             jobs=jobs,
             get_data=get_data)))
    # ??? should we, in the _return_datasets case, just return both
    # content_by_ds and unavailable_paths, so we provide output that is
    # consistent across runs, and then raise a similar
    # IncompleteResultsError outside?
    if unavailable_paths:  # and likely other error flags
        if _return_datasets:
            results = sorted(set(content_by_ds).difference(unavailable_paths))
        raise IncompleteResultsError(results, failed=unavailable_paths)
    else:
        return sorted(content_by_ds) if _return_datasets else results
def __call__(
        path=None,
        dataset=None,
        to=None,
        since=None,
        skip_failing=False,
        recursive=False,
        recursion_limit=None,
        git_opts=None,
        annex_opts=None,
        annex_copy_opts=None):

    # shortcut
    ds = require_dataset(dataset, check_installed=True, purpose='publication')
    assert(ds.repo is not None)

    path = assure_list(path)

    # figure out what to publish from which (sub)dataset:
    publish_this = False   # whether to publish `ds`
    publish_files = []     # which files to publish by `ds`

    expl_subs = set()      # subdatasets to publish explicitly
    publish_subs = dict()  # collect what to publish from subdatasets

    if not path:
        # publish `ds` itself, if nothing else is given:
        publish_this = True
    else:
        for p in path:
            subdatasets = ds.get_subdatasets()
            if p in subdatasets:
                # p is a subdataset that needs to be published itself
                expl_subs.add(p)
            else:
                try:
                    d = ds.get_containing_subdataset(p)
                except ValueError as e:
                    # p is not in ds => skip:
                    lgr.warning(str(e) + " - Skipped.")
                    continue
                if d == ds:
                    # p needs to be published from ds
                    publish_this = True
                    publish_files.append(p)
                else:
                    # p belongs to subds `d`
                    if d.path not in publish_subs:
                        publish_subs[d.path] = dict()
                    if 'files' not in publish_subs[d.path]:
                        publish_subs[d.path]['files'] = list()
                    publish_subs[d.path]['dataset'] = d
                    publish_subs[d.path]['files'].append(p)

    if publish_this:
        # Note: we need an upstream remote, if there's none given. We could
        # wait for git push to complain, but we need to explicitly figure it
        # out for pushing the annex branch anyway, so we might as well fail
        # right here.
        track_remote, track_branch = None, None

        # keep `to` in case it's None for passing to recursive calls:
        dest_resolved = to
        if to is None:
            # TODO: If possible, avoid resolution herein and rely on git
            # (or GitRepo respectively), meaning: just pass `None`
            # ATM conflicts with _get_changed_datasets => figure it out
            track_remote, track_branch = ds.repo.get_tracking_branch()
            if track_remote:
                dest_resolved = track_remote
            else:
                # we have no remote given and no upstream => fail
                raise InsufficientArgumentsError(
                    "No known default target for publication and none given.")

    subds_prev_hexsha = {}
    if recursive:
        all_subdatasets = ds.get_subdatasets(fulfilled=True)

        # TODO: dest_resolved => to?
        # Note: This is a bug anyway, since in the actual recursive call `to`
        # is passed in order to be resolved by the subdatasets themselves
        # (might be None), but when considering which subdatasets are to be
        # published, we assume `dest_resolved` is the same for all of them.
        # ==> TODO: RF to consider `since` only for the current ds and then
        # go on recursively.
        subds_to_consider = \
            Publish._get_changed_datasets(
                ds.repo, all_subdatasets, dest_resolved, since=since) \
            if publish_this \
            else all_subdatasets
        # if we were returned a dict, we got subds_prev_hexsha
        if isinstance(subds_to_consider, dict):
            subds_prev_hexsha = subds_to_consider
        for subds_path in subds_to_consider:
            if path and '.' in path:
                # we explicitly pass '.' to subdatasets in case of
                # `recursive`. Therefore these datasets are going into
                # `publish_subs`, instead of `expl_subs`:
                sub = Dataset(opj(ds.path, subds_path))
                publish_subs[sub.path] = dict()
                publish_subs[sub.path]['dataset'] = sub
                publish_subs[sub.path]['files'] = ['.']
            else:
                # we can recursively publish only, if there actually
                # is something
                expl_subs.add(subds_path)

    published, skipped = [], []

    for dspath in sorted(expl_subs):
        # these datasets need to be pushed regardless of additional paths
        # pointing inside them
        # due to API, this may not happen when calling publish with paths,
        # therefore force it.
        # TODO: There might be a better solution to avoid two calls of
        # publish() on the very same Dataset instance
        ds_ = Dataset(opj(ds.path, dspath))
        try:
            # we could take a local diff for the subdataset, but maybe we
            # could just rely on the internal logic within the subdataset
            # to figure out what it needs to publish.
            # But we need to pass an empty string inside as is
            pkw = {}
            if since == '':
                pkw['since'] = since
            else:
                # pass previous state for that submodule if known
                pkw['since'] = subds_prev_hexsha.get(dspath, None)
            published_, skipped_ = ds_.publish(to=to, recursive=recursive, **pkw)
            published += published_
            skipped += skipped_
        except Exception as exc:
            if not skip_failing:
                raise
            lgr.warning("Skipped %s: %s", ds.path, exc_str(exc))
            skipped += [ds_]

    for d in publish_subs:
        # recurse into subdatasets

        # TODO: need to fetch. see above
        publish_subs[d]['dataset'].repo.fetch(remote=to)

        published_, skipped_ = publish_subs[d]['dataset'].publish(
            to=to,
            path=publish_subs[d]['files'],
            recursive=recursive,
            annex_copy_opts=annex_copy_opts)
        published += published_
        skipped += skipped_

    if publish_this:
        # is `to` an already known remote?
        if dest_resolved not in ds.repo.get_remotes():
            # unknown remote
            raise ValueError("No sibling '{0}' found for {1}."
                             "".format(dest_resolved, ds))

        # in order to be able to use git's config to determine what to push,
        # we need to annex merge first. Otherwise a git push might be
        # rejected if involving all matching branches, for example.
        # Once at it, also push the annex branch right here.
        # Q: Do we need to respect annex-ignore here? Does it make sense to
        # publish to a remote without pushing the annex branch
        # (if there is any)?
        if isinstance(ds.repo, AnnexRepo):
            ds.repo.fetch(remote=dest_resolved)
            ds.repo.merge_annex(dest_resolved)
            _log_push_info(ds.repo.push(remote=dest_resolved,
                                        refspec="git-annex:git-annex"))

        # upstream branch needed for update (merge) and subsequent push,
        # in case there is none.
        # no tracking branch yet?
        set_upstream = track_branch is None

        # publishing of `dest_resolved` might depend on publishing other
        # remote(s) first:
        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(dest_resolved)
        for d in ds.config.get(depvar, []):
            lgr.info("Dependency detected: '%s'", d)
            # Note: Additional info on publishing the dependency comes from
            # within `ds.publish`.
            ds.publish(path=path,
                       to=d,
                       since=since,
                       skip_failing=skip_failing,
                       recursive=recursive,
                       recursion_limit=recursion_limit,
                       git_opts=git_opts,
                       annex_opts=annex_opts,
                       annex_copy_opts=annex_copy_opts)

        lgr.info("Publishing {0} to {1}".format(ds, dest_resolved))

        # we now know where to push to:
        # TODO: what to push? default: git push --mirror if nothing configured?
        # consider also: --follow-tags, --tags, --atomic
        # Note: git's push.default is 'matching', which possibly doesn't
        # work for first-time publication (a branch that doesn't exist on
        # the remote yet).
        # But if we want to respect remote.*.push entries, etc. we need to
        # not pass a specific refspec (like the active branch) to `git push`
        # by default.
        _log_push_info(ds.repo.push(remote=dest_resolved,
                                    refspec=ds.repo.get_active_branch(),
                                    set_upstream=set_upstream))

        published.append(ds)

        if publish_files or annex_copy_opts:
            if not isinstance(ds.repo, AnnexRepo):
                # incomplete, since `git push` was done already:
                raise IncompleteResultsError(
                    (published, skipped), failed=publish_files,
                    msg="Cannot publish content of something that is not "
                        "an annex. ({0})".format(ds))
            if ds.config.get('remote.{}.annex-ignore'.format(dest_resolved),
                             False):
                # Q: Do we need a --force option here? annex allows to
                # ignore the ignore setting
                raise IncompleteResultsError(
                    (published, skipped), failed=publish_files,
                    msg="Sibling '{0}' of {1} is configured to be ignored "
                        "by annex. No content was published.".format(
                            dest_resolved, ds))

            lgr.info("Publishing data of dataset {0} ...".format(ds))
            published += ds.repo.copy_to(files=publish_files,
                                         remote=dest_resolved,
                                         options=annex_copy_opts)

    return published, skipped
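# All of the generator wrappers above share one idiom: keep yielding result
# records, remember the failed ones, and only raise IncompleteResultsError
# once the generator is exhausted. A condensed, self-contained sketch of that
# idiom; the record dicts, status values, `on_failure` handling, and the
# `yield_then_raise` helper are illustrative, not taken from any of the
# functions above.
from datalad.support.exceptions import IncompleteResultsError


def yield_then_raise(results, on_failure='continue'):
    """Yield result records; raise at the end if any record reported failure."""
    incomplete = []
    for r in results:
        if r.get('status') in ('impossible', 'error'):
            incomplete.append(r)
            if on_failure == 'stop':
                break
        yield r
    if incomplete:
        raise IncompleteResultsError(
            failed=incomplete,
            msg="Command did not complete successfully")


# usage: consume everything first, then handle the aggregate failure
records = [{'action': 'get', 'status': 'ok'},
           {'action': 'get', 'status': 'error'}]
consumed = []
try:
    for rec in yield_then_raise(records):
        consumed.append(rec)
except IncompleteResultsError as e:
    print("failed records: %s" % (e.failed,))  # assumed attribute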