Example #1
        def generator_func(*_args, **_kwargs):
            # flag whether to raise an exception
            incomplete_results = []
            # track which actions were performed, and how many times
            action_summary = {}

            if proc_pre and cmdline_name != 'run-procedure':
                from datalad.interface.run_procedure import RunProcedure
                for procspec in proc_pre:
                    lgr.debug('Running configured pre-procedure %s', procspec)
                    for r in _process_results(
                            RunProcedure.__call__(
                                procspec,
                                dataset=dataset_arg,
                                return_type='generator'),
                            _func_class, action_summary,
                            on_failure, incomplete_results,
                            result_renderer, result_xfm, result_filter,
                            **_kwargs):
                        yield r

            # process main results
            for r in _process_results(
                    wrapped(*_args, **_kwargs),
                    _func_class, action_summary,
                    on_failure, incomplete_results,
                    result_renderer, result_xfm, _result_filter, **_kwargs):
                yield r

            if proc_post and cmdline_name != 'run-procedure':
                from datalad.interface.run_procedure import RunProcedure
                for procspec in proc_post:
                    lgr.debug('Running configured post-procedure %s', procspec)
                    for r in _process_results(
                            RunProcedure.__call__(
                                procspec,
                                dataset=dataset_arg,
                                return_type='generator'),
                            _func_class, action_summary,
                            on_failure, incomplete_results,
                            result_renderer, result_xfm, result_filter,
                            **_kwargs):
                        yield r

            # result summary before a potential exception
            if result_renderer == 'default' and action_summary and \
                    sum(sum(s.values()) for s in action_summary.values()) > 1:
                # give a summary in default mode, when there was more than one
                # action performed
                ui.message("action summary:\n  {}".format(
                    '\n  '.join('{} ({})'.format(
                        act,
                        ', '.join('{}: {}'.format(status, action_summary[act][status])
                                  for status in sorted(action_summary[act])))
                                for act in sorted(action_summary))))

            if incomplete_results:
                raise IncompleteResultsError(
                    failed=incomplete_results,
                    msg="Command did not complete successfully")
Example #2
    def _handle_and_return_installed_items(ds, installed_items, failed_items,
                                           save):
        if save and ds is not None:
            _save_installed_datasets(ds, installed_items)
        if failed_items:
            msg = ''
            for act, l in (("succeeded", installed_items), ("failed",
                                                            failed_items)):
                if not l:
                    continue
                if msg:
                    msg += ', and '
                msg += "%s %s" % (single_or_plural(
                    "dataset", "datasets", len(l), include_count=True), act)
                if ds:
                    # render paths relative to `ds`: dataset instances via
                    # their .path attribute, plain paths only if they are
                    # located underneath ds.path
                    paths = [
                        relpath(i.path, ds.path) if hasattr(i, 'path')
                        else relpath(i, ds.path) if i.startswith(ds.path)
                        else i
                        for i in l
                    ]
                else:
                    paths = l
                msg += " (%s)" % (", ".join(map(str, paths)))
            msg += ' to install'

            # we were asked for multiple installations
            if installed_items or len(failed_items) > 1:
                raise IncompleteResultsError(results=installed_items,
                                             failed=failed_items,
                                             msg=msg)
            else:
                raise InstallFailedError(msg=msg)

        return installed_items[0] \
            if len(installed_items) == 1 else installed_items
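The message assembly above interleaves counts, pluralization, and path lists. A rough standalone sketch of the same idea, with a local stand-in for datalad's single_or_plural helper (names and exact wording here are illustrative, not the library's):

# Rough sketch of the message assembly above, with a local stand-in for
# datalad's single_or_plural helper (illustrative only).

def single_or_plural(singular, plural, count, include_count=False):
    word = singular if count == 1 else plural
    return "%d %s" % (count, word) if include_count else word


def summarize(installed, failed):
    parts = []
    for act, items in (("succeeded", installed), ("failed", failed)):
        if items:
            parts.append("%s %s (%s)" % (
                single_or_plural("dataset", "datasets", len(items),
                                 include_count=True),
                act,
                ", ".join(map(str, items))))
    return ", and ".join(parts) + " to install"


print(summarize(["ds1"], ["ds2", "ds3"]))
# -> 1 dataset succeeded (ds1), and 2 datasets failed (ds2, ds3) to install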
Example #3
        def generator_func(*_args, **_kwargs):
            # flag whether to raise an exception
            incomplete_results = []
            # track which actions were performed, and how many times
            action_summary = {}

            # TODO: needs a replacement; the plugin mechanism is gone
            #for pluginspec in run_before or []:
            #    lgr.debug('Running pre-proc plugin %s', pluginspec)
            #    for r in _process_results(
            #            Plugin.__call__(
            #                pluginspec,
            #                dataset=allkwargs.get('dataset', None),
            #                return_type='generator'),
            #            _func_class, action_summary,
            #            on_failure, incomplete_results,
            #            result_renderer, result_xfm, result_filter,
            #            **_kwargs):
            #        yield r

            # process main results
            for r in _process_results(
                    wrapped(*_args, **_kwargs),
                    _func_class, action_summary,
                    on_failure, incomplete_results,
                    result_renderer, result_xfm, _result_filter, **_kwargs):
                yield r

            # TODO: needs a replacement; the plugin mechanism is gone
            #for pluginspec in run_after or []:
            #    lgr.debug('Running post-proc plugin %s', pluginspec)
            #    for r in _process_results(
            #            Plugin.__call__(
            #                pluginspec,
            #                dataset=allkwargs.get('dataset', None),
            #                return_type='generator'),
            #            _func_class, action_summary,
            #            on_failure, incomplete_results,
            #            result_renderer, result_xfm, result_filter,
            #            **_kwargs):
            #        yield r

            # result summary before a potential exception
            if result_renderer == 'default' and action_summary and \
                    sum(sum(s.values()) for s in action_summary.values()) > 1:
                # give a summary in default mode, when there was more than one
                # action performed
                ui.message("action summary:\n  {}".format(
                    '\n  '.join('{} ({})'.format(
                        act,
                        ', '.join('{}: {}'.format(status, action_summary[act][status])
                                  for status in sorted(action_summary[act])))
                                for act in sorted(action_summary))))

            if incomplete_results:
                raise IncompleteResultsError(
                    failed=incomplete_results,
                    msg="Command did not complete successfully")
Example #4
def try_data_download(file_fetch_list, test_data_dir):
    global lock
    dl_dset = datalad.Dataset(str(test_data_dir))
    attempt_count = 0
    lock.acquire()
    while attempt_count < 2:
        try:
            # Fetching the data
            process_for_fetching_data = Process(
                target=dl_dset.get,
                kwargs={"path": [str(p) for p in file_fetch_list]})

            # attempts are timed out to deal with unpredictable stalls.
            process_for_fetching_data.start()
            process_for_fetching_data.join(timeout=30)
            if process_for_fetching_data.is_alive():
                # terminate the process.
                process_for_fetching_data.terminate()
                raise IncompleteResultsError(
                    f"Data fetching timed out for {file_fetch_list}")
            elif process_for_fetching_data.exitcode != 0:
                raise ValueError(f"Data fetching failed for {file_fetch_list}")
            else:
                lock.release()
                return
        except (IncompleteResultsError, CommandError) as e:
            # Try another loop
            attempt_count += 1
            # make sure the datalad repo wasn't updated to git-annex
            # repository version 8 (not sure why this happens)
            git_config_file = Path(test_data_dir) / ".git" / "config"
            git_config_file.write_text(git_config_file.read_text().replace(
                "version = 8", "version = 7"))
            continue

    # all datalad download attempts failed
    pytest.exit(
        "Datalad download failed after 2 attempts; "
        "you may not be connected to the internet"
    )
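The core of this test helper is the timeout-guarded retry loop around a child process. A generic sketch of that pattern, independent of datalad (slow_task, the attempt count, and the timeout are placeholders):

# Generic sketch of the timeout-guarded retry loop above, independent of
# datalad.  `slow_task`, attempts and timeout are placeholders.

from multiprocessing import Process
import time


def slow_task():
    time.sleep(1)  # stands in for dl_dset.get(...)


def run_with_retries(target, attempts=2, timeout=30):
    for _ in range(attempts):
        p = Process(target=target)
        p.start()
        p.join(timeout=timeout)
        if p.is_alive():
            # this attempt stalled; kill the worker and try again
            p.terminate()
            p.join()
            continue
        if p.exitcode == 0:
            return True
    return False


if __name__ == '__main__':
    ok = run_with_retries(slow_task, attempts=2, timeout=5)
    print("download succeeded" if ok else "all attempts failed")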
Example #5
        def generator_func(*_args, **_kwargs):
            # flag whether to raise an exception
            incomplete_results = []
            # track which actions were performed, and how many times
            action_summary = {}

            # if a custom summary is to be provided, collect the results
            # of the command execution
            results = []
            do_custom_result_summary = result_renderer in ('tailored', 'default') \
                and hasattr(wrapped_class, 'custom_result_summary_renderer')
            pass_summary = do_custom_result_summary and \
                getattr(wrapped_class,
                        'custom_result_summary_renderer_pass_summary', None)

            # process main results
            for r in _process_results(
                    # execution
                    wrapped(*_args, **_kwargs),
                    wrapped_class,
                    common_params['on_failure'],
                    # bookkeeping
                    action_summary,
                    incomplete_results,
                    # communication
                    result_renderer,
                    result_log_level,
                    # let renderers get to see how a command was called
                    allkwargs):
                for hook, spec in hooks.items():
                    # run the hooks before we yield the result
                    # this ensures that they are executed before
                    # a potential wrapper command gets to act
                    # on them
                    if match_jsonhook2result(hook, r, spec['match']):
                        lgr.debug('Result %s matches hook %s', r, hook)
                        # a hook is also a command that yields results
                        # so yield them outside too
                        # users need to pay attention to avoid infinite
                        # loops, i.e. when a hook yields a result that
                        # triggers that same hook again
                        for hr in run_jsonhook(hook, spec, r, dataset_arg):
                            # apply same logic as for main results, otherwise
                            # any filters would only tackle the primary results
                            # and a mixture of return values could happen
                            if not keep_result(hr, result_filter, **allkwargs):
                                continue
                            hr = xfm_result(hr, result_xfm)
                            # rationale for conditional is a few lines down
                            if hr:
                                yield hr
                if not keep_result(r, result_filter, **allkwargs):
                    continue
                r = xfm_result(r, result_xfm)
                # in case the result_xfm decided to not give us anything
                # exclude it from the results. There is no particular reason
                # to do so other than that it was established behavior when
                # this comment was written. This will not affect any real
                # result record
                if r:
                    yield r

                # collect if summary is desired
                if do_custom_result_summary:
                    results.append(r)

            # result summary before a potential exception
            # custom first
            if do_custom_result_summary:
                if pass_summary:
                    summary_args = (results, action_summary)
                else:
                    summary_args = (results, )
                wrapped_class.custom_result_summary_renderer(*summary_args)
            elif result_renderer == 'default' and action_summary and \
                    sum(sum(s.values()) for s in action_summary.values()) > 1:
                # give a summary in default mode, when there was more than one
                # action performed
                render_action_summary(action_summary)

            if incomplete_results:
                raise IncompleteResultsError(
                    failed=incomplete_results,
                    msg="Command did not complete successfully")
Example #6
    def __call__(
        path,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        reckless=False,
        git_opts=None,
        annex_opts=None,
        annex_get_opts=None,
        jobs=None,
        verbose=False,
        # internal -- instead of returning 'get'ed items, return final
        # content_by_ds, unavailable_paths.  To be used by the call from
        # Install.__call__ and done so to avoid creating another reusable
        # function which would need to duplicate all this heavy list of
        # kwargs
        _return_datasets=False):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. turn all input paths into absolute paths
        # 2. Sort the world into existing handles and the rest
        # 3. Try locate missing handles (obtain subdatasets along the way)
        # 4. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 5. Shoot the info of which handles to get in each subdataset over
        #    to git-annex, once at the very end

        # TODO: consider allowing an empty `path` argument, as with other commands,
        # to indicate CWD
        resolved_paths, dataset_path = get_normalized_path_arguments(
            path, dataset, default=None)
        if not resolved_paths:
            raise InsufficientArgumentsError(
                "`get` needs at least one path as argument")

        # sort paths into the respective datasets
        dir_lookup = {}
        content_by_ds, unavailable_paths, nondataset_paths = \
            get_paths_by_dataset(resolved_paths,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit,
                                 dir_lookup=dir_lookup)
        lgr.debug(
            "Found %i existing dataset(s) to get content in "
            "and %d unavailable paths", len(content_by_ds),
            len(unavailable_paths))
        # IMPORTANT NOTE re `content_by_ds`
        # each key is a subdataset that we need to get something in
        # if the value[0] is the subdataset's path, we want all of it
        # if the value[0] == curdir, we just installed it as part of
        # resolving file handles and we did not say anything but "give
        # me the dataset handle"

        # explore the unknown
        for path in sorted(unavailable_paths):
            # how close can we get?
            dspath = GitRepo.get_toppath(path)
            if dspath is None:
                # nothing we can do for this path
                continue
            ds = Dataset(dspath)
            # must always yield a dataset -- we sorted out the ones outside
            # any dataset at the very top
            assert ds.is_installed()
            # now actually obtain whatever is necessary to get to this path
            containing_ds = install_necessary_subdatasets(ds, path, reckless)
            if containing_ds.path != ds.path:
                lgr.debug(
                    "Installed %s to fulfill request for content for "
                    "path %s", containing_ds, path)
                # mark resulting dataset as auto-installed
                if containing_ds.path == path:
                    # we had to get the entire dataset, not something within
                    # mark that it just appeared
                    content_by_ds[path] = [curdir]
                else:
                    # we need to get content within
                    content_by_ds[path] = [path]

        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for subdspath in sorted(content_by_ds.keys()):
                for content_path in content_by_ds[subdspath]:
                    if not isdir(content_path):
                        # a non-directory cannot have content underneath
                        continue
                    subds = Dataset(subdspath)
                    lgr.info(
                        "Obtaining %s %s recursively", subds,
                        ("underneath %s" %
                         content_path if subds.path != content_path else ""))
                    cbysubds = _recursive_install_subds_underneath(
                        subds,
                        # `content_path` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        # protect against magic marker misinterpretation
                        # only relevant for _get, hence replace here
                        start=content_path if content_path != curdir else None)
                    # gets file content for all freshly installed subdatasets
                    content_by_ds.update(cbysubds)

        ## we have now done everything we could to obtain whatever subdatasets
        ## are needed to get something on the file system for previously
        ## unavailable paths; check and sort one last time
        content_by_ds, unavailable_paths, nondataset_paths2 = \
            get_paths_by_dataset(
                unavailable_paths,
                recursive=recursive,
                recursion_limit=recursion_limit,
                out=content_by_ds,
                dir_lookup=dir_lookup)

        nondataset_paths.extend(nondataset_paths2)
        if nondataset_paths:
            lgr.warning("ignored paths that do not belong to any dataset: %s",
                        nondataset_paths)

        if unavailable_paths:
            lgr.warning('ignored non-existing paths: %s', unavailable_paths)

        # hand over to git-annex
        results = list(
            chain.from_iterable(
                _get(content_by_ds,
                     refpath=dataset_path,
                     source=source,
                     jobs=jobs,
                     get_data=get_data)))
        # ??? in the _return_datasets case, should we just return both
        # content_by_ds and unavailable_paths, so the output is consistent
        # across runs, and then raise a similar IncompleteResultsError outside?
        if unavailable_paths:  # and likely other error flags
            if _return_datasets:
                results = sorted(
                    set(content_by_ds).difference(unavailable_paths))
            raise IncompleteResultsError(results, failed=unavailable_paths)
        else:
            return sorted(content_by_ds) if _return_datasets else results
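A central step above is sorting the requested paths into the datasets that contain them. A simplified, purely illustrative sketch of that idea, assuming the dataset roots are already known (this is not datalad's get_paths_by_dataset):

# Simplified sketch of "sort paths into their containing dataset": given a
# set of known dataset roots, each requested path is assigned to the deepest
# root that contains it; everything else is reported separately.

def sort_paths_by_root(paths, roots):
    by_root = {}
    unmatched = []
    for p in paths:
        containing = [r for r in roots if p == r or p.startswith(r + '/')]
        if not containing:
            unmatched.append(p)
            continue
        root = max(containing, key=len)  # the deepest containing root wins
        by_root.setdefault(root, []).append(p)
    return by_root, unmatched


roots = ['/data/ds', '/data/ds/sub']
paths = ['/data/ds/file1', '/data/ds/sub/file2', '/tmp/elsewhere']
print(sort_paths_by_root(paths, roots))
# -> ({'/data/ds': ['/data/ds/file1'], '/data/ds/sub': ['/data/ds/sub/file2']},
#     ['/tmp/elsewhere'])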
Example #7
    def __call__(
            path=None,
            dataset=None,
            to=None,
            since=None,
            skip_failing=False,
            recursive=False,
            recursion_limit=None,
            git_opts=None,
            annex_opts=None,
            annex_copy_opts=None):
        # shortcut
        ds = require_dataset(dataset, check_installed=True, purpose='publication')
        assert(ds.repo is not None)

        path = assure_list(path)

        # figure out, what to publish from what (sub)dataset:
        publish_this = False   # whether to publish `ds`
        publish_files = []     # which files to publish by `ds`

        expl_subs = set()      # subdatasets to publish explicitly
        publish_subs = dict()  # collect what to publish from subdatasets

        if not path:
            # publish `ds` itself, if nothing else is given:
            publish_this = True
        else:
            for p in path:
                subdatasets = ds.get_subdatasets()
                if p in subdatasets:
                    # p is a subdataset, that needs to be published itself
                    expl_subs.add(p)
                else:
                    try:
                        d = ds.get_containing_subdataset(p)
                    except ValueError as e:
                        # p is not in ds => skip:
                        lgr.warning(str(e) + " - Skipped.")
                        continue
                    if d == ds:
                        # p needs to be published from ds
                        publish_this = True
                        publish_files.append(p)
                    else:
                        # p belongs to subds `d`
                        if d.path not in publish_subs:
                            publish_subs[d.path] = dict()
                        if 'files' not in publish_subs[d.path]:
                            publish_subs[d.path]['files'] = list()
                        publish_subs[d.path]['dataset'] = d
                        publish_subs[d.path]['files'].append(p)

        if publish_this:
            # Note: we need an upstream remote, if there's none given. We could
            # wait for git push to complain, but we need to explicitly figure it
            # out for pushing annex branch anyway and we might as well fail
            # right here.

            track_remote, track_branch = None, None

            # keep `to` in case it's None for passing to recursive calls:
            dest_resolved = to
            if to is None:
                # TODO: If possible, avoid resolution herein and rely on git
                # (or GitRepo respectively), meaning: Just pass `None`
                # ATM conflicts with _get_changed_datasets => figure it out

                track_remote, track_branch = ds.repo.get_tracking_branch()
                if track_remote:
                    dest_resolved = track_remote
                else:
                    # we have no remote given and no upstream => fail
                    raise InsufficientArgumentsError(
                        "No known default target for "
                        "publication and none given.")

        subds_prev_hexsha = {}
        if recursive:
            all_subdatasets = ds.get_subdatasets(fulfilled=True)

            # TODO: dest_resolved => to?
            # Note: This is a bug anyway, since in actual recursive call `to` is
            # passed in order to be resolved by the subdatasets themselves
            # (might be None), but when considering what subdatasets to be
            # published, we assume `dest_resolved` is the same for all of them.

            # ==> TODO: RF to consider `since` only for the current ds and then go on
            # recursively.

            subds_to_consider = \
                Publish._get_changed_datasets(
                    ds.repo, all_subdatasets, dest_resolved, since=since) \
                if publish_this \
                else all_subdatasets
            # if we were returned a dict, we got subds_prev_hexsha
            if isinstance(subds_to_consider, dict):
                subds_prev_hexsha = subds_to_consider
            for subds_path in subds_to_consider:
                if path and '.' in path:
                    # we explicitly are passing '.' to subdatasets in case of
                    # `recursive`. Therefore these datasets are going into
                    # `publish_subs`, instead of `expl_subs`:
                    sub = Dataset(opj(ds.path, subds_path))
                    publish_subs[sub.path] = dict()
                    publish_subs[sub.path]['dataset'] = sub
                    publish_subs[sub.path]['files'] = ['.']
                else:
                    # we can only publish recursively if there actually
                    # is something
                    expl_subs.add(subds_path)

        published, skipped = [], []

        for dspath in sorted(expl_subs):
            # these datasets need to be pushed regardless of additional paths
            # pointing inside them
            # due to API, this may not happen when calling publish with paths,
            # therefore force it.
            # TODO: There might be a better solution to avoid two calls of
            # publish() on the very same Dataset instance
            ds_ = Dataset(opj(ds.path, dspath))
            try:
                # we could take a local diff for the subdataset,
                # but maybe we could just rely on the internal logic within the
                # subdataset to figure out what it needs to publish.
                # But we need to pass an empty string through as is
                pkw = {}
                if since == '':
                    pkw['since'] = since
                else:
                    # pass previous state for that submodule if known
                    pkw['since'] = subds_prev_hexsha.get(dspath, None)
                published_, skipped_ = ds_.publish(to=to, recursive=recursive, **pkw)
                published += published_
                skipped += skipped_
            except Exception as exc:
                if not skip_failing:
                    raise
                lgr.warning("Skipped %s: %s", ds.path, exc_str(exc))
                skipped += [ds_]

        for d in publish_subs:
            # recurse into subdatasets

            # TODO: need to fetch. see above
            publish_subs[d]['dataset'].repo.fetch(remote=to)

            published_, skipped_ = publish_subs[d]['dataset'].publish(
                to=to,
                path=publish_subs[d]['files'],
                recursive=recursive,
                annex_copy_opts=annex_copy_opts)
            published += published_
            skipped += skipped_

        if publish_this:

            # is `to` an already known remote?
            if dest_resolved not in ds.repo.get_remotes():
                # unknown remote
                raise ValueError("No sibling '{0}' found for {1}."
                                 "".format(dest_resolved, ds))

            # in order to be able to use git's config to determine what to push,
            # we need to annex merge first. Otherwise a git push might be
            # rejected if involving all matching branches for example.
            # Once at it, also push the annex branch right here.

            # Q: Do we need to respect annex-ignore here? Does it make sense to
            # publish to a remote without pushing the annex branch
            # (if there is any)?
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.fetch(remote=dest_resolved)
                ds.repo.merge_annex(dest_resolved)
                _log_push_info(ds.repo.push(remote=dest_resolved,
                                            refspec="git-annex:git-annex"))

            # an upstream branch is needed for update (merge) and the
            # subsequent push, in case there is none yet.
            # no tracking branch yet?
            set_upstream = track_branch is None

            # publishing of `dest_resolved` might depend on publishing other
            # remote(s) first:
            # define config var name for potential publication dependencies
            depvar = 'remote.{}.datalad-publish-depends'.format(dest_resolved)
            for d in ds.config.get(depvar, []):
                lgr.info("Dependency detected: '%s'" % d)
                # Note: Additional info on publishing the dep. comes from within
                # `ds.publish`.
                ds.publish(path=path,
                           to=d,
                           since=since,
                           skip_failing=skip_failing,
                           recursive=recursive,
                           recursion_limit=recursion_limit,
                           git_opts=git_opts,
                           annex_opts=annex_opts,
                           annex_copy_opts=annex_copy_opts)

            lgr.info("Publishing {0} to {1}".format(ds, dest_resolved))

            # we now know where to push to:
            # TODO: what to push? default: git push --mirror if nothing configured?
            # consider also: --follow-tags, --tags, --atomic

            # Note: git's push.default is 'matching', which possibly doesn't
            # work for first-time publication (a branch that doesn't exist on
            # the remote yet).
            # But if we want to respect remote.*.push entries, etc. we need to
            # not pass a specific refspec (like the active branch) to
            # `git push` by default.

            _log_push_info(ds.repo.push(remote=dest_resolved,
                                        refspec=ds.repo.get_active_branch(),
                                        set_upstream=set_upstream))

            published.append(ds)

            if publish_files or annex_copy_opts:
                if not isinstance(ds.repo, AnnexRepo):
                    # incomplete, since `git push` was done already:
                    raise IncompleteResultsError(
                        (published, skipped),
                        failed=publish_files,
                        msg="Cannot publish content of something, that is not "
                            "an annex. ({0})".format(ds))
                if ds.config.get(
                        'remote.{}.annex-ignore'.format(dest_resolved),
                        False):
                    # Q: Do we need a --force option here? annex allows to
                    # ignore the ignore setting
                    raise IncompleteResultsError(
                        (published, skipped),
                        failed=publish_files,
                        msg="Sibling '{0}' of {1} is configured to be ignored "
                            "by annex. No content was published.".format(
                                dest_resolved, ds))

                lgr.info("Publishing data of dataset {0} ...".format(ds))
                published += ds.repo.copy_to(files=publish_files,
                                             remote=dest_resolved,
                                             options=annex_copy_opts)

        return published, skipped
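When no target is given, the code falls back to the tracking remote of the current branch and fails if there is none. A small sketch of that decision in isolation (get_tracking_remote is a hypothetical callable standing in for ds.repo.get_tracking_branch(); a plain ValueError stands in for InsufficientArgumentsError):

# Sketch of the destination resolution above: use the explicit `to` if given,
# otherwise fall back to the branch's tracking remote, and fail if neither is
# available.  Names here are illustrative stand-ins.

def resolve_publish_target(to, get_tracking_remote):
    if to is not None:
        return to
    remote = get_tracking_remote()
    if remote:
        return remote
    raise ValueError(
        "No known default target for publication and none given.")


print(resolve_publish_target("github", lambda: None))   # explicit target wins
print(resolve_publish_target(None, lambda: "origin"))   # falls back to remote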