Example #1
def test_get_content_info_dotgit(path=None):
    ds = Dataset(path).create()
    # Files in .git/ won't be reported, though this takes a kludge on our side
    # before Git 2.25.
    assert_false(ds.repo.get_content_info(paths=[op.join(".git", "config")]))
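
For contrast, a hedged sketch of the positive case, assuming the same test helpers as above and that create() commits .datalad/config: content tracked outside of .git/ is reported by get_content_info().

def test_get_content_info_tracked(path=None):
    ds = Dataset(path).create()
    # .datalad/config is committed by create() (assumption), so a record
    # should be reported for it
    info = ds.repo.get_content_info(paths=[op.join(".datalad", "config")])
    assert_true(info)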
Example #2
def test_save_obscure_name(path):
    ds = Dataset(path).create(force=True)
    fname = OBSCURE_FILENAME
    # Just check that we don't fail with a unicode error.
    with swallow_outputs():
        ds.save(path=fname, result_renderer="default")
Example #3
    def eval_func(wrapped, instance, args, kwargs):
        lgr.log(2, "Entered eval_func for %s", func)
        # for result filters
        # we need to produce a dict with argname/argvalue pairs for all args
        # incl. defaults and args given as positionals
        allkwargs = get_allargs_as_kwargs(wrapped, args, kwargs)

        # determine the command class associated with `wrapped`
        wrapped_class = get_wrapped_class(wrapped)

        # retrieve common options from kwargs, and fall back on the command
        # class attributes, or general defaults if needed
        kwargs = kwargs.copy()  # we will pop, so copy to avoid side effects
        common_params = {
            p_name: kwargs.pop(
                # go with any explicitly given default
                p_name,
                # otherwise determine the command class and pull any
                # default set in that class
                getattr(
                    wrapped_class,
                    p_name,
                    # or the common default
                    eval_defaults[p_name]))
            for p_name in eval_params
        }

        # shortcuts and configured setup for common options
        return_type = common_params['return_type']
        result_filter = get_result_filter(common_params['result_filter'])
        # resolve string labels for transformers too
        result_xfm = known_result_xfms.get(
            common_params['result_xfm'],
            # use verbatim, if not a known label
            common_params['result_xfm'])
        result_renderer = common_params['result_renderer']
        # TODO remove this conditional branch entirely, done outside
        if not result_renderer:
            result_renderer = dlcfg.get('datalad.api.result-renderer', None)
        # look for potential override of logging behavior
        result_log_level = dlcfg.get('datalad.log.result-level', None)

        # query cfg for defaults
        # .is_installed and .config can be costly, so ensure we do
        # it only once. See https://github.com/datalad/datalad/issues/3575
        dataset_arg = allkwargs.get('dataset', None)
        from datalad.distribution.dataset import Dataset
        ds = dataset_arg if isinstance(dataset_arg, Dataset) \
            else Dataset(dataset_arg) if dataset_arg else None
        # do not reuse a dataset's existing config manager here; it is
        # configured to read the committed dataset configuration too.
        # That means a datalad update can silently bring in new
        # procedure definitions from the outside, and in some sense enable
        # remote code execution by a 3rd-party
        # To avoid that, create a new config manager that only reads local
        # config (system and .git/config), plus any overrides given to this
        # datalad session
        proc_cfg = ConfigManager(
            ds, source='local',
            overrides=dlcfg.overrides) if ds and ds.is_installed() else dlcfg

        # look for hooks
        hooks = get_jsonhooks_from_config(proc_cfg)

        # this internal helper function actually drives the command
        # generator-style; if desired, it raises an exception on
        # incomplete results
        def generator_func(*_args, **_kwargs):
            # flag whether to raise an exception
            incomplete_results = []
            # track what actions were performed how many times
            action_summary = {}

            # if a custom summary is to be provided, collect the results
            # of the command execution
            results = []
            do_custom_result_summary = result_renderer in ('tailored', 'default') \
                and hasattr(wrapped_class, 'custom_result_summary_renderer')

            # process main results
            for r in _process_results(
                    # execution
                    wrapped(*_args, **_kwargs),
                    wrapped_class,
                    common_params['on_failure'],
                    # bookkeeping
                    action_summary,
                    incomplete_results,
                    # communication
                    result_renderer,
                    result_log_level,
                    # let renderers get to see how a command was called
                    allkwargs):
                for hook, spec in hooks.items():
                    # run the hooks before we yield the result;
                    # this ensures that they are executed before
                    # a potential wrapper command gets to act
                    # on them
                    if match_jsonhook2result(hook, r, spec['match']):
                        lgr.debug('Result %s matches hook %s', r, hook)
                        # a hook is also a command that yields results
                        # so yield them outside too
                        # users need to pay attention to avoid infinite
                        # loops, i.e. when a hook yields a result that
                        # triggers that same hook again
                        for hr in run_jsonhook(hook, spec, r, dataset_arg):
                            # apply same logic as for main results, otherwise
                            # any filters would only tackle the primary results
                            # and a mixture of return values could happen
                            if not keep_result(hr, result_filter, **allkwargs):
                                continue
                            hr = xfm_result(hr, result_xfm)
                            # rationale for conditional is a few lines down
                            if hr:
                                yield hr
                if not keep_result(r, result_filter, **allkwargs):
                    continue
                r = xfm_result(r, result_xfm)
                # in case the result_xfm decided to not give us anything
                # exclude it from the results. There is no particular reason
                # to do so other than that it was established behavior when
                # this comment was written. This will not affect any real
                # result record
                if r:
                    yield r

                # collect if summary is desired
                if do_custom_result_summary:
                    results.append(r)

            # result summary before a potential exception
            # custom first
            if do_custom_result_summary:
                wrapped_class.custom_result_summary_renderer(results)
            elif result_renderer == 'default' and action_summary and \
                    sum(sum(s.values()) for s in action_summary.values()) > 1:
                # give a summary in default mode, when there was more than one
                # action performed
                ui.message("action summary:\n  {}".format('\n  '.join(
                    '{} ({})'.format(
                        act, ', '.join(
                            '{}: {}'.format(status, action_summary[act]
                                            [status])
                            for status in sorted(action_summary[act])))
                    for act in sorted(action_summary))))

            if incomplete_results:
                raise IncompleteResultsError(
                    failed=incomplete_results,
                    msg="Command did not complete successfully")

        if return_type == 'generator':
            # hand over the generator
            lgr.log(2, "Returning generator_func from eval_func for %s",
                    wrapped_class)
            return generator_func(*args, **kwargs)
        else:

            @wrapt.decorator
            def return_func(wrapped_, instance_, args_, kwargs_):
                results = wrapped_(*args_, **kwargs_)
                if inspect.isgenerator(results):
                    # unwind generator if there is one, this actually runs
                    # any processing
                    results = list(results)
                # render summaries
                if not result_xfm and result_renderer in ('tailored',
                                                          'default'):
                    # cannot render transformed results
                    if hasattr(wrapped_class,
                               'custom_result_summary_renderer'):
                        wrapped_class.custom_result_summary_renderer(results)
                if return_type == 'item-or-list' and \
                        len(results) < 2:
                    return results[0] if results else None
                else:
                    return results

            lgr.log(2, "Returning return_func from eval_func for %s",
                    wrapped_class)
            return return_func(generator_func)(*args, **kwargs)
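
To illustrate what eval_func provides, a hedged usage sketch of the common result-handling keywords it injects into every decorated command; the dataset path is hypothetical, and status merely stands in for any datalad command.

import datalad.api as dl

# stream result records as they are produced, keep only 'ok' ones, and
# transform each record into a plain path via the known 'paths' label
for p in dl.status(
        dataset='/tmp/some-ds',  # hypothetical path
        return_type='generator',
        result_filter=lambda r: r.get('status') == 'ok',
        result_xfm='paths',
        on_failure='ignore'):
    print(p)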
Example #4
def test_get_cached_dataset(cache_dir):

    # patch DATALAD_TESTS_CACHE so we don't use the actual cache in a test
    # that is testing that very cache.
    cache_dir = Path(cache_dir)

    # store file-based values for testrepo-minimalds for readability:
    annexed_file = opj('inannex', 'animated.gif')
    annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        # tuples to test (url, version, keys, class):
        test_cases = [

            # a simple testrepo
            ("https://github.com/datalad/testrepo--minimalds",
             "541cf855d13c2a338ff2803d4488daf0035e568f",
             None,
             AnnexRepo),
            # Same repo, but request a key to be present. This should work
            # with a subsequent call, even though the first one did not
            # request any:
            ("https://github.com/datalad/testrepo--minimalds",
             "9dd8b56cc706ab56185f2ceb75fbe9de9b606724",
             annexed_file_key,
             AnnexRepo),
            # Same repo again, but invalid version
            ("https://github.com/datalad/testrepo--minimalds",
             "nonexistent",
             "irrelevantkey",  # invalid version; don't even try to get the key
             AnnexRepo),
            # same thing with different name should be treated as a new thing:
            ("https://github.com/datalad/testrepo--minimalds",
             "git-annex",
             None,
             AnnexRepo),
            # try a plain git repo to make sure we can deal with that:
            # Note that we first need a test case w/o a `key` parameter, so as
            # not to blow up the test when Clone is patched, resulting in a
            # MagicMock instead of a Dataset instance within
            # get_cached_dataset. In the second case it's already cached, so
            # the patched Clone is never executed.
            ("https://github.com/datalad/datalad.org",
             None,
             None,
             GitRepo),
            ("https://github.com/datalad/datalad.org",
             "gh-pages",
             "ignored-key",  # it's a git repo; don't even try to get a key
             GitRepo),

        ]
        for url, version, keys, cls in test_cases:
            target = cache_dir / url2filename(url)

            # assuming it doesn't exist yet - patched cache dir!
            in_cache_before = target.exists()
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds = get_cached_dataset(url, version, keys)
                    invalid_version = False
                except AssertionError:
                    # should happen only if `version` wasn't found. Implies
                    # that the dataset exists in cache (although not returned
                    # due to exception)
                    assert_true(version)
                    assert_false(Dataset(target).repo.commit_exists(version))
                    # mark for later assertions (most of them should still hold
                    # true)
                    invalid_version = True

            assert_equal(exec_clone.call_count, 0 if in_cache_before else 1)

            # Patch prevents actual execution. Now do it for real. Note that
            # this might be necessary for content retrieval even if the
            # dataset was in cache before.
            try:
                ds = get_cached_dataset(url, version, keys)
            except AssertionError:
                # see previous call
                assert_true(invalid_version)

            assert_is_instance(ds, Dataset)
            assert_true(ds.is_installed())
            assert_equal(target, ds.pathobj)
            assert_is_instance(ds.repo, cls)

            if keys and not invalid_version and \
                    AnnexRepo.is_valid_repo(ds.path):
                # Note: it's not supposed to get that content if the passed
                # `version` wasn't available. get_cached_dataset would then
                # raise beforehand rather than download anything only to
                # raise afterwards.
                here = ds.config.get("annex.uuid")
                where = ds.repo.whereis(ensure_list(keys), key=True)
                assert_true(all(here in remotes for remotes in where))

            # Version check. Note that all `get_cached_dataset` is supposed
            # to do is verify that the specified version exists - NOT check
            # it out.
            if version and not invalid_version:
                assert_true(ds.repo.commit_exists(version))

            # re-execution
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds2 = get_cached_dataset(url, version, keys)
                except AssertionError:
                    assert_true(invalid_version)
            exec_clone.assert_not_called()
            # returns the same Dataset as before:
            assert_is(ds, ds2)
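
For reference, a hedged sketch of calling the helper under test directly; the import location and keyword names are assumptions inferred from the test, while the positional signature matches the calls above.

# assumed import path for the helper exercised above
from datalad.tests.utils_cached_dataset import get_cached_dataset

ds = get_cached_dataset(
    "https://github.com/datalad/testrepo--minimalds",
    version=None,  # do not pin a specific commit
    keys=None,     # do not fetch any annexed content
)
assert ds.is_installed()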
Example #5
def get_baseline(p):
    ds = Dataset(p).create()
    sub = create(str(ds.pathobj / 'sub'))
    assert_repo_status(ds.path, untracked=['sub'])
    return ds
Example #6
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = ensure_list(path)
        if not path:
            # then the current/reference dataset is "aggregated"
            # we should not always add ds.path, since then --recursive would
            # also recurse into the current dataset even if paths are given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(
            ds,
            # do not warn here, next call triggers the same warning
            warn_absent=False)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately
                # place generated objects into the aggregated or reference
                # dataset, and put info into the DB to get it distributed to
                # all datasets that need to be updated
                errored = _dump_extracted_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save,
                    force_extraction,
                    agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object
        # files somewhere, we know what needs saving (but have not saved
        # anything yet), and we know the states of all aggregated datasets
        # in the DB
        # what remains to be done is to update all datasets, so they have
        # their own copy of aggregated metadata and update their respective
        # aggregate.json with info on what states we just aggregated from

        # first, let's figure out which datasets need updating at all
        # get adjacency info of the dataset tree spanning from the base to
        # all leaf datasets associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path, to_aggregate, [], ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated metadata
                # of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about datasets that we only
            # got from aggregated metadata and that have no trace on the
            # file system in here!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` '%s' for metadata aggregation"
                % update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                incremental,
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                # save does not need any pre-annotated path hints
                path=[r['path'] for r in to_save],
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
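
A hedged sketch of driving this command through the Dataset-bound Python API, assuming a datalad version that still ships aggregate_metadata; the path is hypothetical.

from datalad.distribution.dataset import Dataset

superds = Dataset('/tmp/super')  # hypothetical superdataset
# aggregate recursively and, with update_mode='all', push the updated
# aggregate metadata into every dataset in the tree
for res in superds.aggregate_metadata(
        recursive=True,
        update_mode='all',
        return_type='generator'):
    print(res['status'], res['path'])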
Example #7
    def __init__(self, repo):
        self._child_dataset = Dataset(repo.path)
        self._super = None
        self._super_tried = False
Example #8
def test_basics(path, nodspath):
    ds = Dataset(path).create()
    last_state = ds.repo.get_hexsha()
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        # provoke command failure
        with assert_raises(CommandError) as cme:
            ds.run('7i3amhmuch9invalid')
        # let's not speculate that the exit code is always 127
        ok_(cme.exception.code > 0)
        eq_(last_state, ds.repo.get_hexsha())
        # now one that must work
        res = ds.run('touch empty', message='TEST')
        ok_clean_git(ds.path)
        assert_result_count(res, 2)
        # TODO 'state' is still untracked!!!
        assert_result_count(res,
                            1,
                            action='add',
                            path=opj(ds.path, 'empty'),
                            type='file')
        assert_result_count(res, 1, action='save', path=ds.path)
        commit_msg = ds.repo.repo.head.commit.message
        ok_(commit_msg.startswith('[DATALAD RUNCMD] TEST'))
        # crude test that we have a record for the PWD
        assert_in('"pwd": "."', commit_msg)
        last_state = ds.repo.get_hexsha()
        # now run a command that will not alter the dataset
        res = ds.run('touch empty', message='NOOP_TEST')
        assert_status('notneeded', res)
        eq_(last_state, ds.repo.get_hexsha())
        # We can also run the command via a single-item list because this is
        # what the CLI interface passes in for quoted commands.
        res = ds.run(['touch empty'], message='NOOP_TEST')
        assert_status('notneeded', res)

    # run outside the dataset, should still work but with limitations
    with chpwd(nodspath), \
            swallow_outputs():
        res = ds.run(['touch', 'empty2'], message='TEST')
        assert_status('ok', res)
        assert_result_count(res,
                            1,
                            action='add',
                            path=opj(ds.path, 'empty2'),
                            type='file')

    # running without a command is a noop
    with chpwd(path):
        with swallow_logs(new_level=logging.WARN) as cml:
            ds.run()
            assert_in("No command given", cml.out)

    # Simple sidecar message checks.
    ds.run(["touch", "dummy0"], message="sidecar arg", sidecar=True)
    assert_not_in('"cmd":', ds.repo.repo.head.commit.message)

    real_get = ds.config.get

    def mocked_get(key, default=None):
        if key == "datalad.run.record-sidecar":
            return True
        return real_get(key, default)

    with patch.object(ds.config, "get", mocked_get):
        ds.run(["touch", "dummy1"], message="sidecar config")
    assert_not_in('"cmd":', ds.repo.repo.head.commit.message)
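
Rather than mocking config.get as above, the sidecar default could be persisted in the dataset's local git config; a hedged sketch reusing the same key the test intercepts (path is hypothetical).

from datalad.distribution.dataset import Dataset

ds = Dataset('/tmp/sidecar-demo').create()  # hypothetical path
ds.config.set("datalad.run.record-sidecar", "true", where="local")
ds.run(["touch", "dummy"], message="sidecar via config")
# the run record now lives in a sidecar file, not the commit message
assert '"cmd":' not in ds.repo.repo.head.commit.message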
Example #9
def test_run_inputs_no_annex_repo(path):
    ds = Dataset(path).create(no_annex=True)
    # Running --input in a plain Git repo doesn't fail.
    ds.run("touch dummy", inputs=["*"])
    ok_exists(opj(ds.path, "dummy"))
    ds.rerun()
Example #10
def test_rerun_empty_branch(path):
    GitRepo(path, create=True)
    ds = Dataset(path)
    assert_status("impossible", ds.rerun(on_failure="ignore"))
Example #11
def test_run_inputs_outputs(path):
    ds = Dataset(path)

    assert_false(ds.repo.file_has_content("test-annex.dat"))

    # If we specify test-annex.dat as an input, it will be retrieved before the
    # run.
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"])

    ok_clean_git(ds.path)
    ok_(ds.repo.file_has_content("test-annex.dat"))
    ok_(ds.repo.file_has_content("doubled.dat"))

    # Rerunning the commit will also get the input file.
    ds.repo.drop("test-annex.dat", options=["--force"])
    assert_false(ds.repo.file_has_content("test-annex.dat"))
    ds.rerun()
    ok_(ds.repo.file_has_content("test-annex.dat"))

    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("touch dummy", inputs=["not-there"])
        assert_in("Input does not exist: ", cml.out)

    # Test different combinations of globs and explicit files.
    inputs = ["a.dat", "b.dat", "c.txt", "d.txt"]
    create_tree(ds.path, {i: i for i in inputs})

    ds.add(".")
    ds.repo.copy_to(inputs, remote="origin")
    ds.repo.drop(inputs, options=["--force"])

    test_cases = [(["*.dat"], ["a.dat", "b.dat"]),
                  (["*.dat", "c.txt"], ["a.dat", "b.dat", "c.txt"]),
                  (["*"], inputs)]

    for idx, (inputs_arg, expected_present) in enumerate(test_cases):
        assert_false(any(ds.repo.file_has_content(i) for i in inputs))

        ds.run("touch dummy{}".format(idx), inputs=inputs_arg)
        ok_(all(ds.repo.file_has_content(f) for f in expected_present))
        # Globs are stored unexpanded by default.
        assert_in(inputs_arg[0], ds.repo.repo.head.commit.message)
        ds.repo.drop(inputs, options=["--force"])

    # --input can be passed a subdirectory.
    create_tree(ds.path, {"subdir": {"a": "subdir a", "b": "subdir b"}})
    ds.add("subdir")
    ds.repo.copy_to(["subdir/a", "subdir/b"], remote="origin")
    ds.repo.drop("subdir", options=["--force"])
    ds.run("touch subdir-dummy", inputs=[opj(ds.path, "subdir")])
    ok_(all(ds.repo.file_has_content(opj("subdir", f)) for f in ["a", "b"]))

    # Inputs are specified relative to a dataset's subdirectory.
    ds.repo.drop(opj("subdir", "a"), options=["--force"])
    with chpwd(opj(path, "subdir")):
        run("touch subdir-dummy1", inputs=["a"])
    ok_(ds.repo.file_has_content(opj("subdir", "a")))

    # --input=. runs "datalad get ."
    ds.run("touch dot-dummy", inputs=["."])
    eq_(ds.repo.get_annexed_files(),
        ds.repo.get_annexed_files(with_content_only=True))
    # On rerun, we get all files, even those that weren't in the tree at the
    # time of the run.
    create_tree(ds.path, {"after-dot-run": "after-dot-run content"})
    ds.add(".")
    ds.repo.copy_to(["after-dot-run"], remote="origin")
    ds.repo.drop(["after-dot-run"], options=["--force"])
    ds.rerun("HEAD^")
    ds.repo.file_has_content("after-dot-run")

    # --output will unlock files that are present.
    ds.repo.get("a.dat")
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    # --output will remove files that are not present.
    ds.repo.drop(["a.dat", "d.txt"], options=["--force"])
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), " appended\n")

    # --input can be combined with --output.
    ds.repo.repo.git.reset("--hard", "HEAD~2")
    ds.run("echo ' appended' >>a.dat", inputs=["a.dat"], outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    with swallow_logs(new_level=logging.DEBUG) as cml:
        ds.run("echo blah", outputs=["not-there"])
        assert_in("Filtered out non-existing path: ", cml.out)

    ds.create('sub')
    ds.run("echo sub_orig >sub/subfile")
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])
    ds.drop("sub/subfile", check=False)
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])

    # --input/--output globs can be stored in expanded form.
    ds.run("touch expand-dummy",
           inputs=["a.*"],
           outputs=["b.*"],
           expand="both")
    assert_in("a.dat", ds.repo.repo.head.commit.message)
    assert_in("b.dat", ds.repo.repo.head.commit.message)

    res = ds.rerun(report=True, return_type='item-or-list')
    eq_(res["run_info"]['inputs'], ["a.dat"])
    eq_(res["run_info"]['outputs'], ["b.dat"])
Example #12
def test_rerun(path, nodspath):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    probe_path = opj(sub.path, 'sequence')
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        ds.run('echo x$(cat sub/sequence) > sub/sequence')
    # command ran once, all clean
    ok_clean_git(ds.path)
    eq_('x\n', open(probe_path).read())
    # now, for a rerun we can be anywhere, PWD and all are recorded
    # moreover, rerun must figure out which bits to unlock, even in
    # subdatasets
    with chpwd(nodspath), \
            swallow_outputs():
        ds.rerun()
    ok_clean_git(ds.path)
    # ran twice now
    eq_('xx\n', open(probe_path).read())

    # Rerunning from a subdataset skips the command.
    _, sub_info = get_run_info(ds, sub.repo.repo.head.commit.message)
    eq_(ds.id, sub_info["dsid"])
    assert_result_count(sub.rerun(return_type="list", on_failure="ignore"),
                        1,
                        status="impossible",
                        action="run",
                        rerun_action="skip")
    eq_('xx\n', open(probe_path).read())

    # Rerun fails with a dirty repo.
    dirt = opj(path, "dirt")
    with open(dirt, "w") as fh:
        fh.write("")
    assert_status('impossible', ds.rerun(on_failure="ignore"))
    remove(dirt)
    ok_clean_git(ds.path)

    # Make a non-run commit.
    with open(opj(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.add("nonrun-file")
    # Now rerun the buried command.
    ds.rerun(revision="HEAD~", message="rerun buried")
    eq_('xxx\n', open(probe_path).read())
    # Also check that the message override worked.
    eq_(ds.repo.repo.head.commit.message.splitlines()[0],
        "[DATALAD RUNCMD] rerun buried")
    # Or a range of commits, skipping non-run commits.
    ds.rerun(since="HEAD~3")
    eq_('xxxxx\n', open(probe_path).read())
    # Or --since= to run all reachable commits.
    ds.rerun(since="")
    eq_('xxxxxxxxxx\n', open(probe_path).read())

    # We can get back a report of what would happen rather than actually
    # rerunning anything.
    report = ds.rerun(since="", report=True, return_type="list")
    # Nothing changed.
    eq_('xxxxxxxxxx\n', open(probe_path).read())
    assert_result_count(report, 1, rerun_action="skip")
    report[-1]["commit"] == ds.repo.get_hexsha()

    # If a file is dropped, we remove it instead of unlocking it.
    ds.drop(probe_path, check=False)
    ds.rerun()
    eq_('x\n', open(probe_path).read())
    # If the history to rerun has a merge commit, we abort.
    ds.repo.checkout("HEAD~3", options=["-b", "topic"])
    with open(opj(path, "topic-file"), "w") as f:
        f.write("topic")
    ds.add("topic-file")
    ds.repo.checkout("master")
    ds.repo.merge("topic")
    ok_clean_git(ds.path)
    assert_raises(IncompleteResultsError, ds.rerun)
Example #13
def test_get_content_info(path=None):
    repo = GitRepo(path)
    assert_equal(repo.get_content_info(), {})
    # an invalid reference causes an exception
    assert_raises(ValueError, repo.get_content_info, ref='HEAD')

    ds = get_convoluted_situation(path)
    repopath = ds.repo.pathobj

    assert_equal(ds.repo.pathobj, repopath)
    assert_equal(ds.pathobj, ut.Path(path))

    # verify general rules on fused info records that are incrementally
    # assembled: for git content info, amended with annex info on 'HEAD'
    # (to get the last committed stage and with it possibly vanished
    # content), and lastly annex info wrt the present worktree, to
    # also get info on added/staged content
    # this fuses the info reported from
    # - git ls-files
    # - git annex findref HEAD
    # - git annex find --include '*'
    for f, r in get_annexstatus(ds.repo).items():
        if f.match('*_untracked'):
            assert (r.get('gitshasum', None) is None)
        if f.match('*_deleted'):
            assert not f.exists() and not f.is_symlink()
        if f.match('subds_*'):
            assert r['type'] == (
                'dataset' if r.get('gitshasum', None) else 'directory')
        if f.match('file_*'):
            # which one exactly depends on many things
            assert_in(r['type'], ('file', 'symlink'))
        if f.match('file_ingit*'):
            assert (r['type'] == 'file')
        elif '.datalad' not in f.parts and not f.match('.git*') and \
                r.get('gitshasum', None) and not f.match('subds*'):
            # this should be known to annex, one way or another
            # regardless of whether things are deleted or staged
            # or anything in between
            assert_in('key', r, f)
            assert_in('keyname', r, f)
            assert_in('backend', r, f)
            assert_in('bytesize', r, f)
            # no duplication with path
            assert_not_in('file', r, f)

    # query full untracked report
    res = ds.repo.get_content_info()
    assert_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)
    # query for compact untracked report
    res = ds.repo.get_content_info(untracked='normal')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_in(repopath.joinpath('dir_untracked'), res)
    # query no untracked report
    res = ds.repo.get_content_info(untracked='no')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)

    # git status integrity
    status = ds.repo.status()
    for t in ('subds', 'file'):
        for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean',
                  'dropped_clean', 'modified', 'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir', '')):
                if t == 'subds' and ('ingit' in s or 'dropped' in s):
                    # invalid combination
                    continue
                if t == 'subds' and s == 'deleted':
                    # same as subds_unavailable -> clean
                    continue
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                assert p.match('*_{}'.format(status[p]['state'])), p
                if t == 'subds':
                    assert_in(status[p]['type'], ('dataset', 'directory'), p)
                else:
                    assert_in(status[p]['type'], ('file', 'symlink'), p)

    # git annex status integrity
    annexstatus = get_annexstatus(ds.repo)
    for t in ('file', ):
        for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean',
                  'dropped_clean', 'modified', 'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir', '')):
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                if s in ('untracked', 'ingit_clean', 'ingit_modified'):
                    # annex knows nothing about these things
                    assert_not_in('key', annexstatus[p])
                    continue
                assert_in('key', annexstatus[p])
                # dear future,
                # if the next one fails, git-annex might have changed the
                # nature of the paths that are being reported by
                # `annex find --json`
                # when this was written, `hashdir*` was a native path, but
                # `file` was a POSIX path
                assert_equal(annexstatus[p]['has_content'], 'dropped' not in s)

    # check the different subds evaluation modes
    someds = Dataset(ds.pathobj / 'subds_modified' / 'someds')
    dirtyds_path = someds.pathobj / 'dirtyds'
    assert_not_in('state',
                  someds.repo.status(eval_submodule_state='no')[dirtyds_path])
    assert_equal(
        'clean',
        someds.repo.status(
            eval_submodule_state='commit')[dirtyds_path]['state'])
    assert_equal(
        'modified',
        someds.repo.status(eval_submodule_state='full')[dirtyds_path]['state'])
Example #14
def test_status_paths_empty_list(path=None):
    ds = Dataset(path).create()
    assert_equal(ds.repo.status(paths=[]), {})
Example #15
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name, in case it was not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote set up
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # maybe this could be safely dropped -- still WiP

        if not sshurl:
            # TODO: maybe back up more before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

        # check the login URL
        sibling_ri = RI(sshurl)
        ssh_sibling = is_ssh(sibling_ri)
        if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
            raise ValueError(
                "Unsupported SSH URL or path: '{0}', "
                "use ssh://host/path, host:path or path syntax".format(sshurl))

        if not name:
            name = sibling_ri.hostname if ssh_sibling else "local"
            lgr.debug(
                "No sibling name given. Using %s'%s' as sibling name",
                "URL hostname " if ssh_sibling else "",
                name)
        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume a one-to-one mapping of names from local
            # branches to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        cand_ds = [
            Dataset(r['path'])
            for r in diff_dataset(
                ds,
                fr=since,
                to=None,
                # make explicit, but doesn't matter, no recursion in diff()
                constant_refs=True,
                # constrain to the paths of all locally existing subdatasets
                path=[
                    sds['path']
                    for sds in ds.subdatasets(
                        recursive=recursive,
                        recursion_limit=recursion_limit,
                        fulfilled=True,
                        result_renderer=None)
                ],
                # save cycles, we are only looking for datasets
                annex=None,
                untracked='no',
                # recursion was done faster by subdatasets()
                recursive=False,
                # save cycles, we are only looking for datasets
                eval_file_type=False,
            )
            if r.get('type') == 'dataset' and r.get('state', None) != 'clean'
        ]
        # check remotes setup
        for d in cand_ds if since else ([ds] + cand_ds):
            d_repo = d.repo
            if d_repo is None:
                continue
            checkds_remotes = d.repo.get_remotes()
            res = dict(
                action='create_sibling',
                path=d.path,
                type='dataset',
            )

            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(ensure_list(publish_depends)).difference(
                    checkds_remotes)
                if unknown_deps:
                    yield dict(
                        res,
                        status='error',
                        message=('unknown sibling(s) specified as publication '
                                 'dependency: %s', unknown_deps),
                    )
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                yield dict(
                    res,
                    status='error' if existing == 'error' else 'notneeded',
                    message=(
                        "sibling '%s' already configured (specify alternative "
                        "name, or force reconfiguration via --existing", name),
                )
                continue
            to_process.append(res)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if ssh_sibling:
            # request ssh connection:
            lgr.info("Connecting ...")
            shell = ssh_manager.get_connection(sshurl)
        else:
            shell = _RunnerAdapter()
            sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
            if target_dir:
                target_dir = opj(sibling_ri.path, target_dir)

        if target_dir is None:
            if sibling_ri.path:
                target_dir = sibling_ri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        if not shell.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg="It's required on the {} machine to create a sibling"
                    .format('remote' if ssh_sibling else 'local'))

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make the test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run the post-update hooks in depth-first fashion, so
        # we only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                refds_path,
                shell,
                replicate_local_structure,
                sibling_ri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == refds_path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, shell, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                shell("cd {} "
                      "&& ( [ -x hooks/post-update ] && hooks/post-update || true )"
                      "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if currentds_ap['path'] not in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
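
A hedged sketch of invoking this command from the Python API; the URL, sibling name, and dataset path are hypothetical.

import datalad.api as dl

dl.create_sibling(
    'ssh://server.example.org/home/me/ds-sibling',  # hypothetical URL
    name='server',
    dataset='/tmp/some-ds',  # hypothetical local dataset
    recursive=True,
    existing='skip',  # do not error on already-configured siblings
)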
Example #16
    def __call__(path=None, is_pipeline=False, is_template=False,
                 recursive=False, chdir=None):  # dry_run=False,
        dry_run = False

        from datalad_crawler.pipeline import (
            load_pipeline_from_config, load_pipeline_from_module,
            get_repo_pipeline_config_path, get_repo_pipeline_script_path
        )
        from datalad_crawler.pipeline import run_pipeline
        from datalad.utils import chpwd  # import late so we could mock during tests

        with chpwd(chdir):

            assert not (is_pipeline and is_template), "it is either a pipeline or a template name, can't be both"
            if is_template:
                # generate a config and overload path with its filename
                path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                                commit=True)

            # TODO: centralize via _params_ handling
            if dry_run:
                dryrun_optlabel = 'datalad.crawl.dryrun'
                if dryrun_optlabel in cfg:
                    cfg.unset(dryrun_optlabel, where='local', reload=False)
                cfg.add(dryrun_optlabel, "True", where='local')

            if path is None:

                # get config from the current repository/dataset
                if is_pipeline:
                    raise ValueError("You must specify the file if --pipeline")

                # Let's see if there is a config or pipeline in this repo
                path = get_repo_pipeline_config_path()
                if not path or not exists(path):
                    # Check if there may be the pipeline provided
                    path = get_repo_pipeline_script_path()
                    if path and exists(path):
                        is_pipeline = True

            stats = ActivityStats()

            if not path:
                raise RuntimeError("Cannot locate crawler config or pipeline file")

            if is_pipeline:
                lgr.info("Loading pipeline definition from %s" % path)
                pipeline = load_pipeline_from_module(path)
            else:
                lgr.info("Loading pipeline specification from %s" % path)
                pipeline = load_pipeline_from_config(path)

            lgr.info("Running pipeline %s" % str(pipeline))
            # TODO: capture the state of all branches so in case of crash
            # we could gracefully reset back
            try:
                output = run_pipeline(pipeline, stats=stats)
            except Exception as exc:
                # TODO: config.crawl.failure = full-reset | last-good-master
                # probably ask via ui which action should be performed unless
                # explicitly specified
                raise
            stats.datasets_crawled += 1

            # TODO:  Move gc/clean over here!

            stats_total = stats.get_total()

            if recursive:
                # get all subdatasets, and crawl them too!
                ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
                import os
                from datalad.distribution.dataset import Dataset
                from datalad.api import crawl
                from datalad.utils import swallow_logs
                from datalad.dochelpers import exc_str
                # Note: we could collect all datasets to be crawled here, or
                # pass recursive=True into the subdatasets' crawl.  We collect
                # all of them here so we might later also introduce automatic
                # commits when the super-dataset gets successfully updated
                subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive, result_xfm='relpaths')

                lgr.info("Crawling %d subdatasets", len(subdatasets))
                output = [output]
                # TODO: parallelize
                # TODO: assumes that all sub-datasets are 'crawlable', and if
                # not, just adds them to the crawl_failed count.  But maybe we
                # should make it more explicit that some sub-datasets might
                # not need to be crawled, so they get skipped explicitly?
                for ds_ in subdatasets:
                    ds_logfile = utils.get_logfilename(ds_, 'crawl')
                    try:
                        # TODO: might be cool to be able to report a 'heart beat' from the swallow into pbar or smth
                        with swallow_logs(file_=ds_logfile) as cml:
                            output_, stats_ = crawl(chdir=ds_)
                            stats_total += stats_
                            output.append(output_)
                        lgr.info("Crawled %s: %s (log: %s)", ds_, stats_.as_str(mode='line'), ds_logfile)
                    except Exception as exc:
                        stats_total.datasets_crawl_failed += 1
                        stats_total.datasets_crawled += 1
                        output += [None]
                        lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                    ds_, ds_logfile, exc_str(exc))  # , cml.out)

            lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

            return output, stats_total
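
A hedged usage sketch of the crawl entry point above; it returns the pipeline output together with accumulated statistics. The import mirrors the one used inside the command itself, and the working directory is hypothetical.

from datalad.api import crawl  # available when datalad_crawler is installed

# run the pipeline configured in the dataset and recurse into subdatasets
output, stats_total = crawl(chdir='/tmp/some-ds', recursive=True)
print(stats_total.as_str(mode='line'))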
Example #17
def _update_ds_agginfo(refds_path, ds_path, subds_paths, incremental, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset that is to have its aggregate info updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no longer
      a subdataset at all, not just not locally installed)
    incremental : bool
      If set, the update will not remove any information on datasets not listed in
      subds_paths
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # load existing aggregate info dict
    # makes sure all file/dataset paths become absolute
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    agginfo_fpath, agg_base_path = get_ds_aggregate_db_locations(ds)
    ds_agginfos = load_ds_aggregate_db(ds, abspath=True)
    # object locations referenced initially
    objlocs_was = set(ai[k]
                      for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for each subdataset (any depth level)
    procds_paths = [ds.path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are updating
        drelpath = op.relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == op.curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objpath = op.join(agg_base_path, *objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it is
            # needed in this dataset
            objs2copy.append((
                # this needs to turn into an absolute path
                # `dpath` will be relative to the reference dataset
                #op.normpath(op.join(ds.path, dpath, op.dirname(agginfo_relpath), objloc)),
                objloc,
                target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objpath
        # (re)assign in case record is new
        ds_agginfos[dpath] = ds_dbinfo
    # remove all entries for which we do not (or no longer) have a
    # corresponding subdataset to take care of
    if not incremental:
        ds_agginfos = {k: v
                       for k, v in ds_agginfos.items()
                       if k in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    #      "git reset --hard HEAD^" and
    #      aggregate-metadata failed upon next run trying to remove
    #      a file unknown to git. I have yet to figure out why that
    #      mattered (hopefully not that reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        if op.lexists(obj):
            objs2remove.append(obj)
        else:
            # not really a warning, we don't need it anymore, it is already gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj
            )

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if not incremental and objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            check=False,
            result_renderer=None, return_type=list)
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    objs2copy = [(f, t) for f, t in objs2copy if f != t]
    # must copy object files to local target destination
    # make sure those objects are present
    # use the reference dataset to resolve paths, as they might point to
    # any location in the dataset tree
    Dataset(refds_path).get(
        [f for f, t in objs2copy],
        result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        copy_from = op.join(agg_base_path, copy_from)
        copy_to = op.join(agg_base_path, copy_to)
        target_dir = op.dirname(copy_to)
        if not op.exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if op.lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added standard way, depending on the repo type
        ds.repo.add([op.join(agg_base_path, p) for p in objs2add])
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=op.join(agg_base_path, p), type='file', staged=True)
             for p in objs2add])
    # write aggregate info file
    if not ds_agginfos:
        return

    _store_agginfo_db(ds, ds_agginfos)
    ds.repo.add(agginfo_fpath, git=True)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))
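
The cleanup logic in `_update_ds_agginfo` boils down to set arithmetic over object locations. A standalone sketch with made-up paths (the real code additionally skips removal candidates that no longer exist on disk):

# object locations referenced before and after the update (hypothetical)
objlocs_was = {'objects/aa/bb/meta1', 'objects/cc/dd/meta2'}
objlocs_is = {'objects/aa/bb/meta1', 'objects/ee/ff/meta3'}

# referenced before, but no longer referenced -> removal candidates
objs2remove = objlocs_was - objlocs_is   # {'objects/cc/dd/meta2'}
# everything now referenced needs to be (re)added to the repo
objs2add = objlocs_is
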
Exemple #18
0
def test_unlock(path):

    ds = Dataset(path)

    # file is currently locked:
    # TODO: use get_annexed_files instead of hardcoded filename
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # Note: In V6+ we can unlock even if the file's content isn't present, but
    # doing so when unlock() is called with no paths isn't consistent with the
    # current behavior when an explicit path is given (it doesn't unlock) or
    # with the behavior in V5, so we don't do it.

    # Unlocking the dataset without an explicit path does not fail if there
    # are files without content.
    eq_(ds.unlock(path=None, on_failure="ignore"), [])
    eq_(ds.unlock(path=[], on_failure="ignore"), [])
    # cannot unlock without content (annex get wasn't called)
    assert_in_results(ds.unlock(path="test-annex.dat", on_failure="ignore"),
                      path=opj(path, "test-annex.dat"),
                      status="impossible")

    ds.repo.get('test-annex.dat')
    result = ds.unlock()
    assert_result_count(result, 1)
    assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content")

    ds.repo.add('test-annex.dat')
    # in V6+ we need to explicitly re-lock it:
    if ds.repo.supports_unlocked_pointers:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    # after commit, file is locked again:
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content", f.read())

    # unlock again, this time more specific:
    result = ds.unlock(path='test-annex.dat')
    assert_result_count(result, 1)

    assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content again")

    ds.repo.add('test-annex.dat')
    # in V6+ we need to explicitly re-lock it:
    if ds.repo.supports_unlocked_pointers:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    # TODO:
    # BOOOM: test-annex.dat writeable in V6!
    # Why the hell is this different from the first time we wrote to the file
    # and locked it again?
    # Also: after opening, the file is empty.

    # after commit, file is locked again:
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content again", f.read())
Exemple #19
0
def test_save_amend(dspath):

    dspath = Path(dspath)
    file_in_super = dspath / 'somefile'
    file_in_sub = dspath / 'subds' / 'file_in_sub'

    # test on a hierarchy including a plain git repo:
    ds = Dataset(dspath).create(force=True, no_annex=True)
    subds = ds.create('subds', force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.repo)

    # recursive and amend are mutually exclusive:
    for d in (ds, subds):
        assert_raises(ValueError, d.save, recursive=True, amend=True)

    # in an annex repo the branch we are interested in might not be the active
    # branch (adjusted):
    sub_branch = subds.repo.get_corresponding_branch()

    # amend in subdataset w/ new message; otherwise empty amendment:
    last_sha = subds.repo.get_hexsha(sub_branch)
    subds.save(message="new message in sub", amend=True)
    # we did in fact commit something:
    neq_(last_sha, subds.repo.get_hexsha(sub_branch))
    # repo is clean:
    assert_repo_status(subds.repo)
    # message is correct:
    eq_(subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")
    # actually replaced the previous commit:
    assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch))

    # amend modifications in subdataset w/o new message
    if not subds.repo.is_managed_branch():
        subds.unlock('file_in_sub')
    file_in_sub.write_text("modified again")
    last_sha = subds.repo.get_hexsha(sub_branch)
    subds.save(amend=True)
    neq_(last_sha, subds.repo.get_hexsha(sub_branch))
    assert_repo_status(subds.repo)
    # message unchanged:
    eq_(subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")
    # actually replaced the previous commit:
    assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch))

    # save --amend with nothing to amend with:
    res = subds.save(amend=True)
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save')

    # amend in superdataset w/ new message; otherwise empty amendment:
    last_sha = ds.repo.get_hexsha()
    ds.save(message="new message in super", amend=True)
    neq_(last_sha, ds.repo.get_hexsha())
    assert_repo_status(subds.repo)
    eq_(ds.repo.format_commit("%B").strip(), "new message in super")
    assert_not_in(last_sha, ds.repo.get_branch_commits_())

    # amend modifications in superdataset w/o new message
    file_in_super.write_text("changed content")
    if not subds.repo.is_managed_branch():
        subds.unlock('file_in_sub')
    file_in_sub.write_text("modified once again")
    last_sha = ds.repo.get_hexsha()
    last_sha_sub = subds.repo.get_hexsha(sub_branch)
    ds.save(amend=True)
    neq_(last_sha, ds.repo.get_hexsha())
    eq_(ds.repo.format_commit("%B").strip(), "new message in super")
    assert_not_in(last_sha, ds.repo.get_branch_commits_())
    # we didn't mess with the subds:
    assert_repo_status(ds.repo, modified=["subds"])
    eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch))
    eq_(subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")

    # save --amend with nothing to amend with:
    last_sha = ds.repo.get_hexsha()
    res = ds.save(amend=True)
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save')
    eq_(last_sha, ds.repo.get_hexsha())
    # we didn't mess with the subds:
    assert_repo_status(ds.repo, modified=["subds"])
    eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch))
    eq_(subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")

    # amend with different identity:
    orig_author = ds.repo.format_commit("%an")
    orig_email = ds.repo.format_commit("%ae")
    orig_date = ds.repo.format_commit("%ad")
    orig_committer = ds.repo.format_commit("%cn")
    orig_committer_mail = ds.repo.format_commit("%ce")
    eq_(orig_author, orig_committer)
    eq_(orig_email, orig_committer_mail)
    with patch.dict('os.environ',
                    {'GIT_COMMITTER_NAME': 'Hopefully Different',
                     'GIT_COMMITTER_EMAIL': '*****@*****.**'}):

        ds.config.reload(force=True)
        ds.save(amend=True, message="amend with hope")
    # author was kept:
    eq_(orig_author, ds.repo.format_commit("%an"))
    eq_(orig_email, ds.repo.format_commit("%ae"))
    eq_(orig_date, ds.repo.format_commit("%ad"))
    # committer changed:
    eq_(ds.repo.format_commit("%cn"), "Hopefully Different")
    eq_(ds.repo.format_commit("%ce"), "*****@*****.**")

    # corner case: amend empty commit with no parent:
    rmtree(str(dspath))
    # When adjusted branch is enforced by git-annex detecting a crippled FS,
    # git-annex produces an empty commit before switching to adjusted branch:
    # "commit before entering adjusted branch"
    # The commit by `create` would be the second one already.
    # Therefore go with plain annex repo and create an (empty) commit only when
    # not on adjusted branch:
    repo = AnnexRepo(dspath, create=True)
    if not repo.is_managed_branch():
        repo.commit(msg="initial", options=['--allow-empty'])
    ds = Dataset(dspath)
    branch = ds.repo.get_corresponding_branch() or ds.repo.get_active_branch()
    # test pointless if we start with more than one commit
    eq_(len(list(ds.repo.get_branch_commits_(branch))),
        1,
        msg="More than on commit '{}': {}".format(
            branch, ds.repo.call_git(['log', branch]))
        )
    last_sha = ds.repo.get_hexsha(branch)

    ds.save(message="new initial commit", amend=True)
    assert_repo_status(ds.repo)
    eq_(len(list(ds.repo.get_branch_commits_(branch))),
        1,
        msg="More than on commit '{}': {}".format(
            branch, ds.repo.call_git(['log', branch]))
        )
    assert_not_in(last_sha, ds.repo.get_branch_commits_(branch))
    eq_(ds.repo.format_commit("%B", branch).strip(), "new initial commit")
Exemple #20
0
def test_status(_path, linkpath):
    # do the setup on the real path, not the symlink, to have its
    # bugs not affect this test of status()
    ds = get_deeply_nested_structure(str(_path))
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(_path, target_is_directory=True)
        path = linkpath
    else:
        path = _path

    ds = Dataset(path)
    if has_symlink_capability():
        assert ds.pathobj != ds.repo.pathobj

    # spotcheck that annex status reporting and availability evaluation
    # works
    assert_result_count(
        ds.status(annex='all', result_renderer=None),
        1,
        path=str(ds.pathobj / 'subdir' / 'annexed_file.txt'),
        key='MD5E-s5--275876e34cf609db118f3d84b799a790.txt',
        has_content=True,
        objloc=str(ds.repo.pathobj / '.git' / 'annex' / 'objects' /
                   # hashdir is different on windows
                   ('f33' if ds.repo.is_managed_branch() else '7p') /
                   ('94b' if ds.repo.is_managed_branch() else 'gp') /
                   'MD5E-s5--275876e34cf609db118f3d84b799a790.txt' /
                   'MD5E-s5--275876e34cf609db118f3d84b799a790.txt'))

    plain_recursive = ds.status(recursive=True, result_renderer=None)
    # check integrity of individual reports with a focus on how symlinks
    # are reported
    for res in plain_recursive:
        # anything that is an "intended" symlink should be reported
        # as such. In contrast, anything that is a symlink for mere
        # technical reasons (annex using it for something in some mode)
        # should be reported as the thing it is representing (i.e.
        # a file)
        if 'link2' in str(res['path']):
            assert res['type'] == 'symlink', res
        else:
            assert res['type'] != 'symlink', res
        # every item must report its parent dataset
        assert_in('parentds', res)

    # bunch of smoke tests
    # query of '.' is same as no path
    eq_(plain_recursive,
        ds.status(path='.', recursive=True, result_renderer=None))
    # duplicate paths do not change things
    eq_(plain_recursive,
        ds.status(path=['.', '.'], recursive=True, result_renderer=None))
    # neither do nested paths
    eq_(
        plain_recursive,
        ds.status(path=['.', 'subds_modified'],
                  recursive=True,
                  result_renderer=None))
    # when invoked in a subdir of a dataset it still reports on the full thing
    # just like `git status`, as long as there are no paths specified
    with chpwd(op.join(path, 'directory_untracked')):
        plain_recursive = status(recursive=True, result_renderer=None)
    # should be able to take absolute paths and yield the same
    # output
    eq_(plain_recursive,
        ds.status(path=ds.path, recursive=True, result_renderer=None))

    # query for a deeply nested path from the top, should just work with a
    # variety of approaches
    rpath = op.join('subds_modified', 'subds_lvl1_modified',
                    OBSCURE_FILENAME + u'_directory_untracked')
    apathobj = ds.pathobj / rpath
    apath = str(apathobj)
    # ds.repo.pathobj will have the symlink resolved
    arealpath = ds.repo.pathobj / rpath
    # TODO include explicit relative path in test
    for p in (rpath, apath, arealpath, None):
        if p is None:
            # change into the realpath of the dataset and
            # query with an explicit path
            with chpwd(ds.repo.path):
                res = ds.status(path=op.join('.', rpath), result_renderer=None)
        else:
            res = ds.status(path=p, result_renderer=None)
        assert_result_count(
            res,
            1,
            state='untracked',
            type='directory',
            refds=ds.path,
            # path always comes out a full path inside the queried dataset
            path=apath,
        )

    assert_result_count(ds.status(recursive=True, result_renderer=None),
                        1,
                        path=apath)
    # limiting recursion will exclude this particular path
    assert_result_count(ds.status(recursive=True,
                                  recursion_limit=1,
                                  result_renderer=None),
                        0,
                        path=apath)
    # a negative limit means unlimited recursion
    eq_(ds.status(recursive=True, recursion_limit=-1, result_renderer=None),
        ds.status(recursive=True, result_renderer=None))
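
The final assertion documents a useful convention: a negative `recursion_limit` is treated as "no limit". A minimal usage sketch (dataset path hypothetical):

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')  # hypothetical
# per the test above, both calls report the identical result set
unlimited = ds.status(recursive=True, result_renderer=None)
explicit = ds.status(recursive=True, recursion_limit=-1, result_renderer=None)
assert unlimited == explicit
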
Exemple #21
0
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None):
        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose="unlocking")

        # Before passing the results to status()
        #   * record explicitly specified non-directory paths so that we can
        #     decide whether to yield a result for reported paths
        #   * filter out and yield results for paths that don't exist
        res_paths_nondir = set()
        paths_lexist = None
        res_paths = list()
        if path:
            # Note that we need unresolved versions of the path input to be
            # passed on to status. See gh-5456 for an example.
            path = ensure_list(path)
            res_paths = resolve_path(path, ds=dataset)
            paths_lexist = []
            res_paths_lexist = []
            for p, p_r in zip(path, res_paths):
                if p_r.exists() or p_r.is_symlink():
                    paths_lexist.append(p)
                    res_paths_lexist.append(p_r)
                if not p_r.is_dir():
                    res_paths_nondir.add(p_r)

        res_kwargs = dict(action='unlock', logger=lgr, refds=refds.path)
        if res_paths:
            for p in set(res_paths).difference(set(res_paths_lexist)):
                yield get_status_dict(status="impossible",
                                      path=str(p),
                                      type="file",
                                      message="path does not exist",
                                      **res_kwargs)
        if not (paths_lexist or paths_lexist is None):
            return

        # Collect information on the paths to unlock.
        to_unlock = defaultdict(list)  # ds => paths (relative to ds)
        for res in Status()(
                # ATTN: it is vital to pass the `dataset` argument as it,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=paths_lexist,
                untracked="normal" if res_paths_nondir else "no",
                report_filetype=False,
                annex="availability",
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled',
                on_failure="ignore"):
            if res["action"] != "status" or res["status"] != "ok":
                yield res
                continue
            has_content = res.get("has_content")
            if has_content:
                parentds = res["parentds"]
                to_unlock[parentds].append(op.relpath(res["path"], parentds))
            elif res_paths_nondir and Path(res["path"]) in res_paths_nondir:
                if has_content is False:
                    msg = "no content present"
                    status = "impossible"
                elif res["state"] == "untracked":
                    msg = "untracked"
                    status = "impossible"
                else:
                    # This is either a regular git file or an unlocked annex
                    # file.
                    msg = "non-annex file"
                    status = "notneeded"
                yield get_status_dict(status=status,
                                      path=res["path"],
                                      type="file",
                                      message="{}; cannot unlock".format(msg),
                                      **res_kwargs)

        # Do the actual unlocking.
        for ds_path, files in to_unlock.items():
            ds = Dataset(ds_path)
            for r in ds.repo._call_annex_records(["unlock"], files=files):
                yield get_status_dict(path=op.join(ds.path, r['file']),
                                      status='ok' if r['success'] else 'error',
                                      type='file',
                                      **res_kwargs)
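
The batching at the end of the unlock implementation is a generic pattern: group file paths under their `parentds` so that a single annex call can be issued per repository. A self-contained sketch with made-up status records:

import os.path as op
from collections import defaultdict

results = [  # hypothetical status records
    {'path': '/ds/sub/a.dat', 'parentds': '/ds/sub', 'has_content': True},
    {'path': '/ds/b.dat', 'parentds': '/ds', 'has_content': True},
]
to_unlock = defaultdict(list)  # dataset path -> files relative to it
for res in results:
    if res.get('has_content'):
        to_unlock[res['parentds']].append(
            op.relpath(res['path'], res['parentds']))
# -> {'/ds/sub': ['a.dat'], '/ds': ['b.dat']}
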
Exemple #22
0
def fs_traverse(path,
                repo,
                parent=None,
                subdatasets=None,
                render=True,
                recurse_datasets=False,
                recurse_directories=False,
                json=None,
                basepath=None):
    """Traverse path through its nodes and returns a dictionary of relevant
    attributes attached to each node

    Parameters
    ----------
    path: str
      Path to the directory to be traversed
    repo: AnnexRepo or GitRepo
      Repo object the directory belongs to
    parent: dict
      Extracted info about parent directory
    recurse_directories: bool
      Recurse into subdirectories (note that subdatasets are not traversed)
    render: bool
       Whether to render from within the function. Set to False if results
       are to be manipulated before the final render

    Returns
    -------
    list of dict
      Extracts and returns a (recursive) list of directory info at path;
      does not traverse into annex, git, or hidden directories
    """
    subdatasets = subdatasets or []
    fs = fs_extract(path, repo, basepath=basepath or path)
    if isdir(path):  # if node is a directory
        children = [
            fs.copy()
        ]  # store its info in its children dict too (Yarik is not sure why, but I guess for '.'?)
        # ATM seems some pieces still rely on having this duplication, so left as is
        # TODO: strip away
        for node in listdir(path):
            nodepath = opj(path, node)

            # Might contain subdatasets, so we should analyze and prepare entries
            # to pass down... in theory we could maybe just pass full paths and strip?
            node_subdatasets = []
            is_subdataset = False
            if isdir(nodepath):
                node_sep = with_pathsep(node)
                for subds in subdatasets:
                    if subds == node:
                        # it is the subdataset
                        is_subdataset = True
                    else:
                        # use path_is_subdir
                        if subds.startswith(node_sep):
                            node_subdatasets += [subds[len(node_sep):]]

            # TODO:  it might be a subdir which is non-initialized submodule!
            # if not ignored, append child node info to current nodes dictionary
            if is_subdataset:
                subds = _traverse_handle_subds(
                    relpath(nodepath, repo.path),
                    Dataset(repo.path),
                    recurse_datasets=recurse_datasets,
                    recurse_directories=recurse_directories,
                    json=json)
                children.append(subds)
            elif not ignored(nodepath):
                # if recursive, create info dictionary (within) each child node too
                if recurse_directories:
                    subdir = fs_traverse(
                        nodepath,
                        repo,
                        subdatasets=node_subdatasets,
                        parent=None,  # children[0],
                        recurse_datasets=recurse_datasets,
                        recurse_directories=recurse_directories,
                        json=json,
                        basepath=basepath or path)
                    subdir.pop('nodes', None)
                else:
                    # read child metadata from its metadata file if it exists
                    subdir_json = metadata_locator(path=node,
                                                   ds_path=basepath or path)
                    if exists(subdir_json):
                        with open(subdir_json) as data_file:
                            subdir = js.load(data_file)
                            subdir.pop('nodes', None)
                    # else extract whatever information you can about the child
                    else:
                        # Yarik: this one is way too lean...
                        subdir = fs_extract(nodepath,
                                            repo,
                                            basepath=basepath or path)
                # append child metadata to list
                children.extend([subdir])

        # sum sizes of all 1st level children
        children_size = {}
        for node in children[1:]:
            for size_type, child_size in node['size'].items():
                children_size[size_type] = children_size.get(
                    size_type, 0) + machinesize(child_size)

        # update current node sizes to the humanized aggregate children size
        fs['size'] = children[0]['size'] = \
            {size_type: humanize.naturalsize(child_size)
             for size_type, child_size in children_size.items()}

        # replace current node name with '.' to emulate unix syntax
        children[0]['name'] = '.'
        if parent:
            # replace parent node name with '..' to emulate unix syntax
            parent['name'] = '..'
            # insert parent info after current node info in children dict
            children.insert(1, parent)

        fs['nodes'] = children  # add children info to main fs dictionary
        if render:  # render directory node at location(path)
            fs_render(fs, json=json, ds_path=basepath or path)
            lgr.info('Directory: %s', path)

    return fs
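
The size aggregation in `fs_traverse` sums machine-readable child sizes and re-humanizes the totals. A standalone sketch of the same idea using the third-party `humanize` package; for simplicity the child sizes are plain integers here, whereas the code above first converts humanized strings back via `machinesize`:

import humanize

children = [{'size': {'total': 2048}}, {'size': {'total': 1024}}]
children_size = {}
for node in children:
    for size_type, child_size in node['size'].items():
        children_size[size_type] = children_size.get(size_type, 0) + child_size

# humanize the aggregate, e.g. {'total': '3.1 kB'}
fs_size = {size_type: humanize.naturalsize(total)
           for size_type, total in children_size.items()}
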
Exemple #23
0
def test_aggregation(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(op.join(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype',
                  'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype',
                     'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        where='dataset')
    assert_status('ok', ds.save(recursive=True))
    # while we are at it: do it again, nothing should happen
    assert_status('notneeded', ds.save(recursive=True))

    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.meta_aggregate(recursive=True, into='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='meta_aggregate')
    # the respective super datasets see two saves, one to record the change
    # in the subdataset after its own aggregation, and one after the super
    # is updated with aggregated metadata
    assert_result_count(res, 5, status='ok', action='save', type='dataset')
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.meta_dump(reporton='aggregates', recursive=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # the mother dataset also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.meta_dump(recursive=True)
    # basic sanity check
    assert_result_count(origres, 3, type='dataset')
    assert_result_count([r for r in origres if r['path'].endswith('.json')],
                        3,
                        type='file')  # Now that we have annex.key
    # three different IDs
    eq_(
        3,
        len(
            set([
                _get_dsid_from_core_metadata(s['metadata']['metalad_core'])
                for s in origres if s['type'] == 'dataset'
            ])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(op.join(path, 'clone'),
                    source=ds.path,
                    result_xfm='datasets',
                    return_type='item-or-list')
    # ID mechanism works
    eq_(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.meta_dump()
    # basic sanity check
    assert_result_count(cloneres, 1, type='dataset')
    # payload file
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(r['query_matched']['frictionless_datapackage.name'],
                      r['metadata']['frictionless_datapackage']['name'])
Exemple #24
0
def _push(dspath,
          content,
          target,
          force,
          jobs,
          res_kwargs,
          pbars,
          done_fetch=None,
          got_path_arg=False):
    if not done_fetch:
        done_fetch = set()
    # nothing recursive in here, we only need a repo to work with
    ds = Dataset(dspath)
    repo = ds.repo

    res_kwargs.update(type='dataset', path=dspath)

    # content will be unique for every push (even on the same dataset)
    pbar_id = 'push-{}-{}'.format(target, id(content))
    # register for final orderly take down
    pbars[pbar_id] = ds
    log_progress(
        lgr.info,
        pbar_id,
        'Determine push target',
        unit=' Steps',
        label='Push',
        total=4,
    )
    if not target:
        try:
            # let Git figure out what needs doing
            wannabe_gitpush = repo.push(remote=None, git_options=['--dry-run'])
            # we did not get an explicit push target, get it from Git
            target = set(p.get('remote', None) for p in wannabe_gitpush)
            # handle case where a pushinfo record did not have a 'remote'
            # property -- should not happen, but be robust
            target.discard(None)
        except Exception as e:
            lgr.debug(
                'Dry-run push to determine default push target failed, '
                'assume no configuration: %s', e)
            target = set()
        if not len(target):
            yield dict(
                res_kwargs,
                status='impossible',
                message='No push target given, and none could be '
                'auto-detected, please specify via --to',
            )
            return
        elif len(target) > 1:
            # dunno if this can ever happen, but if it does, report
            # nicely
            yield dict(res_kwargs,
                       status='error',
                       message=(
                           'No push target given, '
                           'multiple candidates auto-detected: %s',
                           list(target),
                       ))
            return
        else:
            # can only be a single one at this point
            target = target.pop()

    if target not in repo.get_remotes():
        yield dict(res_kwargs,
                   status='error',
                   message=("Unknown target sibling '%s'.", target))
        return

    log_progress(lgr.info,
                 pbar_id,
                 "Push refspecs",
                 label="Push to '{}'".format(target),
                 update=1,
                 total=4)

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(target)
    # list of remotes that are publication dependencies for the
    # target remote
    publish_depends = assure_list(ds.config.get(depvar, []))
    if publish_depends:
        lgr.debug("Discovered publication dependencies for '%s': %s'", target,
                  publish_depends)

    # cache repo type
    is_annex_repo = isinstance(ds.repo, AnnexRepo)

    # TODO prevent this when `target` is a special remote
    # (possibly redo) a push attempt to figure out what needs pushing
    # do this on the main target only, and apply the result to all
    # dependencies
    try:
        wannabe_gitpush = repo.push(remote=target, git_options=['--dry-run'])
    except Exception as e:
        lgr.debug(
            'Dry-run push to check push configuration failed, '
            'assume no configuration: %s', e)
        wannabe_gitpush = []
    refspecs2push = [
        # if an upstream branch is set, go with it
        p['from_ref'] if ds.config.get(
            # refs come in as refs/heads/<branchname>
            # need to cut the prefix
            'branch.{}.remote'.format(p['from_ref'][11:]),
            None) == target
        and ds.config.get('branch.{}.merge'.format(p['from_ref'][11:]), None)
        # if not, define target refspec explicitly to avoid having to
        # set an upstream branch, which would happen implicitly from
        # a user's POV, and may also be hard to decide when publication
        # dependencies are present
        else '{}:{}'.format(p['from_ref'], p['to_ref'])
        for p in wannabe_gitpush
        # TODO: what if a publication dependency doesn't have it yet
        # should we not attempt to push, because the main target has it?
        if 'uptodate' not in p['operations'] and (
            # cannot think of a scenario where we would want to push a
            # managed branch directly, instead of the corresponding branch
            'refs/heads/adjusted' not in p['from_ref'])
    ]
    if not refspecs2push:
        lgr.debug(
            'No refspecs configured for push, attempting to use active branch')
        # nothing was set up for push, push the current branch at minimum
        # TODO this is not right with managed branches
        active_branch = repo.get_active_branch()
        if not active_branch:
            yield dict(
                res_kwargs,
                status='impossible',
                message='There is no active branch, cannot determine remote '
                'branch')
            return
        if is_annex_repo:
            # we could face a managed branch, in which case we need to
            # determine the actual one and make sure it is sync'ed with the
            # managed one, and push that one instead. following methods can
            # be called unconditionally
            repo.localsync(managed_only=True)
            active_branch = repo.get_corresponding_branch(
                active_branch) or active_branch
        refspecs2push.append(
            # same dance as above
            active_branch
            if ds.config.get('branch.{}.merge'.format(active_branch), None)
            else '{ab}:{ab}'.format(ab=active_branch))

    # we know what to push and where, now dependency processing first
    for r in publish_depends:
        # simply make a call to this function again, all the same, but
        # target is different, pass done_fetch to avoid duplicate
        # and expensive calls to git-fetch
        yield from _push(
            dspath,
            content,
            # to this particular dependency
            r,
            force,
            jobs,
            res_kwargs.copy(),
            pbars,
            done_fetch=None,
            got_path_arg=got_path_arg,
        )

    # and lastly the primary push target
    target_is_git_remote = repo.config.get('remote.{}.url'.format(target),
                                           None) is not None
    # only attempt, if Git knows about a URL, otherwise this is
    # a pure special remote that doesn't deal with the git repo
    if target_is_git_remote:
        # push the main branches of interest first, but not yet (necessarily)
        # the git-annex branch. We want to push first in order to hit any
        # conflicts or unknown history before we move data. Otherwise our
        # decision making done above (--since ...) might have been
        # inappropriate.
        push_ok = True
        for p in _push_refspecs(repo, target, refspecs2push, force,
                                res_kwargs.copy()):
            if p['status'] not in ('ok', 'notneeded'):
                push_ok = False
            yield p
        if not push_ok:
            # error-type results have been yielded, the local status quo is
            # outdated/invalid, stop to let user decide how to proceed.
            # TODO final global error result for the dataset?!
            return

    # git-annex data move
    #
    if not is_annex_repo:
        return

    if force == 'no-datatransfer':
        lgr.debug("Data transfer to '%s' disabled by argument", target)
        return

    log_progress(lgr.info,
                 pbar_id,
                 "Transfer data",
                 label="Transfer data to '{}'".format(target),
                 update=2,
                 total=4)

    yield from _push_data(
        ds,
        target,
        content,
        force,
        jobs,
        res_kwargs.copy(),
        got_path_arg=got_path_arg,
    )

    if not target_is_git_remote:
        # there is nothing that we need to push or sync with on the git-side
        # of things with this remote
        return

    log_progress(lgr.info,
                 pbar_id,
                 "Update availability information",
                 label="Update availability for '{}'".format(target),
                 update=3,
                 total=4)

    # after file transfer the remote might have different commits to
    # the annex branch. They have to be merged locally, otherwise a
    # push of it further down will fail
    try:
        # fetch remote, let annex sync them locally, so that the push
        # later on works.
        # We have to fetch via the push url (if there is any),
        # not a pull url.
        # The latter might be dumb and without the execution of a
        # post-update hook we might not be able to retrieve the
        # server-side git-annex branch updates (and git-annex does
        # not trigger the hook on copy), but we know we have
        # full access via the push url -- we have just used it to copy.
        lgr.debug("Fetch 'git-annex' branch updates from '%s'", target)
        fetch_cmd = ['fetch', target, 'git-annex']
        pushurl = repo.config.get('remote.{}.pushurl'.format(target), None)
        if pushurl:
            # for some reason overwriting remote.{target}.url
            # does not have any effect...
            fetch_cmd = [
                '-c', 'url.{}.insteadof={}'.format(
                    pushurl,
                    repo.config.get('remote.{}.url'.format(target), None))
            ] + fetch_cmd
            lgr.debug("Sync local annex branch from pushurl after remote "
                      'availability update.')
        repo.call_git(fetch_cmd)
        repo.localsync(target)
    except CommandError as e:
        # it is OK if the remote doesn't have a git-annex branch yet
        # (e.g. fresh repo)
        # TODO is this possible? we just copied? Maybe check if anything
        # was actually copied?
        if "fatal: couldn't find remote ref git-annex" not in e.stderr.lower():
            raise
        lgr.debug('Remote does not have a git-annex branch: %s', e)
    # and push the annex branch to announce local availability info
    # too
    yield from _push_refspecs(
        repo,
        target,
        [
            'git-annex' if ds.config.get('branch.git-annex.merge', None) else
            'git-annex:git-annex'
        ],
        force,
        res_kwargs.copy(),
    )
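
The refspec selection inside `_push` follows one rule: if the branch already has an upstream configured on the target remote, push the plain branch name; otherwise push an explicit `src:dst` refspec so that no upstream branch gets configured implicitly. A hedged sketch with the config lookups mocked as a plain dict (the real code operates on `refs/heads/...` names and cuts that prefix):

def refspec_for(branch, target, config):
    # mirrors the fallback above; `config` stands in for ds.config
    has_upstream = (config.get('branch.{}.remote'.format(branch)) == target
                    and config.get('branch.{}.merge'.format(branch)))
    return branch if has_upstream else '{b}:{b}'.format(b=branch)

cfg = {'branch.main.remote': 'origin',
       'branch.main.merge': 'refs/heads/main'}  # hypothetical
assert refspec_for('main', 'origin', cfg) == 'main'
assert refspec_for('feature', 'origin', cfg) == 'feature:feature'
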
Exemple #25
0
def test_save(path):

    ds = Dataset(path)

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("something")

    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)

    ds.save(message="add a new file")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("modify")

    ok_(ds.repo.dirty)
    ds.save(message="modified new_file.tst")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # save works without ds and files given in the PWD
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("rapunzel")
    with chpwd(path):
        save(message="love rapunzel")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # and also without `-a` when things are staged
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("exotic")
    ds.repo.add("new_file.tst", git=True)
    with chpwd(path):
        save(message="love marsians")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(op.join(path, fn), "w") as f:
            f.write(fn)

    ds.save([op.join(path, f) for f in files])
    # superfluous call to save (all is saved already), should not fail
    # but report that nothing was saved
    assert_status('notneeded', ds.save(message="set of new files"))
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # create subdataset
    subds = ds.create('subds')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(op.join(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.save()
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    # ensure modified subds is committed
    ds.save()
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # now introduce a change downstairs
    subds.create('someotherds')
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # and save via subdataset path
    ds.save('subds', version_tag='new_sub')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    tags = ds.repo.get_tags()
    ok_(len(tags) == 1)
    eq_(tags[0], dict(hexsha=ds.repo.get_hexsha(), name='new_sub'))
    # fails when retagged, like git does
    res = ds.save(version_tag='new_sub', on_failure='ignore')
    assert_status('error', res)
    assert_result_count(res,
                        1,
                        action='save',
                        type='dataset',
                        path=ds.path,
                        message=('cannot tag this version: %s',
                                 "fatal: tag 'new_sub' already exists"))
Exemple #26
0
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.rev_save(path='subdsfile.txt', message="Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.rev_create(_path_('dir/subds2'), force=True)
    subdirds.rev_save('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit',
                'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir',
                  'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add the submodule, so use ds.rev_save
    ds.rev_save(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.rev_save('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule  by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file('fromweb',
                            topurl + '/noteventhere',
                            options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # the subdataset should have its json created and deleted
                # only when recursive=True
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its 'nodes' sublist too (used by web-ui json; regression test)
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of subdataset
                subds = [
                    item for item in dsj['nodes']
                    if item['name'] in ('subdsfile.txt', 'subds')
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] in ('subdsfile.txt', 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(topds_nodes['fromweb']['size']['total'],
                             UNKNOWN_SIZE)
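
The `get_metahash` helper above shows how metadata file names are derived: the relative path (or `/` for the root node) is MD5-hashed. A self-contained sketch of the same scheme:

import hashlib
from os.path import join as opj

def metahash(*path):
    # the root node is keyed by '/', everything else by its joined relpath
    if not path:
        path = ['/']
    return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

# e.g. the metadata for dir/subdir would live under
# .git/datalad/metadata/<md5 of 'dir/subdir'>
print(metahash('dir', 'subdir'))
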
Exemple #27
0
def test_save_hierarchy(path):
    # this test doesn't use the API's `remove` to avoid circularities
    ds = make_demo_hierarchy_datasets(path, demo_hierarchy)
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    ds_bb = Dataset(opj(ds.path, 'b', 'bb'))
    ds_bba = Dataset(opj(ds_bb.path, 'bba'))
    ds_bbaa = Dataset(opj(ds_bba.path, 'bbaa'))
    # introduce a change at the lowest level
    ds_bbaa.repo.remove('file_bbaa')
    for d in (ds, ds_bb, ds_bba, ds_bbaa):
        ok_(d.repo.dirty)
    # need to give file specifically, otherwise it will simply just preserve
    # staged changes
    ds_bb.save(path=opj(ds_bbaa.path, 'file_bbaa'))
    # it has saved all changes in the subtrees spanned
    # by the given datasets, but nothing else
    for d in (ds_bb, ds_bba, ds_bbaa):
        ok_clean_git(d.path)
    ok_(ds.repo.dirty)
    # now with two modified repos
    d = Dataset(opj(ds.path, 'd'))
    da = Dataset(opj(d.path, 'da'))
    da.repo.remove('file_da')
    db = Dataset(opj(d.path, 'db'))
    db.repo.remove('file_db')
    # generator
    d.save(recursive=True)
    for d in (d, da, db):
        ok_clean_git(d.path)
    ok_(ds.repo.dirty)
    # and now with files all over the place and saving
    # all the way to the root
    aa = Dataset(opj(ds.path, 'a', 'aa'))
    aa.repo.remove('file_aa')
    ba = Dataset(opj(ds.path, 'b', 'ba'))
    ba.repo.remove('file_ba')
    bb = Dataset(opj(ds.path, 'b', 'bb'))
    bb.repo.remove('file_bb')
    c = Dataset(opj(ds.path, 'c'))
    c.repo.remove('file_c')
    ca = Dataset(opj(ds.path, 'c', 'ca'))
    ca.repo.remove('file_ca')
    d = Dataset(opj(ds.path, 'd'))
    d.repo.remove('file_d')
    ds.save(
        # append trailing slashes to the path to indicate that we want to
        # have the staged content in the dataset saved, rather than only the
        # subdataset state in the respective superds.
        # an alternative would have been to pass `save` annotated paths of
        # type {'path': dspath, 'process_content': True} for each dataset
        # in question, but here we want to test how this would most likely
        # by used from cmdline
        path=[opj(p, '')
               for p in (aa.path, ba.path, bb.path, c.path, ca.path, d.path)],
        super_datasets=True)
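
The trailing-slash trick in the final `save` call is worth isolating: `os.path.join(p, '')` merely appends the path separator, and `save` interprets such paths as "save the dataset's staged content, not just its recorded state in the superdataset". A quick illustration (paths hypothetical, POSIX separators assumed):

from os.path import join as opj

paths = ['/tmp/ds/a/aa', '/tmp/ds/c']  # hypothetical dataset paths
# joining an empty component appends a trailing separator
print([opj(p, '') for p in paths])   # ['/tmp/ds/a/aa/', '/tmp/ds/c/']
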
Exemple #28
0
def test_unlock(path):

    ds = Dataset(path)

    # file is currently locked:
    # TODO: use get_annexed_files instead of hardcoded filename
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # in direct mode there is no unlock:
    if ds.repo.is_direct_mode():
        res = ds.unlock()
        assert_result_count(res, 1)
        assert_status('notneeded', res)

    # in V6 we can unlock even if the file's content isn't present:
    elif ds.repo.config.getint("annex", "version") == 6:
        res = ds.unlock()
        assert_result_count(res, 1)
        assert_status('ok', res)
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    else:
        # cannot unlock without content (annex get wasn't called)
        assert_raises(CommandError, ds.unlock)  # FIXME

    ds.repo.get('test-annex.dat')
    result = ds.unlock()
    assert_result_count(result, 1)
    if ds.repo.is_direct_mode():
        assert_status('notneeded', result)
    else:
        assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content")

    ds.repo.add('test-annex.dat')
    # in V6 we need to explicitly re-lock it:
    if ds.repo.config.getint("annex", "version") == 6:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    if not ds.repo.is_direct_mode():
        # after commit, file is locked again:
        assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content", f.read())

    # unlock again, this time more specific:
    result = ds.unlock(path='test-annex.dat')
    assert_result_count(result, 1)

    if ds.repo.is_direct_mode():
        assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='notneeded')
    else:
        assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content again")

    ds.repo.add('test-annex.dat')
    # in V6 we need to explicitly re-lock it:
    if ds.repo.config.getint("annex", "version") == 6:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    # TODO:
    # BOOOM: test-annex.dat writeable in V6!
    # Why the hell is this different from the first time we wrote to the file
    # and locked it again?
    # Also: after opening, the file is empty.

    if not ds.repo.is_direct_mode():
        # after commit, file is locked again:
        assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content again", f.read())
Example #29
def test_get_subdatasets(path):
    ds = Dataset(path)
    # one more subdataset with a name that could ruin config option parsing
    dots = str(Path('subdir') / '.lots.of.dots.')
    ds.create(dots)
    eq_(ds.subdatasets(recursive=True, fulfilled=False, result_xfm='relpaths'), [
        'sub dataset1'
    ])
    ds.get('sub dataset1')
    eq_(ds.subdatasets(recursive=True, fulfilled=False, result_xfm='relpaths'), [
        'sub dataset1/2',
        'sub dataset1/sub sub dataset1',
        'sub dataset1/subm 1',
    ])
    # obtain key subdataset, so all leaf subdatasets are discoverable
    ds.get(opj('sub dataset1', 'sub sub dataset1'))
    eq_(ds.subdatasets(result_xfm='relpaths'), ['sub dataset1', dots])
    eq_([(r['parentds'], r['path']) for r in ds.subdatasets()],
        [(path, opj(path, 'sub dataset1')),
         (path, opj(path, dots))])
    all_subs = [
        'sub dataset1',
        'sub dataset1/2',
        'sub dataset1/sub sub dataset1',
        'sub dataset1/sub sub dataset1/2',
        'sub dataset1/sub sub dataset1/subm 1',
        'sub dataset1/subm 1',
        dots,
    ]
    eq_(ds.subdatasets(recursive=True, result_xfm='relpaths'), all_subs)
    with chpwd(str(ds.pathobj)):
        # imitate cmdline invocation w/ no dataset argument
        eq_(subdatasets(dataset=None,
                        path=[],
                        recursive=True,
                        result_xfm='relpaths'),
            all_subs)

    # redo, but limit to specific paths
    eq_(
        ds.subdatasets(
            path=['sub dataset1/2', 'sub dataset1/sub sub dataset1'],
            recursive=True, result_xfm='relpaths'),
        [
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/sub sub dataset1/2',
            'sub dataset1/sub sub dataset1/subm 1',
        ]
    )
    eq_(
        ds.subdatasets(
            path=['sub dataset1'],
            recursive=True, result_xfm='relpaths'),
        [
            'sub dataset1',
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/sub sub dataset1/2',
            'sub dataset1/sub sub dataset1/subm 1',
            'sub dataset1/subm 1',
        ]
    )
    with chpwd(str(ds.pathobj / 'subdir')):
        # imitate cmdline invocation w/ no dataset argument
        # -> curdir limits the query when no info is given
        eq_(subdatasets(dataset=None,
                        path=[],
                        recursive=True,
                        result_xfm='paths'),
            [str(ds.pathobj / dots)]
        )
        # but with a dataset explicitly given, even if just as a path,
        # curdir does not limit the query
        eq_(subdatasets(dataset=os.pardir,
                        path=None,
                        recursive=True,
                        result_xfm='relpaths'),
            ['sub dataset1',
             'sub dataset1/2',
             'sub dataset1/sub sub dataset1',
             'sub dataset1/sub sub dataset1/2',
             'sub dataset1/sub sub dataset1/subm 1',
             'sub dataset1/subm 1',
             dots]
        )
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, bottomup=True, result_xfm='relpaths'), [
        'sub dataset1/2',
        'sub dataset1/sub sub dataset1/2',
        'sub dataset1/sub sub dataset1/subm 1',
        'sub dataset1/sub sub dataset1',
        'sub dataset1/subm 1',
        'sub dataset1',
        dots,
    ])
    eq_(ds.subdatasets(recursive=True, fulfilled=True, result_xfm='relpaths'), [
        'sub dataset1',
        'sub dataset1/sub sub dataset1',
        dots,
    ])
    eq_([(relpath(r['parentds'], start=ds.path), relpath(r['path'], start=ds.path))
         for r in ds.subdatasets(recursive=True)], [
        (os.curdir, 'sub dataset1'),
        ('sub dataset1', 'sub dataset1/2'),
        ('sub dataset1', 'sub dataset1/sub sub dataset1'),
        ('sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/2'),
        ('sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/subm 1'),
        ('sub dataset1', 'sub dataset1/subm 1'),
        (os.curdir, dots),
    ])
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, recursion_limit=0),
        [])
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, recursion_limit=1, result_xfm='relpaths'),
        ['sub dataset1', dots])
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, recursion_limit=2, result_xfm='relpaths'),
        [
        'sub dataset1',
        'sub dataset1/2',
        'sub dataset1/sub sub dataset1',
        'sub dataset1/subm 1',
        dots,
    ])
    res = ds.subdatasets(recursive=True)
    assert_status('ok', res)
    for r in res:
        #for prop in ('gitmodule_url', 'state', 'revision', 'gitmodule_name'):
        for prop in ('gitmodule_url', 'revision', 'gitmodule_name'):
            assert_in(prop, r)
        # random property is unknown
        assert_not_in('mike', r)

    # now add info to all datasets
    res = ds.subdatasets(
        recursive=True,
        set_property=[('mike', 'slow'),
                      ('expansion', '<{refds_relname}>')])
    assert_status('ok', res)
    for r in res:
        eq_(r['gitmodule_mike'], 'slow')
        eq_(r['gitmodule_expansion'], relpath(r['path'], r['refds']).replace(os.sep, '-'))
    # plain query again to see if it got into the files
    res = ds.subdatasets(recursive=True)
    assert_status('ok', res)
    for r in res:
        eq_(r['gitmodule_mike'], 'slow')
        eq_(r['gitmodule_expansion'], relpath(r['path'], r['refds']).replace(os.sep, '-'))

    # and remove again
    res = ds.subdatasets(recursive=True, delete_property='mike')
    assert_status('ok', res)
    for r in res:
        for prop in ('gitmodule_mike',):
            assert_not_in(prop, r)
    # and query again, because the call above yields results of the on-the-fly edit
    res = ds.subdatasets(recursive=True)
    assert_status('ok', res)
    for r in res:
        for prop in ('gitmodule_mike',):
            assert_not_in(prop, r)

    #
    # test --contains
    #
    target_sub = 'sub dataset1/sub sub dataset1/subm 1'
    # give the closest direct subdataset
    eq_(ds.subdatasets(contains=opj(target_sub, 'something_inside'),
                       result_xfm='relpaths'),
        ['sub dataset1'])
    # should find the actual subdataset trail
    eq_(ds.subdatasets(recursive=True,
                       contains=opj(target_sub, 'something_inside'),
                       result_xfm='relpaths'),
        ['sub dataset1',
         'sub dataset1/sub sub dataset1',
         'sub dataset1/sub sub dataset1/subm 1'])
    # doesn't affect recursion limit
    eq_(ds.subdatasets(recursive=True, recursion_limit=2,
                       contains=opj(target_sub, 'something_inside'),
                       result_xfm='relpaths'),
        ['sub dataset1',
         'sub dataset1/sub sub dataset1'])
    # for a direct dataset path match, return the matching dataset
    eq_(ds.subdatasets(recursive=True,
                       contains=target_sub,
                       result_xfm='relpaths'),
        ['sub dataset1',
         'sub dataset1/sub sub dataset1',
         'sub dataset1/sub sub dataset1/subm 1'])
    # but it has to be a subdataset, otherwise no match
    # which is what get_containing_subdataset() used to do
    assert_status('impossible',
                  ds.subdatasets(contains=ds.path, on_failure='ignore'))

    # 'impossible' if contains is nonsense
    assert_status('impossible',
                  ds.subdatasets(recursive=True,
                                 contains='impossible_yes',
                                 on_failure='ignore'))

    assert_status('impossible',
                  ds.subdatasets(recursive=True,
                                 contains=opj(pardir, 'impossible_yes'),
                                 on_failure='ignore'))

    eq_(ds.subdatasets(
        recursive=True,
        contains=[target_sub, 'sub dataset1/2'],
        result_xfm='relpaths'), [
        'sub dataset1',
        'sub dataset1/2',
        'sub dataset1/sub sub dataset1',
        'sub dataset1/sub sub dataset1/subm 1',
    ])
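To summarize the `contains` semantics exercised above: probing with a path inside a nested subdataset reports every dataset on the trail leading down to it. A compact sketch, with a hypothetical probe file name:

# sketch: `contains` reports the full trail of datasets above the probe
trail = ds.subdatasets(
    recursive=True,
    contains=opj('sub dataset1', 'sub sub dataset1', 'subm 1', 'probe.dat'),
    result_xfm='relpaths')
# expected, per the assertions above:
# ['sub dataset1',
#  'sub dataset1/sub sub dataset1',
#  'sub dataset1/sub sub dataset1/subm 1']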
Example #30
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 action=None,
                 unavailable_path_status='',
                 unavailable_path_msg=None,
                 nondataset_path_status='error',
                 force_parentds_discovery=True,
                 force_subds_discovery=True,
                 force_no_revision_change_discovery=True,
                 force_untracked_discovery=True,
                 modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None
                                     or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)"
            )

        # prep common result props
        res_kwargs = dict(action=action if action else 'annotate_path',
                          refds=refds_path,
                          logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(refds, refds_path, action,
                                         recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if _with_sep(p).startswith(_with_sep(refds_path)):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(**dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[p] = res
                    yield res

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen whenever `path` is a subdataset that was
                # discovered earlier via recursive processing of another path
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or
                                           (refds_path
                                            and _with_sep(oneupdir).startswith(
                                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(
                            normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath`, not `parent`: we only need to know whether there is
            # ANY dataset, not which one is the true parent; the logic below relies on
            # the fact that we end up here if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not _with_sep(dspath).startswith(
                    _with_sep(refds_path)):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                (path_type == 'dataset'
                 and 'registered_subds' not in path_props)
                    or path_type == 'directory' or not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is even
                # a dataset (but without this info) -> record whether it is a
                # known subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(fulfilled=None,
                                                   recursive=False,
                                                   result_xfm=None,
                                                   result_filter=None,
                                                   return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a dataset, it is just not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get('status',
                                               unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(
                        parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action,
                                         recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=
                        force_no_revision_change_discovery,
                        report_untracked='all'
                        if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
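Since this `__call__` is a generator of result dictionaries, callers typically iterate over it and branch on the yielded properties. A consumption sketch; the class name `AnnotatePaths` and the invocation style are assumptions, while the property names come from the code above:

# sketch: consuming annotated-path results (class name is an assumption)
for res in AnnotatePaths.__call__(path=['some/file'], dataset='.'):
    if res.get('status') in ('error', 'impossible'):
        continue  # path could not be annotated
    # each result carries e.g. 'path', 'type', 'parentds', 'state'
    print(res['path'], res.get('type'), res.get('parentds'))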