Example #1
def _describe_extensions():
    infos = {}
    from datalad.support.entrypoints import iter_entrypoints
    from importlib import import_module

    for ename, emod, eload in iter_entrypoints('datalad.extensions'):
        info = {}
        infos[ename] = info
        try:
            ext = eload()
            info['load_error'] = None
            info['description'] = ext[0]
            info['module'] = emod
            mod = import_module(emod, package='datalad')
            info['version'] = getattr(mod, '__version__', None)
        except Exception as e:
            ce = CapturedException(e)
            info['load_error'] = ce.format_short()
            continue
        info['entrypoints'] = entry_points = {}
        for ep in ext[1]:
            ep_info = {
                'module': ep[0],
                'class': ep[1],
                'names': ep[2:],
            }
            entry_points['{}.{}'.format(*ep[:2])] = ep_info
            try:
                import_module(ep[0], package='datalad')
                ep_info['load_error'] = None
            except Exception as e:
                ce = CapturedException(e)
                ep_info['load_error'] = ce.format_short()
                continue
    return infos
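A minimal sketch of what iter_entrypoints('datalad.extensions') provides, approximated with the standard library alone (this assumes Python >= 3.10 and an installed datalad with extensions; it is an illustration, not DataLad's implementation):

from importlib.metadata import entry_points

for ep in entry_points(group='datalad.extensions'):
    name = ep.name                   # extension name as registered
    module = ep.value.split(':')[0]  # module part of the 'pkg.mod:attr' value
    try:
        spec = ep.load()             # loading stays deferred until requested
    except Exception as exc:
        print(f'{name} ({module}): load error: {exc}')
        continue
    print(f'{name} ({module}): {spec[0]}')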
Example #2
def _describe_metadata_elements(group):
    infos = {}
    from datalad.support.entrypoints import iter_entrypoints
    from importlib import import_module
    if sys.version_info < (3, 10):
        # 3.10 is when it was no longer provisional
        from importlib_metadata import distribution
    else:
        from importlib.metadata import distribution

    for ename, emod, eload in iter_entrypoints(group):
        info = {}
        infos[ename] = info
        try:
            info['module'] = emod
            dist = distribution(emod.split('.', maxsplit=1)[0])
            info['distribution'] = f'{dist.name} {dist.version}'
            mod = import_module(emod, package='datalad')
            version = getattr(mod, '__version__', None)
            if version:
                # do not clutter the report with a missing version
                info['version'] = version
            eload()
            info['load_error'] = None
        except Exception as e:
            ce = CapturedException(e)
            info['load_error'] = ce.format_short()
            continue
    return infos
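The distribution() lookup above maps an entry point's module to the installed package that ships it. A small hedged sketch of that mapping in isolation (assumes Python >= 3.10 and an installed datalad; the dotted module name is just an example):

from importlib import import_module
from importlib.metadata import distribution

emod = 'datalad.api'                              # example entry-point module
dist = distribution(emod.split('.', maxsplit=1)[0])
print(f'{dist.name} {dist.version}')              # providing distribution
mod = import_module(emod)
print(getattr(mod, '__version__', None))          # module-level version, if any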
Example #3
def add_entrypoints_to_interface_groups(interface_groups):
    from datalad.support.entrypoints import iter_entrypoints
    for name, _, spec in iter_entrypoints('datalad.extensions', load=True):
        if len(spec) < 2 or not spec[1]:
            # entrypoint identity was logged by the iterator already
            lgr.debug('Extension does not provide a command suite')
            continue
        interface_groups.append((name, spec[0], spec[1]))
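The shape of the loaded spec is assumed rather than shown above. The sketch below spells out the conventional command-suite tuple an extension's 'datalad.extensions' entry point is expected to resolve to; every name in it is illustrative, not taken from a real extension:

command_suite = (
    # spec[0]: human-readable description of the command suite
    "Demo DataLad extension",
    # spec[1]: list of command specifications appended to interface_groups
    [
        # (module, class name, command-line name, Python API name)
        ('datalad_demo.hello', 'HelloWorld', 'demo-hello', 'demo_hello'),
    ],
)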
Example #4
def check_api(annex, path):
    ds = Dataset(path).create(force=True, annex=annex)
    ds.save()
    assert_repo_status(ds.path)

    processed_extractors, skipped_extractors = [], []
    for ename, emod, eload in iter_entrypoints('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = eload()
        except Exception as exc:
            exc_ = str(exc)
            skipped_extractors += [exc_]
            continue
        extractor = extractor_cls(ds, paths=['file.dat'])
        meta = extractor.get_metadata(dataset=True, content=True)
        # we also get something for the dataset and something for the content
        # even if any of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta
        assert (isinstance(dsmeta, dict))
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that the generator does not blow up and has an entry for our
        # precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about our
        # precious file
        if ename == 'datalad_core':
            assert 'file.dat' in cm
        elif ename == 'annex':
            if annex:
                # verify correct key, which is the same for all files of 0 size
                assert_equal(cm['file.dat']['key'],
                             'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat')
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(ename)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor extractor"
    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
Example #5
def _generate_extension_api():
    """Auto detect all available extensions and generate an API from them
    """
    from datalad.support.entrypoints import iter_entrypoints
    from datalad.interface.base import (
        get_api_name,
        load_interface,
    )

    import logging
    lgr = logging.getLogger('datalad.api')

    for ename, _, (grp_descr, interfaces) in iter_entrypoints(
            'datalad.extensions', load=True):
        for intfspec in interfaces:
            # turn the interface spec into an instance
            intf = load_interface(intfspec[:2])
            api_name = get_api_name(intfspec)
            if api_name in globals():
                lgr.debug(
                    'Command %s from extension %s is replacing a previously loaded implementation',
                    api_name, ename)
            globals()[api_name] = intf.__call__
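The net effect of the loop above is that extension commands become attributes of datalad.api next to the built-in ones. A hedged usage sketch (assumes an installed datalad; the extension command in the comment is an example and only exists if the corresponding extension is installed):

import datalad.api as dl

print(hasattr(dl, 'create'))            # core command, always present
# print(hasattr(dl, 'containers_add'))  # added by datalad-container, if installed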
Example #6
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedures: path, name, configuration, and a help message

    The order of consideration is user-level, system-level, extra locations,
    dataset, datalad extensions, datalad. Local definitions/configurations
    therefore take precedence over ones that come from outside (via a datalad
    extension or a dataset with its .datalad/config). If a dataset had
    precedence (as it did before), adding or merely updating a (sub-)dataset
    could surprisingly cause you to execute code different from what you
    defined within ~/.gitconfig or your local repository's .git/config.
    So local definitions take precedence over remote ones, and more specific
    ones over more general ones.

    Yields
    ------
    tuple
      path, name, format string, help message
    """

    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures'),
                cfg.get('datalad.locations.extra-procedures', get_all=True)):
        for dir in ensure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (
                    m,
                    n,
                ) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = ensure_list(
            ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (
                    m,
                    n,
                ) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, n, f, h in _get_procedure_implementation(name=name,
                                                            ds=subds):
                yield m, n, f, h

    # 3. check extensions for procedure
    from datalad.support.entrypoints import iter_entrypoints
    # delay heavy import until here
    from pkg_resources import (
        resource_filename,
        resource_isdir,
    )
    for epname, epmodule, _ in iter_entrypoints('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
        if resource_isdir(epmodule, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(epmodule, 'resources/procedures'), name):
                yield (
                    m,
                    n,
                ) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'), name):
        yield (
            m,
            n,
        ) + _get_proc_config(n)
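A hedged usage sketch of the generator above: enumerate all procedures visible without a dataset context, in the precedence order described in the docstring (this assumes _get_procedure_implementation is in scope; it yields path, name, format string, help message):

for path, name, fmt, help_msg in _get_procedure_implementation():
    print(f'{name}: {path}')
    if help_msg:
        print(f'  {help_msg.splitlines()[0]}')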
Example #7
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are False and not just None
        return dsmeta, contentmeta, errored

    context = {
        '@vocab':
        'http://docs.datalad.org/schema_v{}.json'.format(vocabulary_version)
    }

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        # materialize the pairing, it is iterated more than once below
        content_info = list(zip(paths, ds.repo.file_has_content(paths),
                                ds.repo.is_under_annex(paths)))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as not present
            lgr.warning('{} files have no content present, '
                        'some extractors will not operate on {}'.format(
                            nocontent, 'them' if nocontent > 10 else
                            [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [
        re.compile(bl) for bl in ensure_list(
            ds.config.obtain('datalad.metadata.aggregate-ignore-fields',
                             default=[]))
    ]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep the import local, who knows what some extractors might pull in
    from datalad.support.entrypoints import iter_entrypoints
    extractors = {
        ename: eload
        for ename, _, eload in iter_entrypoints('datalad.metadata.extractors')
    }

    # we said that we want to fail, rather than just moan about less metadata
    # Do an early check that all extractors are available, so as not to wait
    # hours and then crash for some obvious reason
    absent_extractors = [t for t in types if t not in extractors]
    if absent_extractors:
        raise ValueError(
            '%d enabled metadata extractor%s not available in this installation'
            ': %s' % (len(absent_extractors),
                      single_or_plural(" is", "s are", len(absent_extractors)),
                      ', '.join(absent_extractors)))

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s',
        ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(lgr.info,
                     'metadataextractors',
                     'Engage %s metadata extractor',
                     mtype_key,
                     update=1,
                     increment=True)
        try:
            extractor_cls = extractors[mtype_key]()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s',
                mtype_key,
                ds,
            )
            raise ValueError("Failed to load metadata extractor for '%s', "
                             "broken dataset configuration (%s)?" %
                             (mtype, ds)) from e
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta
                if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta
                if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata (%s): %s', mtype,
                      CapturedException(e))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s',
                    mtype_key,
                    ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(dsmeta_t,
                                                   maxsize=max_fieldsize,
                                                   blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude",
                                           set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that the log message
        #  actually seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to get
            # a list of files that have no metadata)
            # if an extractor needlessly produces empty records, the extractor
            # should be fixed rather than adding a general switch here. For
            # example, datalad_core issues empty records to document the
            # presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(meta,
                                           maxsize=max_fieldsize,
                                           blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain('datalad.metadata.generate-unique-{}'.format(
                    mtype_key.replace('_', '-')),
                                default=True,
                                valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in meta.items():
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list.
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: a list of samples that
                        # data were recorded from. If the dataset has such a
                        # list under a 'sample' key, we should prefer that over
                        # an aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})

            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v) for k, v in val.items()}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [
                    _ensure_serializable(i)
                    for i in sorted(v, key=_unique_value_key)
                ] if v is not None else None
                for k, v in unique_cm.items()
                # v is None: unique reporting disabled, but there was a value at some point
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})
            }
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s',
        ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
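For orientation, a sketch of the three values _get_metadata() returns; keys and values are illustrative, not output from a real extractor run:

dsmeta = {
    '@context': {'@vocab': 'http://docs.datalad.org/schema_v2.0.json'},
    'datalad_core': {'name': 'demo-dataset'},         # per-extractor dataset metadata
    'datalad_unique_content_properties': {            # unique values seen in content
        'datalad_core': {'tag': ['derived', 'raw']},
    },
}
contentmeta = {
    'file.dat': {'datalad_core': {'tag': 'raw'}},     # per-file, per-extractor metadata
}
errored = False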