def _describe_extensions():
    infos = {}
    from datalad.support.entrypoints import iter_entrypoints
    from importlib import import_module

    for ename, emod, eload in iter_entrypoints('datalad.extensions'):
        info = {}
        infos[ename] = info
        try:
            ext = eload()
            info['load_error'] = None
            info['description'] = ext[0]
            info['module'] = emod
            mod = import_module(emod, package='datalad')
            info['version'] = getattr(mod, '__version__', None)
        except Exception as e:
            ce = CapturedException(e)
            info['load_error'] = ce.format_short()
            continue
        info['entrypoints'] = entry_points = {}
        for ep in ext[1]:
            ep_info = {
                'module': ep[0],
                'class': ep[1],
                'names': ep[2:],
            }
            entry_points['{}.{}'.format(*ep[:2])] = ep_info
            try:
                import_module(ep[0], package='datalad')
                ep_info['load_error'] = None
            except Exception as e:
                ce = CapturedException(e)
                ep_info['load_error'] = ce.format_short()
                continue
    return infos
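# A minimal usage sketch for the report built above; `_render_extension_report`
# is a hypothetical helper for illustration, not part of the DataLad API.
def _render_extension_report():
    # one line per installed extension: name, version, and load status
    for ename, info in _describe_extensions().items():
        status = info['load_error'] or 'OK'
        print(f"{ename} {info.get('version') or '?'}: {status}")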
def _describe_metadata_elements(group):
    infos = {}
    from datalad.support.entrypoints import iter_entrypoints
    from importlib import import_module
    if sys.version_info < (3, 10):
        # 3.10 is when importlib.metadata was no longer provisional
        from importlib_metadata import distribution
    else:
        from importlib.metadata import distribution

    for ename, emod, eload in iter_entrypoints(group):
        info = {}
        infos[ename] = info
        try:
            info['module'] = emod
            dist = distribution(emod.split('.', maxsplit=1)[0])
            info['distribution'] = f'{dist.name} {dist.version}'
            mod = import_module(emod, package='datalad')
            version = getattr(mod, '__version__', None)
            if version:
                # do not clutter the report when there is no version
                info['version'] = version
            eload()
            info['load_error'] = None
        except Exception as e:
            ce = CapturedException(e)
            info['load_error'] = ce.format_short()
            continue
    return infos
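# Usage sketch: `group` is an entry point group name, e.g. the
# 'datalad.metadata.extractors' group queried elsewhere in this code base;
# `_render_element_report` is a hypothetical helper for illustration.
def _render_element_report(group='datalad.metadata.extractors'):
    for ename, info in _describe_metadata_elements(group).items():
        err = info.get('load_error')
        print(f"{ename} [{info.get('distribution', 'unknown')}]: "
              f"{'FAILED: ' + err if err else 'OK'}")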
def add_entrypoints_to_interface_groups(interface_groups):
    from datalad.support.entrypoints import iter_entrypoints

    for name, _, spec in iter_entrypoints('datalad.extensions', load=True):
        if len(spec) < 2 or not spec[1]:
            # entrypoint identity was logged by the iterator already
            lgr.debug('Extension does not provide a command suite')
            continue
        interface_groups.append((name, spec[0], spec[1]))
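# Usage sketch: `interface_groups` is a mutable list of
# (name, description, interfaces) triples that loaded extension suites get
# appended to. An interface spec is a (module, class, *names) tuple, as also
# indexed by _describe_extensions() above. All names below are hypothetical.
def _demo_interface_groups():
    groups = [
        ('demo', 'Demo built-in group',
         [('datalad_demo.commands', 'DemoCommand')]),
    ]
    add_entrypoints_to_interface_groups(groups)
    return groups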
def check_api(annex, path):
    ds = Dataset(path).create(force=True, annex=annex)
    ds.save()
    assert_repo_status(ds.path)

    processed_extractors, skipped_extractors = [], []
    for ename, emod, eload in iter_entrypoints('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = eload()
        except Exception as exc:
            skipped_extractors += [str(exc)]
            continue
        extractor = extractor_cls(ds, paths=['file.dat'])
        meta = extractor.get_metadata(dataset=True, content=True)
        # we get something for the dataset and something for the content,
        # even if either of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta
        assert isinstance(dsmeta, dict)
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that the generator does not blow up and has an entry for
        # our precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about our
        # precious file
        if ename == 'datalad_core':
            assert 'file.dat' in cm
        elif ename == 'annex':
            if annex:
                # verify the correct key, which is the same for all files of
                # zero size
                assert_equal(
                    cm['file.dat']['key'],
                    'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat')
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(ename)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor"
    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
def _generate_extension_api():
    """Auto detect all available extensions and generate an API from them"""
    from datalad.support.entrypoints import iter_entrypoints
    from datalad.interface.base import (
        get_api_name,
        load_interface,
    )

    import logging
    lgr = logging.getLogger('datalad.api')

    for ename, _, (grp_descr, interfaces) in iter_entrypoints(
            'datalad.extensions', load=True):
        for intfspec in interfaces:
            # turn the interface spec into an instance
            intf = load_interface(intfspec[:2])
            api_name = get_api_name(intfspec)
            if api_name in globals():
                lgr.debug(
                    'Command %s from extension %s is replacing a previously '
                    'loaded implementation',
                    api_name, ename)
            globals()[api_name] = intf.__call__
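# Sketch of the effect: the function mutates this module's namespace, so
# extension commands become callable attributes. `_list_generated_api` is a
# hypothetical helper for illustration only (assumes it is defined in the
# same module as _generate_extension_api).
def _list_generated_api():
    before = set(globals())
    _generate_extension_api()
    # names contributed by installed extensions on this run
    return sorted(set(globals()) - before)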
def _get_procedure_implementation(name='*', ds=None):
    """Get potential procedures: path, name, configuration, and a help message

    The order of consideration is user-level, system-level, extra locations,
    dataset, datalad extensions, datalad itself. Therefore local
    definitions/configurations take precedence over ones that come from
    outside (via a datalad extension or a dataset with its .datalad/config).
    If a dataset had precedence (as it used to), the addition (or just an
    update) of a (sub-)dataset could surprisingly cause you to execute code
    different from what you defined within ~/.gitconfig or your local
    repository's .git/config. So, local definitions take precedence over
    remote ones, and more specific ones over more general ones.

    Yields
    ------
    tuple
      path, name, format string, help message
    """
    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures'),
                cfg.get('datalad.locations.extra-procedures', get_all=True)):
        for dir in ensure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, n) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = ensure_list(
            ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, n) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            yield from _get_procedure_implementation(name=name, ds=subds)

    # 3. check extensions for procedure
    from datalad.support.entrypoints import iter_entrypoints
    # delay heavy import until here
    from pkg_resources import (
        resource_filename,
        resource_isdir,
    )
    for epname, epmodule, _ in iter_entrypoints('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
        if resource_isdir(epmodule, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(epmodule, 'resources/procedures'),
                    name):
                yield (m, n) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'), name):
        yield (m, n) + _get_proc_config(n)
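# Usage sketch: a first-seen procedure name shadows later ones, matching the
# precedence order documented in the docstring above. `_list_procedures` is
# a hypothetical helper for illustration only.
def _list_procedures(ds=None):
    seen = set()
    for path, pname, _fmt, _help in _get_procedure_implementation(ds=ds):
        if pname not in seen:
            seen.add(pname)
            print(f'{pname}: {path}')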
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths),
                           ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file
            # as not present
            lgr.warning(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                    else [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [re.compile(bl) for bl in ensure_list(ds.config.obtain(
        'datalad.metadata.aggregate-ignore-fields', default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from datalad.support.entrypoints import iter_entrypoints
    extractors = {
        ename: eload
        for ename, _, eload in iter_entrypoints('datalad.metadata.extractors')
    }

    # we said that we want to fail, rather than just moan about less
    # metadata. Do an early check whether all extractors are available, so
    # we do not wait for hours and then crash for an obvious reason.
    absent_extractors = [t for t in types if t not in extractors]
    if absent_extractors:
        raise ValueError(
            '%d enabled metadata extractor%s not available in this '
            'installation: %s' %
            (len(absent_extractors),
             single_or_plural(" is", "s are", len(absent_extractors)),
             ', '.join(absent_extractors)))

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)

        try:
            extractor_cls = extractors[mtype_key]()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?" % (mtype, ds)) from e

        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata (%s): %s',
                      mtype, CapturedException(e))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(
            extractor_cls, "_unique_exclude", set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        # log_progress, but if they are all generators, we could provide
        # generic handling of the progress here. Note also that the log
        # message seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to
            # get a list of files that have no metadata)
            # if there is an issue that an extractor needlessly produces
            # empty records, the extractor should be fixed and not a general
            # switch. For example the datalad_core extractor issues empty
            # records to document the presence of a file
            # elif not meta:
            #     continue

            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict
            # possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(
                        mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique
                # keys and values into `dsmeta`
                for k, v in meta.items():
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea about
                        # a key, we skip it from the unique list. the point
                        # of the list is to make missing info about content
                        # known in the dataset, not to blindly duplicate
                        # metadata. Example: list of samples data were
                        # recorded from. If the dataset has such a list
                        # under a 'sample' key, we should prefer that over
                        # an aggregated list of a hopefully-kinda-ok
                        # structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the
                        # purpose of discovering whole datasets. we keep the
                        # key (so we know that some file is providing this
                        # key), but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding the unique
            # values (a list). we cannot guarantee the same order of
            # discovery, hence even when not using a set above we would
            # still need sorting. the challenge is that any value can be an
            # arbitrarily complex nested beast. we also want to have each
            # unique value set always come in a top-level list, so we know
            # if some unique value was a list, as opposed to a list of
            # unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {
                        k: _ensure_serializable(v)
                        for k, v in val.items()
                    }
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(v, key=_unique_value_key)]
                if v is not None else None
                for k, v in unique_cm.items()
                # v == None (disable unique, but there was a value at some
                # point). otherwise we only want actual values, and also no
                # single-item-lists of a non-value. those contribute no
                # information, but bloat the operation (inflated number of
                # keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})
            }
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
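# Usage sketch (hypothetical dataset path; 'datalad_core' is among the
# extractors shipped with DataLad itself, as exercised in check_api above):
def _demo_extract(path):
    ds = Dataset(path)
    # extract only dataset-level metadata with the core extractor
    dsmeta, contentmeta, errored = _get_metadata(
        ds, ['datalad_core'], global_meta=True, content_meta=False)
    print('errored:', errored, 'content records:', len(contentmeta))
    return dsmeta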