Example 1
 def custom_result_summary_renderer(results):  # pragma: no cover
     # fish out sizes of annexed files. those will only be present
     # with --annex ...
     annexed = [
         (int(r['bytesize']), r.get('has_content', None))
         for r in results
         if r.get('action', None) == 'status' \
         and 'key' in r and 'bytesize' in r]
     if annexed:
         have_availability = any(a[1] is not None for a in annexed)
         total_size = bytes2human(sum(a[0] for a in annexed))
         # we have availability info encoded in the results
         from datalad.ui import ui
         if have_availability:
             ui.message(
                 "{} annex'd {} ({}/{} present/total size)".format(
                     len(annexed),
                     single_or_plural('file', 'files', len(annexed)),
                     bytes2human(sum(a[0] for a in annexed if a[1])),
                     total_size))
         else:
             ui.message(
                 "{} annex'd {} ({} recorded total size)".format(
                     len(annexed),
                     single_or_plural('file', 'files', len(annexed)),
                     total_size))
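Examples 1, 2, and 12 all build the same one-line summary from annex'd-file sizes. Below is a minimal, self-contained sketch of that summary logic; the two helpers are simplified stand-ins for datalad.dochelpers.single_or_plural and datalad.utils.bytes2human, and the result records are made up.

    # Simplified stand-ins for DataLad's helpers -- illustration only.
    def single_or_plural(singular, plural, count, include_count=False):
        form = singular if count == 1 else plural
        return "{} {}".format(count, form) if include_count else form

    def bytes2human(n):
        for unit in ('B', 'kB', 'MB', 'GB', 'TB'):
            if abs(n) < 1000.0:
                return '{:.1f} {}'.format(n, unit)
            n /= 1000.0
        return '{:.1f} PB'.format(n)

    # made-up status results shaped like the records the renderers expect
    results = [
        {'action': 'status', 'key': 'MD5E-k1', 'bytesize': '2048', 'has_content': True},
        {'action': 'status', 'key': 'MD5E-k2', 'bytesize': '4096', 'has_content': False},
    ]
    annexed = [(int(r['bytesize']), r.get('has_content'))
               for r in results
               if r.get('action') == 'status' and 'key' in r and 'bytesize' in r]
    print("{} annex'd {} ({}/{} present/total size)".format(
        len(annexed),
        single_or_plural('file', 'files', len(annexed)),
        bytes2human(sum(a[0] for a in annexed if a[1])),
        bytes2human(sum(a[0] for a in annexed))))
    # -> 2 annex'd files (2.0 kB/6.1 kB present/total size)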
Example 2
 def custom_result_summary_renderer(results):  # pragma: no cover
     # fish out sizes of annexed files. those will only be present
     # with --annex ...
     annexed = [
         (int(r['bytesize']), r.get('has_content', None))
         for r in results
         if r.get('action', None) == 'status' \
         and 'key' in r and 'bytesize' in r]
     if annexed:
         have_availability = any(a[1] is not None for a in annexed)
         total_size = bytes2human(sum(a[0] for a in annexed))
         # we have availability info encoded in the results
         from datalad.ui import ui
         if have_availability:
             ui.message("{} annex'd {} ({}/{} present/total size)".format(
                 len(annexed),
                 single_or_plural('file', 'files', len(annexed)),
                 bytes2human(sum(a[0] for a in annexed if a[1])),
                 total_size))
         else:
             ui.message("{} annex'd {} ({} recorded total size)".format(
                 len(annexed),
                 single_or_plural('file', 'files', len(annexed)),
                 total_size))
     if all(
             r.get('action', None) == 'status'
             and r.get('state', None) == 'clean' for r in results):
         from datalad.ui import ui
         ui.message("nothing to save, working tree clean")
Example 3
    def __call__(self, query, max_nresults=None, force_reindex=False):
        with self.idx_obj.searcher() as searcher:
            wquery = self.get_query(query)

            # perform the actual search
            hits = searcher.search(
                wquery,
                terms=True,
                limit=max_nresults if max_nresults > 0 else None)
            # report query stats
            topstr = '{} top {}'.format(
                max_nresults,
                single_or_plural('match', 'matches', max_nresults)
            )
            lgr.info('Query completed in {} sec.{}'.format(
                hits.runtime,
                ' Reporting {}.'.format(
                    ('up to ' + topstr)
                    if max_nresults > 0
                    else 'all matches'
                )
                if not hits.is_empty()
                else ' No matches.'
            ))

            if not hits:
                return

            nhits = 0
            # annotate hits for full metadata report
            hits = [dict(
                path=normpath(opj(self.ds.path, hit['path'])),
                query_matched={assure_unicode(k): assure_unicode(v)
                               if isinstance(v, unicode_srctypes) else v
                               for k, v in hit.matched_terms()},
                parentds=normpath(
                    opj(self.ds.path, hit['parentds'])) if 'parentds' in hit else None,
                type=hit.get('type', None))
                for hit in hits]
            for res in query_aggregated_metadata(
                    # type is taken from hit record
                    reporton=None,
                    ds=self.ds,
                    aps=hits,
                    # never recursive, we have direct hits already
                    recursive=False):
                res.update(
                    refds=self.ds.path,
                    action='search',
                    status='ok',
                    logger=lgr,
                )
                yield res
                nhits += 1

            if max_nresults and nhits == max_nresults:
                lgr.info(
                    "Reached the limit of {}, there could be more which "
                    "were not reported.".format(topstr)
                )
Example 4
    def result_renderer_cmdline(res, args):
        from datalad.ui import ui
        from os import linesep
        if res is None:
            res = []
        if not isinstance(res, list):
            res = [res]
        if not len(res):
            ui.message("Got nothing new")
            return

        # provide summary
        nsuccess = sum(item.get('success', False) if isinstance(item, dict) else True
                       for item in res)
        nfailure = len(res) - nsuccess
        msg = "Tried to get %d %s." % (
            len(res), single_or_plural("file", "files", len(res)))
        if nsuccess:
            msg += " Got %d. " % nsuccess
        if nfailure:
            msg += " Failed to get %d." % (nfailure,)
        ui.message(msg)

        # if just a few or less than initially explicitly requested
        if len(res) < 10 or args.verbose:
            msg = linesep.join([
                "{path} ... {suc}".format(
                    suc="ok." if isinstance(item, Dataset) or item.get('success', False)
                        else "failed. (%s)" % item.get('note', 'unknown reason'),
                    path=item.get('file') if isinstance(item, dict) else item.path)
                for item in res])
            ui.message(msg)
Example 5
    def custom_result_summary_renderer(res):
        from datalad.ui import ui
        from os import linesep
        if not len(res):
            ui.message("Got nothing new")
            return

        nfiles = count_results(res, type='file')
        nsuccess_file = count_results(res, type='file', status='ok')
        nfailure = nfiles - nsuccess_file
        msg = "Tried to get %d %s that had no content yet." % (
            nfiles, single_or_plural("file", "files", nfiles))
        if nsuccess_file:
            msg += " Successfully obtained %d. " % nsuccess_file
        if nfailure:
            msg += " %d (failed)." % (nfailure, )
        ui.message(msg)

        # if just a few or less than initially explicitly requested
        if len(res) < 10:
            msg = linesep.join([
                "{path}{type} ... {suc}".format(
                    suc=item.get('status'),
                    path=item.get('path'),
                    type=' [{}]'.format(item['type'])
                    if 'type' in item else '') for item in res
            ])
            ui.message(msg)
Example 6
File: get.py Project: hanke/datalad
    def custom_result_summary_renderer(res):
        from datalad.ui import ui
        from os import linesep
        if not len(res):
            ui.message("Got nothing new")
            return

        nfiles = count_results(res, type='file')
        nsuccess_file = count_results(res, type='file', status='ok')
        nfailure = nfiles - nsuccess_file
        msg = "Tried to get %d %s that had no content yet." % (
            nfiles, single_or_plural("file", "files", nfiles))
        if nsuccess_file:
            msg += " Successfully obtained %d. " % nsuccess_file
        if nfailure:
            msg += " %d (failed)." % (nfailure,)
        ui.message(msg)

        # if just a few or less than initially explicitly requested
        if len(res) < 10:
            msg = linesep.join([
                "{path}{type} ... {suc}".format(
                    suc=item.get('status'),
                    path=item.get('path'),
                    type=' [{}]'.format(item['type']) if 'type' in item else '')
                for item in res])
            ui.message(msg)
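Examples 5 and 6 tally results with a count_results helper. A rough stand-in for it, plus a usage run over made-up 'get' results:

    # Rough stand-in for the count_results helper used above: count the
    # result records whose fields match all given key/value criteria.
    def count_results(results, **criteria):
        return sum(
            all(r.get(k) == v for k, v in criteria.items())
            for r in results)

    res = [  # made-up 'get' results
        {'type': 'file', 'status': 'ok', 'path': 'a.dat'},
        {'type': 'file', 'status': 'error', 'path': 'b.dat'},
        {'type': 'dataset', 'status': 'ok', 'path': '.'},
    ]
    nfiles = count_results(res, type='file')                       # 2
    nsuccess_file = count_results(res, type='file', status='ok')   # 1
    print("Tried to get %d files that had no content yet. "
          "Successfully obtained %d. %d (failed)."
          % (nfiles, nsuccess_file, nfiles - nsuccess_file))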
Example 7
    def _handle_and_return_installed_items(ds, installed_items, failed_items, save):
        if save and ds is not None:
            _save_installed_datasets(ds, installed_items)
        if failed_items:
            msg = ''
            for act, l in (("succeeded", installed_items), ("failed", failed_items)):
                if not l:
                    continue
                if msg:
                    msg += ', and '
                msg += "%s %s" % (
                  single_or_plural("dataset", "datasets", len(l),
                                   include_count=True),
                  act)
                if ds:
                    paths = [relpath(i.path, ds.path)
                             if hasattr(i, 'path')
                             else i if not i.startswith(ds.path) else relpath(i, ds.path)
                             for i in l]
                else:
                    paths = l
                msg += " (%s)" % (", ".join(map(str, paths)))
            msg += ' to install'

            # we were asked for multiple installations
            if installed_items or len(failed_items) > 1:
                raise IncompleteResultsError(
                    results=installed_items, failed=failed_items, msg=msg)
            else:
                raise InstallFailedError(msg=msg)

        return installed_items[0] \
            if len(installed_items) == 1 else installed_items
Example 8
File: get.py Project: silky/datalad
    def result_renderer_cmdline(res, args):
        from datalad.ui import ui
        from os import linesep
        if res is None:
            res = []
        if not isinstance(res, list):
            res = [res]
        if not len(res):
            ui.message("Got nothing new")
            return

        # provide summary
        nsuccess = sum(
            item.get('success', False) if isinstance(item, dict) else True
            for item in res)
        nfailure = len(res) - nsuccess
        msg = "Tried to get %d %s." % (
            len(res), single_or_plural("file", "files", len(res)))
        if nsuccess:
            msg += " Got %d. " % nsuccess
        if nfailure:
            msg += " Failed to get %d." % (nfailure, )
        ui.message(msg)

        # if just a few or less than initially explicitly requested
        if len(res) < 10 or args.verbose:
            msg = linesep.join([
                "{path} ... {suc}".format(
                    suc="ok." if isinstance(item, Dataset)
                    or item.get('success', False) else "failed. (%s)" %
                    item.get('note', 'unknown reason'),
                    path=item.get('file')
                    if isinstance(item, dict) else item.path) for item in res
            ])
            ui.message(msg)
Example 9
    def _handle_and_return_installed_items(ds, installed_items, failed_items,
                                           save):
        if save and ds is not None:
            _save_installed_datasets(ds, installed_items)
        if failed_items:
            msg = ''
            for act, l in (("succeeded", installed_items), ("failed",
                                                            failed_items)):
                if not l:
                    continue
                if msg:
                    msg += ', and '
                msg += "%s %s" % (single_or_plural(
                    "dataset", "datasets", len(l), include_count=True), act)
                if ds:
                    paths = [
                        relpath(i.path, ds.path) if hasattr(i, 'path') else i
                        if not i.startswith(ds.path) else relpath(i, ds.path)
                        for i in l
                    ]
                else:
                    paths = l
                msg += " (%s)" % (", ".join(map(str, paths)))
            msg += ' to install'

            # we were asked for multiple installations
            if installed_items or len(failed_items) > 1:
                raise IncompleteResultsError(results=installed_items,
                                             failed=failed_items,
                                             msg=msg)
            else:
                raise InstallFailedError(msg=msg)

        return installed_items[0] \
            if len(installed_items) == 1 else installed_items
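Examples 7 and 9 are two formattings of the same function; the interesting part is how the failure message is assembled. A toy reconstruction of just that assembly, with a stand-in single_or_plural and invented dataset paths:

    def single_or_plural(singular, plural, count, include_count=False):
        # simplified stand-in for datalad.dochelpers.single_or_plural
        form = singular if count == 1 else plural
        return "{} {}".format(count, form) if include_count else form

    installed_items = ['sub-01']            # invented relative paths
    failed_items = ['sub-02', 'sub-03']
    msg = ''
    for act, items in (("succeeded", installed_items), ("failed", failed_items)):
        if not items:
            continue
        if msg:
            msg += ', and '
        msg += "%s %s (%s)" % (
            single_or_plural("dataset", "datasets", len(items), include_count=True),
            act,
            ", ".join(items))
    msg += ' to install'
    print(msg)
    # -> 1 dataset succeeded (sub-01), and 2 datasets failed (sub-02, sub-03) to install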
Example 10
    def __call__(self,
                 query,
                 max_nresults=None,
                 consider_ucn=False,
                 full_record=True):
        query_re = re.compile(self.get_query(query))

        nhits = 0
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            # produce a flattened metadata dict to search through
            doc = _meta2autofield_dict(meta,
                                       val2str=True,
                                       consider_ucn=consider_ucn)
            # use search instead of match to not just get hits at the start of the string
            # this will be slower, but avoids having to use actual regex syntax at the user
            # side even for simple queries
            # DOTALL is needed to handle multiline description fields and such, and still
            # be able to match content coming from a later field
            lgr.log(7, "Querying %s among %d items", query_re, len(doc))
            t0 = time()
            matches = {
                k: query_re.search(v.lower())
                for k, v in iteritems(doc)
            }
            dt = time() - t0
            lgr.log(7, "Finished querying in %f sec", dt)
            # retain what actually matched
            matches = {
                k: match.group()
                for k, match in matches.items() if match
            }
            if matches:
                hit = dict(
                    res,
                    action='search',
                    query_matched=matches,
                )
                yield hit
                nhits += 1
                if max_nresults and nhits == max_nresults:
                    # report query stats
                    topstr = '{} top {}'.format(
                        max_nresults,
                        single_or_plural('match', 'matches', max_nresults))
                    lgr.info(
                        "Reached the limit of {}, there could be more which "
                        "were not reported.".format(topstr))
                    break
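Example 10's regex mode reduces to flattening each metadata record into field/string pairs and running one compiled pattern over every value. A self-contained approximation using only the standard library; the metadata document and query are made up:

    import re
    from time import time

    # made-up flattened metadata document (field name -> string value)
    doc = {
        'name': 'My dataset',
        'description': 'Resting state fMRI\nacquired at 3T',
        'license': 'PDDL',
    }
    # DOTALL lets '.*' span the newline inside multiline fields
    query_re = re.compile('rest.*3t', flags=re.DOTALL)

    t0 = time()
    matches = {k: query_re.search(v.lower()) for k, v in doc.items()}
    # retain what actually matched
    matches = {k: m.group() for k, m in matches.items() if m}
    print('matched in %.6f sec: %s' % (time() - t0, matches))
    # only 'description' matches, across its embedded newline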
Example 11
def _display_suppressed_message(nsimilar, ndisplayed, final=False):
    # +1 because there was the original result + nsimilar displayed.
    n_suppressed = nsimilar - ndisplayed + 1
    if n_suppressed > 0:
        ui.message('  [{} similar {} been suppressed]'.format(
            n_suppressed,
            single_or_plural("message has", "messages have", n_suppressed,
                             False)),
                   cr="\n" if final else "\r")
Example 12
 def custom_result_summary_renderer(results):  # pragma: no cover
     # fish out sizes of annexed files. those will only be present
     # with --annex ...
     annexed = [
         (int(r['bytesize']), r.get('has_content', False))
         for r in results
         if r.get('action', None) == 'status' \
         and 'key' in r and 'bytesize' in r]
     if annexed:
         from datalad.ui import ui
         ui.message(
             "{} annex'd {} ({}/{} present/total size)".format(
                 len(annexed),
                 single_or_plural('file', 'files', len(annexed)),
                 bytes2human(sum(a[0] for a in annexed if a[1])),
                 bytes2human(sum(a[0] for a in annexed))))
Example 13
def _display_suppressed_message(nsimilar, ndisplayed, last_ts, final=False):
    # +1 because there was the original result + nsimilar displayed.
    n_suppressed = nsimilar - ndisplayed + 1
    if n_suppressed > 0:
        ts = time()
        # rate-limit updates of the suppression message; with a large number
        # of fast-paced results, updating it for each one can cause more
        # CPU load than the actual processing
        # arbitrarily go for a 2Hz update frequency -- it "feels" good
        if last_ts is None or final or (ts - last_ts > 0.5):
            ui.message('  [{} similar {} been suppressed]'.format(
                n_suppressed,
                single_or_plural("message has", "messages have", n_suppressed,
                                 False)),
                       cr="\n" if final else "\r")
            return ts
    return last_ts
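The rate limiting in Example 13 is easiest to see end to end. Below is a self-contained re-creation that writes to stdout instead of datalad.ui and drives the function with a pretend stream of similar results; everything here is illustrative, not DataLad's actual rendering loop.

    import sys
    from time import sleep, time

    def single_or_plural(singular, plural, count, include_count=False):
        # simplified stand-in for datalad.dochelpers.single_or_plural
        form = singular if count == 1 else plural
        return "{} {}".format(count, form) if include_count else form

    def display_suppressed(nsimilar, ndisplayed, last_ts, final=False):
        # same shape as _display_suppressed_message above, but printing
        # directly instead of going through datalad's ui
        n_suppressed = nsimilar - ndisplayed + 1
        if n_suppressed > 0:
            ts = time()
            # refresh at most every 0.5 s, unless this is the final update
            if last_ts is None or final or (ts - last_ts > 0.5):
                sys.stdout.write('  [{} similar {} been suppressed]{}'.format(
                    n_suppressed,
                    single_or_plural('message has', 'messages have', n_suppressed),
                    '\n' if final else '\r'))
                sys.stdout.flush()
                return ts
        return last_ts

    last_ts = None
    for nsimilar in range(1, 50):      # pretend 49 similar results arrive
        last_ts = display_suppressed(nsimilar, ndisplayed=1, last_ts=last_ts)
        sleep(0.02)
    display_suppressed(49, 1, last_ts, final=True)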
Example 14
    def _mk_search_index(self, force_reindex):
        """Generic entrypoint to index generation

        The actual work that determines the structure and content of the index
        is done by functions that are passed in as arguments

        `meta2doc` - must return dict for index document from result input
        """
        from whoosh import index as widx
        from .metadata import agginfo_relpath
        # what is the latest state of aggregated metadata
        metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
        # use location common to all index types, they would all invalidate
        # simultaneously
        stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
        index_dir = opj(self.index_dir, self._mode_label)

        if (not force_reindex) and \
                exists(index_dir) and \
                exists(stamp_fname) and \
                open(stamp_fname).read() == metadata_state:
            try:
                # TODO check that the index schema is the same
                # as the one we would have used for reindexing
                # TODO support incremental re-indexing, whoosh can do it
                idx = widx.open_dir(index_dir)
                lgr.debug('Search index contains %i documents',
                          idx.doc_count())
                self.idx_obj = idx
                return
            except widx.LockError as e:
                raise e
            except widx.IndexError as e:
                # Generic index error.
                # we try to regenerate
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e))
            except widx.IndexVersionError as e:  # (msg, version, release=None)
                # Raised when you try to open an index using a format that the
                # current version of Whoosh cannot read. That is, when the index
                # you're trying to open is either not backward or forward
                # compatible with this version of Whoosh.
                # we try to regenerate
                lgr.warning(exc_str(e))
                pass
            except widx.OutOfDateError as e:
                # Raised when you try to commit changes to an index which is not
                # the latest generation.
                # this should not happen here, but if it does ... KABOOM
                raise
            except widx.EmptyIndexError as e:
                # Raised when you try to work with an index that has no indexed
                # terms.
                # we can just continue with generating an index
                pass
            except ValueError as e:
                if 'unsupported pickle protocol' in str(e):
                    lgr.warning(
                        "Cannot open existing index %s (%s), will regenerate",
                        index_dir, exc_str(e))
                else:
                    raise

        lgr.info('{} search index'.format(
            'Rebuilding' if exists(index_dir) else 'Building'))

        if not exists(index_dir):
            os.makedirs(index_dir)

        # this is a pretty cheap call that just pulls this info from a file
        dsinfo = self.ds.metadata(get_aggregates=True,
                                  return_type='list',
                                  result_renderer='disabled')

        self._mk_schema(dsinfo)

        idx_obj = widx.create_in(index_dir, self.schema)
        idx = idx_obj.writer(
            # cache size per process
            limitmb=cfg.obtain('datalad.search.indexercachesize'),
            # disable parallel indexing for now till #1927 is resolved
            ## number of processes for indexing
            #procs=multiprocessing.cpu_count(),
            ## write separate index segments in each process for speed
            ## asks for writer.commit(optimize=True)
            #multisegment=True,
        )

        # load metadata of the base dataset and what it knows about all its subdatasets
        # (recursively)
        old_idx_size = 0
        old_ds_rpath = ''
        idx_size = 0
        log_progress(
            lgr.info,
            'autofieldidxbuild',
            'Start building search index',
            total=len(dsinfo),
            label='Building search index',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            doc = self._meta2doc(meta)
            admin = {
                'type': res['type'],
                'path': relpath(res['path'], start=self.ds.path),
            }
            if 'parentds' in res:
                admin['parentds'] = relpath(res['parentds'],
                                            start=self.ds.path)
            if admin['type'] == 'dataset':
                if old_ds_rpath:
                    lgr.debug(
                        'Added %s on dataset %s',
                        single_or_plural('document',
                                         'documents',
                                         idx_size - old_idx_size,
                                         include_count=True), old_ds_rpath)
                log_progress(lgr.info,
                             'autofieldidxbuild',
                             'Indexed dataset at %s',
                             old_ds_rpath,
                             update=1,
                             increment=True)
                old_idx_size = idx_size
                old_ds_rpath = admin['path']
                admin['id'] = res.get('dsid', None)

            doc.update({k: assure_unicode(v) for k, v in admin.items()})
            lgr.debug("Adding document to search index: {}".format(doc))
            # inject into index
            idx.add_document(**doc)
            idx_size += 1

        if old_ds_rpath:
            lgr.debug(
                'Added %s on dataset %s',
                single_or_plural('document',
                                 'documents',
                                 idx_size - old_idx_size,
                                 include_count=True), old_ds_rpath)

        lgr.debug("Committing index")
        idx.commit(optimize=True)
        log_progress(lgr.info, 'autofieldidxbuild',
                     'Done building search index')

        # "timestamp" the search index to allow for automatic invalidation
        with open(stamp_fname, 'w') as f:
            f.write(metadata_state)

        lgr.info('Search index contains %i documents', idx_size)
        self.idx_obj = idx_obj
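Examples 3, 14, 17, and 18 all sit on top of Whoosh. The sketch below shows the same create_in/writer/searcher workflow end to end on a throwaway index; the schema fields and documents are invented and intentionally tiny.

    import tempfile
    from whoosh import index as widx
    from whoosh.fields import ID, TEXT, Schema
    from whoosh.qparser import MultifieldParser

    index_dir = tempfile.mkdtemp()
    schema = Schema(path=ID(stored=True), type=ID(stored=True),
                    description=TEXT(stored=True))

    # build the index, analogous to _mk_search_index() above
    idx_obj = widx.create_in(index_dir, schema)
    writer = idx_obj.writer()
    writer.add_document(path=u'sub-01', type=u'dataset',
                        description=u'resting state fMRI acquisition')
    writer.add_document(path=u'sub-01/anat.nii.gz', type=u'file',
                        description=u'T1-weighted anatomical image')
    writer.commit(optimize=True)

    # query it, analogous to the searcher-based __call__ methods
    with idx_obj.searcher() as searcher:
        parser = MultifieldParser(idx_obj.schema.names(), idx_obj.schema)
        wquery = parser.parse(u'fmri')
        hits = searcher.search(wquery, terms=True, limit=20)
        for hit in hits:
            print(hit['path'], dict(hit.matched_terms()))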
Example 15
    def __call__(
            path=None,
            dataset=None,
            get_aggregates=False,
            reporton='all',
            recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results
            # they get actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(
                refds_path,
                check_installed=True,
                purpose='aggregate metadata query')
            info_fpath = opj(ds.path, agginfo_relpath)
            if not exists(info_fpath):
                # if there has ever been an aggregation run, this file would
                # exist, hence there has not been and we need to tell this
                # to people
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message='metadata aggregation has never been performed in this dataset')
                return
            agginfos = _load_json_object(info_fpath)
            parentds = []
            for sd in sorted(agginfos):
                info = agginfos[sd]
                dspath = normpath(opj(ds.path, sd))
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if sd == curdir:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(
                    info,
                    **res_kwargs
                )
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        # test for datasets that will be queried, but have never been aggregated
        # TODO add an option, perhaps even enabled by default, to re-aggregate
        # metadata prior to the query if it was found to be outdated.
        # This is superior to re-aggregation upon manipulation, as manipulation
        # can happen in a gazillion ways and may even be incremental over multiple
        # steps where intermediate re-aggregation is pointless and wasteful
        to_aggregate = [d for d in content_by_ds
                        if not exists(opj(d, agginfo_relpath))]
        if to_aggregate:
            lgr.warning(
                'Metadata query results might be incomplete, initial '
                'metadata aggregation was not yet performed in %s at: %s',
                single_or_plural(
                    'dataset', 'datasets', len(to_aggregate), include_count=True),
                to_aggregate)

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [ap for ap in content_by_ds[ds_path]
                         # this is an available subdataset, will be processed in another
                         # iteration
                         if ap.get('state', None) == 'absent' or
                         not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return
Example 16
    def __call__(path,
                 dataset=None,
                 spec_file=None,
                 properties=None,
                 replace=False):
        # TODO: message

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose="hirni spec4anything")
        path = assure_list(path)
        path = [resolve_path(p, dataset) for p in path]

        res_kwargs = dict(action='hirni spec4anything', logger=lgr)
        res_kwargs['refds'] = Interface.get_refds_path(dataset)

        # ### This might become superfluous. See datalad-gh-2653
        ds_path = PathRI(dataset.path)
        # ###

        updated_files = []
        paths = []
        for ap in AnnotatePaths.__call__(
                dataset=dataset,
                path=path,
                action='hirni spec4anything',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                return_type='generator',
                # TODO: Check this one out:
                on_failure='ignore',
                # Note/TODO: Not sure yet whether and when we need those.
                # Generally we want to be able to create a spec for subdatasets,
                # too:
                # recursive=recursive,
                # recursion_limit=recursion_limit,
                # force_subds_discovery=True,
                # force_parentds_discovery=True,
        ):

            if ap.get('status', None) in ['error', 'impossible']:
                yield ap
                continue

            # ### This might become superfluous. See datalad-gh-2653
            ap_path = PathRI(ap['path'])
            # ###

            # find acquisition and respective specification file:
            rel_path = posixpath.relpath(ap_path.posixpath, ds_path.posixpath)

            path_parts = rel_path.split('/')

            # TODO: Note: Commented out this warning for now. We used to not have
            # a spec file at the toplevel of the study dataset, but now we do.
            # The logic afterwards works, but should be revisited. At least,
            # `acq` should be called differently now.
            # if len(path_parts) < 2:
            #     lgr.warning("Not within an acquisition")
            acq = path_parts[0]

            # TODO: spec file specifiable or fixed path?
            #       if we want the former, what we actually need is an
            #       association of acquisition and its spec path
            #       => prob. not an option but a config

            spec_path = spec_file if spec_file \
                else posixpath.join(ds_path.posixpath, acq,
                                    dataset.config.get("datalad.hirni.studyspec.filename",
                                                       "studyspec.json"))

            spec = [r for r in json_py.load_stream(spec_path)] \
                if posixpath.exists(spec_path) else list()

            lgr.debug("Add specification snippet for %s", ap['path'])
            # XXX 'add' does not seem to be the thing we want to do
            # rather 'set', so we have to check whether a spec for a location
            # is already known and fail or replace it (maybe with --force)

            # go through all existing specs and extract unique value
            # and also assign them to the new record (subjects, ...), but only
            # editable fields!!
            uniques = dict()
            for s in spec:
                for k in s:
                    if isinstance(s[k], dict) and 'value' in s[k]:
                        if k not in uniques:
                            uniques[k] = set()
                        uniques[k].add(s[k]['value'])
            overrides = dict()
            for k in uniques:
                if len(uniques[k]) == 1:
                    overrides[k] = _get_edit_dict(value=uniques[k].pop(),
                                                  approved=False)

            if properties:

                # TODO: This entire reading of properties needs to be RF'd
                # into proper generalized functions.
                # spec got more complex. update() prob. can't simply override
                # (think: 'procedures' and 'tags' prob. need to be appended
                # instead)

                # load from file or json string
                if isinstance(properties, dict):
                    props = properties
                elif op.exists(properties):
                    props = json_py.load(properties)
                else:
                    props = json_py.loads(properties)
                # turn into editable, pre-approved records
                spec_props = {
                    k: dict(value=v, approved=True)
                    for k, v in props.items()
                    if k not in non_editables + ['tags', 'procedures']
                }
                spec_props.update({
                    k: v
                    for k, v in props.items() if k in non_editables + ['tags']
                })

                # TODO: still wrong. It's a list. Append or override? How to decide?
                spec_props.update({
                    o_k: [{
                        i_k: dict(value=i_v, approved=True)
                        for i_k, i_v in o_v.items()
                    }]
                    for o_k, o_v in props.items() if o_k in ['procedures']
                })

                overrides.update(spec_props)

            # TODO: It's probably wrong to use uniques for overwriting! At least
            # they cannot be used to overwrite values explicitly set in
            # _add_to_spec like "location", "type", etc.
            #
            # But then: This should concern non-editable fields only, right?

            spec = _add_to_spec(spec,
                                posixpath.split(spec_path)[0],
                                ap,
                                dataset,
                                overrides=overrides,
                                replace=replace)

            # Note: Not sure whether we really want one commit per snippet.
            #       If not - consider:
            #       - What if we fail amidst? => Don't write to file yet.
            #       - What about input paths from different acquisitions?
            #         => store specs per acquisition in memory
            # MIH: One commit per line seems silly. why not update all files
            # collect paths of updated files, and give them to a single `add`
            # at the very end?
            # MIH: if we fail, we fail and nothing is committed
            from datalad_hirni.support.spec_helpers import sort_spec
            json_py.dump2stream(sorted(spec, key=lambda x: sort_spec(x)),
                                spec_path)
            updated_files.append(spec_path)

            yield get_status_dict(status='ok',
                                  type=ap['type'],
                                  path=ap['path'],
                                  **res_kwargs)
            paths.append(ap)

        from datalad.dochelpers import single_or_plural
        from os import linesep
        message = "[HIRNI] Add specification {n_snippets} for: {paths}".format(
            n_snippets=single_or_plural("snippet", "snippets", len(paths)),
            paths=linesep.join(" - " + op.relpath(p['path'], dataset.path)
                               for p in paths)
            if len(paths) > 1 else op.relpath(paths[0]['path'], dataset.path))
        for r in dataset.save(updated_files,
                              to_git=True,
                              message=message,
                              return_type='generator',
                              result_renderer='disabled'):
            yield r
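The commit message at the end of Example 16 is itself a small single_or_plural use case. A toy reconstruction of just the message assembly, with a stand-in helper and invented spec paths:

    from os import linesep

    def single_or_plural(singular, plural, count, include_count=False):
        # simplified stand-in for datalad.dochelpers.single_or_plural
        form = singular if count == 1 else plural
        return "{} {}".format(count, form) if include_count else form

    paths = ['acq1/studyspec.json', 'acq2/studyspec.json']   # invented
    message = "[HIRNI] Add specification {n_snippets} for: {paths}".format(
        n_snippets=single_or_plural("snippet", "snippets", len(paths)),
        paths=linesep.join(" - " + p for p in paths)
        if len(paths) > 1 else paths[0])
    print(message)
    # with a single path the bare path is used; with several, each path
    # gets a " - " prefix and subsequent paths start new lines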
Example 17
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=20,
                 show_keys=False,
                 show_query=False):
        from whoosh import qparser as qparse

        try:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        # where does the bunny have the eggs?
        index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad',
                        'search_index')

        idx_obj = _get_search_index(index_dir, ds, force_reindex)

        if show_keys:
            definitions_fname = opj(index_dir,
                                    'datalad_term_definitions.json.gz')
            try:
                defs = jsonload(gzopen(definitions_fname))
            except Exception as e:
                lgr.warning(
                    'No term definitions found alongside search index: %s',
                    exc_str(e))
                defs = {}

            for k in idx_obj.schema.names():
                print('{}{}'.format(
                    k,
                    ' {}'.format(defs[k] if isinstance(defs[k], dict) else
                                 '({})'.format(defs[k])) if k in defs else ''))
            return

        if not query:
            return

        with idx_obj.searcher() as searcher:
            # parse the query string, default whoosh parser ATM, could be
            # tailored with plugins
            parser = qparse.MultifieldParser(idx_obj.schema.names(),
                                             idx_obj.schema)
            # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
            # upstream
            parser.add_plugin(qparse.FuzzyTermPlugin())
            parser.add_plugin(qparse.GtLtPlugin())
            # replace field definition to allow for colons to be part of a field's name:
            parser.replace_plugin(
                qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
            # for convenience we accept any number of args-words from the
            # shell and put them together to a single string here
            querystr = ' '.join(assure_list(query))
            # this gives a formal whoosh query
            wquery = parser.parse(querystr)

            if show_query:
                print(wquery)
                return
            # perform the actual search
            hits = searcher.search(
                wquery,
                terms=True,
                limit=max_nresults if max_nresults > 0 else None)
            # cheap way to get an approximate number of hits, without an expensive
            # scoring of all items
            # disabled: unreliable estimate, often confusing
            #nhits = hits.estimated_min_length()
            # report query stats
            topstr = '{} top {}'.format(
                max_nresults, single_or_plural('match', 'matches',
                                               max_nresults))
            lgr.info('Query completed in {} sec.{}'.format(
                hits.runtime, ' Reporting {}.'.format((
                    'up to ' + topstr) if max_nresults > 0 else 'all matches')
                if not hits.is_empty() else ' No matches.'))

            if not hits:
                return

            nhits = 0
            for hit in hits:
                res = dict(
                    action='search',
                    status='ok',
                    logger=lgr,
                    refds=ds.path,
                    # normpath to avoid trailing dot
                    path=normpath(opj(ds.path, hit['path'])),
                    query_matched={
                        assure_unicode(k): assure_unicode(v) if isinstance(
                            v, unicode_srctypes) else v
                        for k, v in hit.matched_terms()
                    },
                    metadata={
                        k: v
                        for k, v in hit.fields().items()
                        if k not in ('path', 'parentds')
                    })
                if 'parentds' in hit:
                    res['parentds'] = normpath(opj(ds.path, hit['parentds']))
                yield res
                nhits += 1

            if max_nresults and nhits == max_nresults:
                lgr.info("Reached the limit of {}, there could be more which "
                         "were not reported.".format(topstr))
Example 18
def _get_search_index(index_dir, ds, force_reindex):
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = ds.repo.get_last_commit_hash(agginfo_relpath)
    stamp_fname = opj(index_dir, 'datalad_metadata_state')
    definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')

    if not force_reindex and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug('Search index contains %i documents', idx.doc_count())
            return idx
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            # TODO log this
            pass
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            # TODO log this
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise e
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    schema, definitions, per_ds_defs = _get_search_schema(ds)

    idx_obj = widx.create_in(index_dir, schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its subdatasets
    # (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    for res in _query_aggregated_metadata(
            reporton=ds.config.obtain(
                'datalad.metadata.searchindex-documenttype'),
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            # TODO expose? but this would likely only affect metadata in the
            # base dataset
            merge_mode='init',
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        rpath = relpath(res['path'], start=ds.path)
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        rtype = res['type']
        meta = res.get('metadata', {})
        meta = MetadataDict(meta)
        if rtype == 'dataset':
            if old_ds_rpath:
                lgr.info(
                    'Added %s on dataset %s',
                    single_or_plural('document',
                                     'documents',
                                     idx_size - old_idx_size,
                                     include_count=True), old_ds_rpath)
            old_idx_size = idx_size
            old_ds_rpath = rpath

            # get any custom dataset mappings
            ds_defs = per_ds_defs.get(res['path'], {})
            # now we merge all reported unique content properties (flattened representation
            # of content metadata) with the main metadata set, using the 'add' strategy
            # this way any existing metadata value of a dataset itself will be amended by
            # those coming from the content. E.g. a single dataset 'license' might be turned
            # into a sequence of unique license identifiers across all dataset components
            meta.merge_add(meta.get('unique_content_properties', {}))
            meta.pop('unique_content_properties', None)
        doc_props = dict(path=rpath,
                         type=rtype,
                         **_meta2index_dict(meta, definitions, ds_defs))
        if 'parentds' in res:
            doc_props['parentds'] = relpath(res['parentds'], start=ds.path)
        _add_document(idx, **doc_props)
        idx_size += 1

    if old_ds_rpath:
        lgr.info(
            'Added %s on dataset %s',
            single_or_plural('document',
                             'documents',
                             idx_size - old_idx_size,
                             include_count=True), old_ds_rpath)

    idx.commit(optimize=True)

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    # dump the term/field definitions records for later introspection
    # use compressed storage, there is no point in inflating the
    # disk space requirements
    with gzopen(definitions_fname, 'wb') as f:
        # TODO actually go through all, incl. compound, definitions ('@id' plus 'unit'
        # or similar) and resolve terms to URLs, if anyhow possible
        jsondump2file(definitions, f)

    lgr.info('Search index contains %i documents', idx_size)
    return idx_obj
Example 19
    def __call__(self,
                 query,
                 max_nresults=None,
                 consider_ucn=False,
                 full_record=True):
        if max_nresults is None:
            # no limit by default
            max_nresults = 0
        query = self.get_query(query)

        nhits = 0
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            # produce a flattened metadata dict to search through
            doc = _meta2autofield_dict(meta,
                                       val2str=True,
                                       consider_ucn=consider_ucn)
            # inject a few basic properties into the dict
            # analog to what the other modes do in their index
            doc.update({
                k: res[k]
                for k in ('@id', 'type', 'path', 'parentds') if k in res
            })
            # use search instead of match to not just get hits at the start of the string
            # this will be slower, but avoids having to use actual regex syntax at the user
            # side even for simple queries
            # DOTALL is needed to handle multiline description fields and such, and still
            # be able to match content coming from a later field
            lgr.log(7, "Querying %s among %d items", query, len(doc))
            t0 = time()
            matches = {
                (q['query'] if isinstance(q, dict) else q, k):
                q['query'].search(v) if isinstance(q, dict) else q.search(v)
                for k, v in iteritems(doc) for q in query
                if not isinstance(q, dict) or q['field'].match(k)
            }
            dt = time() - t0
            lgr.log(7, "Finished querying in %f sec", dt)
            # retain what actually matched
            matched = {
                k[1]: match.group()
                for k, match in matches.items() if match
            }
            # implement AND behavior across query expressions, but OR behavior
            # across queries matching multiple fields for a single query expression
            # for multiple queries, this makes it consistent with a query that
            # has no field specification
            if matched and len(query) == len(
                    set(k[0] for k in matches if matches[k])):
                hit = dict(
                    res,
                    action='search',
                    query_matched=matched,
                )
                yield hit
                nhits += 1
                if max_nresults and nhits == max_nresults:
                    # report query stats
                    topstr = '{} top {}'.format(
                        max_nresults,
                        single_or_plural('match', 'matches', max_nresults))
                    lgr.info(
                        "Reached the limit of {}, there could be more which "
                        "were not reported.".format(topstr))
                    break
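The matching block in Example 19 enforces AND across query expressions but OR across the fields any one expression matches. A stripped-down, pure-Python illustration of that rule, with a made-up flattened document and two compiled queries (the field-restricted query form from the original is omitted):

    import re

    # made-up flattened document and two compiled query expressions
    doc = {'name': 'human connectome', 'species': 'homo sapiens'}
    query = [re.compile('human|homo'), re.compile('connectome')]

    matches = {
        (q.pattern, k): q.search(v)
        for k, v in doc.items()
        for q in query
    }
    # retain what actually matched, keyed by field only
    matched = {k[1]: m.group() for k, m in matches.items() if m}
    # AND across expressions: every pattern must have matched somewhere;
    # OR across fields: it does not matter which field satisfied it
    if matched and len(query) == len(set(k[0] for k in matches if matches[k])):
        print('hit:', matched)
    # -> hit: {'name': 'connectome', 'species': 'homo'}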
Example 20
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune-up to a repo that may be an annex repo, e.g.
    "enables" reckless mode, sets up description
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true', where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to call
    # its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])

    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Is warning level okay or is info level sufficient?
                    # Note that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its
                    # availability info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)
            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                if annex_dir.is_symlink():
                    annex_dir.unlink()
                else:
                    rmtree(str(annex_dir))
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
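                # Illustrative result (not from the original code): the clone
                # now shares annexed object storage with its origin, roughly
                #
                #   <clone>/.git/annex -> <origin>/.git/annex
                #
                # so no annex objects need to be duplicated locally.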
        else:
            # TODO: What level? Also note that annex-dead is independent
            # of symlinking.
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks on "
                        "this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if something
    # looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up '
                'to avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have
        # up-to-date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # there are special remotes that are not auto-enabled
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )
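        # Illustrative rendering of the hint above (path and sibling names
        # assumed): with two non-auto-enabled siblings it would read
        #
        #   access to 2 dataset siblings s3-store, web not auto-enabled, enable with:
        #           datalad siblings -d "/path/to/ds" enable -s SIBLING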

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
Example n. 21
0
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
      Names of metadata extractors to engage.
    global_meta : bool or None
      Whether to extract dataset-level metadata; None defers to the
      per-extractor configuration.
    content_meta : bool or None
      Whether to extract content metadata; None defers to the per-extractor
      configuration.
    paths : list or None
      Restrict content metadata extraction to these paths.
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab':
        'http://docs.datalad.org/schema_v{}.json'.format(vocabulary_version)
    }

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        # materialize: this info is iterated over twice below
        content_info = list(zip(paths, ds.repo.file_has_content(paths),
                                ds.repo.is_under_annex(paths)))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO: better fail, or support incremental operation and label
            # these files as not present
            lgr.warning('{} files have no content present, '
                        'some extractors will not operate on {}'.format(
                            nocontent, 'them' if nocontent > 10 else
                            [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [
        re.compile(bl) for bl in ensure_list(
            ds.config.obtain('datalad.metadata.aggregate-ignore-fields',
                             default=[]))
    ]
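    # Hedged example (hypothetical value, not from the original code): a user
    # could configure
    #
    #   git config --local datalad.metadata.aggregate-ignore-fields '^comment'
    #
    # and any metadata field whose name matches that regular expression would
    # be dropped by _filter_metadata_fields() below.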
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {
        ep.name: ep
        for ep in iter_entry_points('datalad.metadata.extractors')
    }
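    # For orientation (a hedged sketch; package and class names are assumed):
    # extractors are advertised via the 'datalad.metadata.extractors' entry
    # point group, e.g. in an extension's setup():
    #
    #   setup(
    #       ...,
    #       entry_points={
    #           'datalad.metadata.extractors': [
    #               'myformat = myext.extractors.myformat:MyFormatExtractor',
    #           ],
    #       },
    #   )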

    # we said that we want to fail, rather than just moan about less metadata.
    # Do an early check that all extractors are available, so as not to wait
    # hours and then crash for some obvious reason
    absent_extractors = [t for t in types if t not in extractors]
    if absent_extractors:
        raise ValueError(
            '%d enabled metadata extractor%s not available in this installation'
            ': %s' % (len(absent_extractors),
                      single_or_plural(" is", "s are", len(absent_extractors)),
                      ', '.join(absent_extractors)))

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s',
        ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(lgr.info,
                     'metadataextractors',
                     'Engage %s metadata extractor',
                     mtype_key,
                     update=1,
                     increment=True)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s',
                mtype_key,
                ds,
            )
            raise ValueError("Failed to load metadata extractor for '%s', "
                             "broken dataset configuration (%s)?: %s" %
                             (mtype, ds, exc_str(e)))
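        # Note (illustrative, extractor name assumed): whether an extractor
        # contributes dataset and/or content metadata can be toggled per
        # extractor via configuration; e.g. for an 'exif' extractor the keys
        # queried below would be
        #   datalad.metadata.aggregate-dataset-exif
        #   datalad.metadata.aggregate-content-exif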
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta
                if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta
                if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s',
                    mtype_key,
                    ds,
                )
                raise
            errored = True
            # if we don't get global metadata, we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(dsmeta_t,
                                                   maxsize=max_fieldsize,
                                                   blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude",
                                           set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that the log message
        #  actually seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store the info that there was no metadata (e.g.
            # to get a list of files that have no metadata)
            # if an extractor needlessly produces empty records, the extractor
            # should be fixed rather than adding a general switch here. For
            # example, the datalad_core extractor issues empty records to
            # document the presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(meta,
                                           maxsize=max_fieldsize,
                                           blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain('datalad.metadata.generate-unique-{}'.format(
                    mtype_key.replace('_', '-')),
                                default=True,
                                valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in meta.items():
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list.
                        # the point of the list is to make otherwise missing
                        # info about content known in the dataset, not to
                        # blindly duplicate metadata. Example: a list of the
                        # samples data were recorded from. If the dataset has
                        # such a list under a 'sample' key, we should prefer
                        # that over an aggregated list of a
                        # hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})

            # important: we want a stable order of the unique values
            # (a list). we cannot guarantee the same order of
            # discovery, hence even when not using a set above we
            # would still need sorting. the challenge is that any
            # value can be an arbitrarily complex nested beast.
            # we also want each unique value set to always come as a
            # top-level list, so we know whether some unique value
            # was itself a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v) for k, v in val.items()}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [
                    _ensure_serializable(i)
                    for i in sorted(v, key=_unique_value_key)
                ] if v is not None else None
                for k, v in unique_cm.items()
                # keep k if v is None (unique reporting disabled, but the key
                # was seen at some point); otherwise we only want actual
                # values, and no single-item sets of an empty value;
                # those contribute no information, but bloat the operation
                # (inflated number of keys, storage, search index, ...)
                if v is None or (v and not v == {''})
            }
            dsmeta['datalad_unique_content_properties'] = ucp
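            # Illustrative shape of the aggregated report (values assumed):
            #
            #   dsmeta['datalad_unique_content_properties'] = {
            #       '<extractor>': {
            #           'somekey': ['valueA', 'valueB'],  # sorted unique values
            #           'otherkey': None,                 # key seen, values excluded
            #       },
            #   }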

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s',
        ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
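
# A minimal usage sketch (dataset path and extractor name are assumed, and the
# surrounding module imports are omitted in this excerpt), showing how the
# three return values documented above could be consumed:
#
#   from datalad.api import Dataset
#
#   ds = Dataset('/path/to/ds')
#   dsmeta, contentmeta, errored = _get_metadata(
#       ds, types=['datalad_core'], global_meta=True, content_meta=False)
#   if errored:
#       lgr.warning('Metadata extraction from %s completed with errors', ds)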