Example #1
    def __call__(self, query, max_nresults=None, force_reindex=False):
        if max_nresults is None:
            # guard against the None default so the numeric comparisons on
            # max_nresults below do not fail (same handling as in Example #8)
            max_nresults = 0
        with self.idx_obj.searcher() as searcher:
            wquery = self.get_query(query)

            # perform the actual search
            hits = searcher.search(
                wquery,
                terms=True,
                limit=max_nresults if max_nresults > 0 else None)
            # report query stats
            topstr = '{} top {}'.format(
                max_nresults,
                single_or_plural('match', 'matches', max_nresults)
            )
            lgr.info('Query completed in {} sec.{}'.format(
                hits.runtime,
                ' Reporting {}.'.format(
                    ('up to ' + topstr)
                    if max_nresults > 0
                    else 'all matches'
                )
                if not hits.is_empty()
                else ' No matches.'
            ))

            if not hits:
                return

            nhits = 0
            # annotate hits for full metadata report
            hits = [dict(
                path=normpath(opj(self.ds.path, hit['path'])),
                query_matched={assure_unicode(k): assure_unicode(v)
                               if isinstance(v, unicode_srctypes) else v
                               for k, v in hit.matched_terms()},
                parentds=normpath(
                    opj(self.ds.path, hit['parentds'])) if 'parentds' in hit else None,
                type=hit.get('type', None))
                for hit in hits]
            for res in query_aggregated_metadata(
                    # type is taken from hit record
                    reporton=None,
                    ds=self.ds,
                    aps=hits,
                    # never recursive, we have direct hits already
                    recursive=False):
                res.update(
                    refds=self.ds.path,
                    action='search',
                    status='ok',
                    logger=lgr,
                )
                yield res
                nhits += 1

            if max_nresults and nhits == max_nresults:
                lgr.info(
                    "Reached the limit of {}, there could be more which "
                    "were not reported.".format(topstr)
                )
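
Note: the example above searches a previously built Whoosh index stored on self.idx_obj. As a minimal sketch of the underlying Whoosh search pattern (plain Whoosh, not DataLad code), assuming an index object idx_obj with a stored 'path' field and an indexed 'description' field:

from whoosh.qparser import QueryParser

# parse a user query against the index schema ('description' is illustrative)
wquery = QueryParser('description', idx_obj.schema).parse(u'mri')
with idx_obj.searcher() as searcher:
    # terms=True records which terms matched, so hit.matched_terms() works below
    hits = searcher.search(wquery, terms=True, limit=20)
    for hit in hits:
        # matched_terms() yields (fieldname, matched_text) pairs
        print(hit['path'], dict(hit.matched_terms()))
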
Example #2
    def __call__(self,
                 query,
                 max_nresults=None,
                 consider_ucn=False,
                 full_record=True):
        query_re = re.compile(self.get_query(query))

        nhits = 0
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            # produce a flattened metadata dict to search through
            doc = _meta2autofield_dict(meta,
                                       val2str=True,
                                       consider_ucn=consider_ucn)
            # use search instead of match to not just get hits at the start of the string
            # this will be slower, but avoids having to use actual regex syntax at the user
            # side even for simple queries
            # DOTALL is needed to handle multiline description fields and such, and still
            # be able to match content coming from a later field
            lgr.log(7, "Querying %s among %d items", query_re, len(doc))
            t0 = time()
            matches = {
                k: query_re.search(v.lower())
                for k, v in iteritems(doc)
            }
            dt = time() - t0
            lgr.log(7, "Finished querying in %f sec", dt)
            # retain what actually matched
            matches = {
                k: match.group()
                for k, match in matches.items() if match
            }
            if matches:
                hit = dict(
                    res,
                    action='search',
                    query_matched=matches,
                )
                yield hit
                nhits += 1
                if max_nresults and nhits == max_nresults:
                    # report query stats
                    topstr = '{} top {}'.format(
                        max_nresults,
                        single_or_plural('match', 'matches', max_nresults))
                    lgr.info(
                        "Reached the limit of {}, there could be more which "
                        "were not reported.".format(topstr))
                    break
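
Stripped of the regex matching, the examples on this page all revolve around the same iteration over query_aggregated_metadata. A minimal sketch of that pattern, assuming a DataLad Dataset instance ds whose metadata has already been aggregated:

for res in query_aggregated_metadata(
        reporton='datasets',  # other examples use 'files', 'all', a configured documenttype, or None
        ds=ds,
        aps=[dict(path=ds.path, type='dataset')],
        recursive=True):
    # each result is a dict with a 'path', a 'type', and typically a 'metadata' dict
    meta = res.get('metadata', {})
    print(res['path'], res.get('type'), sorted(meta))
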
Example #3
    def _mk_schema(self, dsinfo):
        from whoosh import fields as wf
        from whoosh.analysis import SimpleAnalyzer

        # this will harvest all discovered term definitions
        definitions = {
            '@id': 'unique identifier of an entity',
            # TODO make proper JSON-LD definition
            'path':
            'path name of an entity relative to the searched base dataset',
            # TODO make proper JSON-LD definition
            'parentds': 'path of the dataset that contains an entity',
            # 'type' will not come from a metadata field, hence will not be detected
            'type': 'type of a record',
        }

        schema_fields = {
            n.lstrip('@'): wf.ID(stored=True, unique=n == '@id')
            for n in definitions
        }

        lgr.debug('Scanning for metadata keys')
        # quick 1st pass over all datasets to gather the needed schema fields
        log_progress(
            lgr.info,
            'idxschemabuild',
            'Start building search schema',
            total=len(dsinfo),
            label='Building search schema',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # no stringification of values for speed, we do not need/use the
            # actual values at this point, only the keys
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k in idxd:
                schema_fields[k] = wf.TEXT(stored=False,
                                           analyzer=SimpleAnalyzer())
            log_progress(lgr.info,
                         'idxschemabuild',
                         'Scanned dataset at %s',
                         res['path'],
                         update=1,
                         increment=True)
        log_progress(lgr.info, 'idxschemabuild', 'Done building search schema')

        self.schema = wf.Schema(**schema_fields)
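
The schema assembled here is later used to create the on-disk Whoosh index (see Example #9). A compact sketch of that step with plain Whoosh, using illustrative field names rather than the auto-detected DataLad keys:

import os
from whoosh import index as widx
from whoosh import fields as wf
from whoosh.analysis import SimpleAnalyzer

schema = wf.Schema(
    path=wf.ID(stored=True, unique=True),
    description=wf.TEXT(stored=False, analyzer=SimpleAnalyzer()))
os.makedirs('indexdir', exist_ok=True)   # whoosh needs an existing directory
idx_obj = widx.create_in('indexdir', schema)
writer = idx_obj.writer()
writer.add_document(path=u'sub/ds1', description=u'functional MRI acquisition')
writer.commit()
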
Example #4
    def _get_keys(self, mode=None):
        """Return keys and their statistics if mode != 'name'."""
        class key_stat:
            def __init__(self):
                self.ndatasets = 0  # how many datasets have this field
                self.uvals = set()

        from collections import defaultdict
        keys = defaultdict(key_stat)
        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # inject a few basic properties into the dict
            # analogous to what the other modes do in their index
            meta.update({
                k: res.get(k, None) for k in ('@id', 'type', 'path', 'parentds')
                # parentds is tricky: all files will have it, but the dataset
                # queried above might not (single dataset), let's force it in
                if k == 'parentds' or k in res})

            # no stringification of values for speed
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k, kvals in idxd.items():
                # TODO deal with conflicting definitions when available
                keys[k].ndatasets += 1
                if mode == 'name':
                    continue
                try:
                    kvals_set = assure_iter(kvals, set)
                except TypeError:
                    # TODO: maybe do show hashable ones???
                    nunhashable = sum(
                        isinstance(x, collections.Hashable) for x in kvals
                    )
                    kvals_set = {
                        'unhashable %d out of %d entries'
                        % (nunhashable, len(kvals))
                    }
                keys[k].uvals |= kvals_set
        return keys
Example #5
    def show_keys(self):
        # use a dict already, later we need to map to a definition
        keys = {}
        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # no stringification of values for speed
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k in idxd:
                # TODO deal with conflicting definitions when available
                keys[k] = None
        for k in sorted(keys):
            print(k)
Example #6
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == ensure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
Example #7
    def show_keys(self, mode=None):
        maxl = 100  # maximal line length for unique values in mode=short

        # use a dict already, later we need to map to a definition
        # meanwhile map to the values

        class key_stat:
            def __init__(self):
                self.ndatasets = 0  # how many datasets have this field
                self.uvals = set()

        from collections import defaultdict
        keys = defaultdict(key_stat)

        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # inject a few basic properties into the dict
            # analogous to what the other modes do in their index
            meta.update({
                k: res.get(k, None)
                for k in ('@id', 'type', 'path', 'parentds')
                # parentds is tricky: all files will have it, but the dataset
                # queried above might not (single dataset), let's force it in
                if k == 'parentds' or k in res
            })

            # no stringification of values for speed
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k, kvals in iteritems(idxd):
                # TODO deal with conflicting definitions when available
                keys[k].ndatasets += 1
                if mode == 'name':
                    continue
                try:
                    kvals_set = assure_iter(kvals, set)
                except TypeError:
                    # TODO: maybe do show hashable ones???
                    nunhashable = sum(
                        isinstance(x, collections.Hashable) for x in kvals)
                    kvals_set = {
                        'unhashable %d out of %d entries' %
                        (nunhashable, len(kvals))
                    }
                keys[k].uvals |= kvals_set

        for k in sorted(keys):
            if mode == 'name':
                print(k)
                continue

            # do a bit more
            stat = keys[k]
            uvals = stat.uvals
            if mode == 'short':
                # show only up to X uvals
                if len(stat.uvals) > 10:
                    uvals = {v for i, v in enumerate(uvals) if i < 10}
            # all unicode still scares yoh -- he will just use repr
            # def conv(s):
            #     try:
            #         return '{}'.format(s)
            #     except UnicodeEncodeError:
            #         return assure_unicode(s).encode('utf-8')
            stat.uvals_str = assure_unicode("{} unique values: {}".format(
                len(stat.uvals), ', '.join(map(repr, uvals))))
            if mode == 'short':
                if len(stat.uvals) > 10:
                    stat.uvals_str += ', ...'
                if len(stat.uvals_str) > maxl:
                    stat.uvals_str = stat.uvals_str[:maxl - 4] + ' ....'
            elif mode == 'full':
                pass
            else:
                raise ValueError(
                    "Unknown value for stats. Know full and short")

            print('{k}\n in  {stat.ndatasets} datasets\n has {stat.uvals_str}'.
                  format(k=k, stat=stat))
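
The per-key statistics gathered above boil down to counting datasets and collecting unique values per metadata key. A toy illustration of that accumulation pattern, independent of DataLad:

from collections import defaultdict

class KeyStat(object):
    def __init__(self):
        self.ndatasets = 0   # how many datasets carry this key
        self.uvals = set()   # unique values seen for this key

keys = defaultdict(KeyStat)
for dataset_meta in ({'license': 'CC0', 'author': 'a'}, {'license': 'PDDL'}):
    for k, v in dataset_meta.items():
        keys[k].ndatasets += 1
        keys[k].uvals.add(v)
for k in sorted(keys):
    print('{}\n in  {} datasets\n has {} unique values'.format(
        k, keys[k].ndatasets, len(keys[k].uvals)))
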
Example #8
    def __call__(self,
                 query,
                 max_nresults=None,
                 consider_ucn=False,
                 full_record=True):
        if max_nresults is None:
            # no limit by default
            max_nresults = 0
        query = self.get_query(query)

        nhits = 0
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            # produce a flattened metadata dict to search through
            doc = _meta2autofield_dict(meta,
                                       val2str=True,
                                       consider_ucn=consider_ucn)
            # inject a few basic properties into the dict
            # analog to what the other modes do in their index
            doc.update({
                k: res[k]
                for k in ('@id', 'type', 'path', 'parentds') if k in res
            })
            # use search instead of match to not just get hits at the start of the string
            # this will be slower, but avoids having to use actual regex syntax at the user
            # side even for simple queries
            # DOTALL is needed to handle multiline description fields and such, and still
            # be able to match content coming from a later field
            lgr.log(7, "Querying %s among %d items", query, len(doc))
            t0 = time()
            matches = {
                (q['query'] if isinstance(q, dict) else q, k):
                q['query'].search(v) if isinstance(q, dict) else q.search(v)
                for k, v in iteritems(doc) for q in query
                if not isinstance(q, dict) or q['field'].match(k)
            }
            dt = time() - t0
            lgr.log(7, "Finished querying in %f sec", dt)
            # retain what actually matched
            matched = {
                k[1]: match.group()
                for k, match in matches.items() if match
            }
            # implement AND behavior across query expressions, but OR behavior
            # across the fields matched by a single query expression; for
            # multiple queries this makes the result consistent with a query
            # that has no field specification
            if matched and len(query) == len(
                    set(k[0] for k in matches if matches[k])):
                hit = dict(
                    res,
                    action='search',
                    query_matched=matched,
                )
                yield hit
                nhits += 1
                if max_nresults and nhits == max_nresults:
                    # report query stats
                    topstr = '{} top {}'.format(
                        max_nresults,
                        single_or_plural('match', 'matches', max_nresults))
                    lgr.info(
                        "Reached the limit of {}, there could be more which "
                        "were not reported.".format(topstr))
                    break
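
The hit criterion in this example (the number of query expressions equals the number of distinct expressions that matched anything) implements AND across query expressions and OR across the fields each expression matched. A toy restatement of that set-based check, outside of DataLad:

# keys are (query_expression, field_name) pairs; values are match results or None
matches = {
    ('mri', 'description'): 'match',
    ('mri', 'name'): None,
    ('bold', 'name'): 'match',
}
query = ('mri', 'bold')                     # two query expressions
matched_exprs = set(q for (q, _field), m in matches.items() if m)
is_hit = len(query) == len(matched_exprs)   # True: both expressions matched somewhere
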
Example #9
    def _mk_search_index(self, force_reindex):
        """Generic entrypoint to index generation

        The actual work that determines the structure and content of the index
        is done by functions that are passed in as arguments

        `meta2doc` - must return dict for index document from result input
        """
        from whoosh import index as widx
        from .metadata import agginfo_relpath
        # what is the latest state of aggregated metadata
        metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
        # use location common to all index types, they would all invalidate
        # simultaneously
        stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
        index_dir = opj(self.index_dir, self._mode_label)

        if (not force_reindex) and \
                exists(index_dir) and \
                exists(stamp_fname) and \
                open(stamp_fname).read() == metadata_state:
            try:
                # TODO check that the index schema is the same
                # as the one we would have used for reindexing
                # TODO support incremental re-indexing, whoosh can do it
                idx = widx.open_dir(index_dir)
                lgr.debug('Search index contains %i documents',
                          idx.doc_count())
                self.idx_obj = idx
                return
            except widx.LockError as e:
                raise e
            except widx.IndexError as e:
                # Generic index error.
                # we try to regenerate
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e))
            except widx.IndexVersionError as e:  # (msg, version, release=None)
                # Raised when you try to open an index using a format that the
                # current version of Whoosh cannot read. That is, when the index
                # you're trying to open is either not backward or forward
                # compatible with this version of Whoosh.
                # we try to regenerate
                lgr.warning(exc_str(e))
                pass
            except widx.OutOfDateError as e:
                # Raised when you try to commit changes to an index which is not
                # the latest generation.
                # this should not happen here, but if it does ... KABOOM
                raise
            except widx.EmptyIndexError as e:
                # Raised when you try to work with an index that has no indexed
                # terms.
                # we can just continue with generating an index
                pass
            except ValueError as e:
                if 'unsupported pickle protocol' in str(e):
                    lgr.warning(
                        "Cannot open existing index %s (%s), will regenerate",
                        index_dir, exc_str(e))
                else:
                    raise

        lgr.info('{} search index'.format(
            'Rebuilding' if exists(index_dir) else 'Building'))

        if not exists(index_dir):
            os.makedirs(index_dir)

        # this is a pretty cheap call that just pulls this info from a file
        dsinfo = self.ds.metadata(get_aggregates=True,
                                  return_type='list',
                                  result_renderer='disabled')

        self._mk_schema(dsinfo)

        idx_obj = widx.create_in(index_dir, self.schema)
        idx = idx_obj.writer(
            # cache size per process
            limitmb=cfg.obtain('datalad.search.indexercachesize'),
            # disable parallel indexing for now till #1927 is resolved
            ## number of processes for indexing
            #procs=multiprocessing.cpu_count(),
            ## write separate index segments in each process for speed
            ## asks for writer.commit(optimize=True)
            #multisegment=True,
        )

        # load metadata of the base dataset and what it knows about all its subdatasets
        # (recursively)
        old_idx_size = 0
        old_ds_rpath = ''
        idx_size = 0
        log_progress(
            lgr.info,
            'autofieldidxbuild',
            'Start building search index',
            total=len(dsinfo),
            label='Building search index',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            doc = self._meta2doc(meta)
            admin = {
                'type': res['type'],
                'path': relpath(res['path'], start=self.ds.path),
            }
            if 'parentds' in res:
                admin['parentds'] = relpath(res['parentds'],
                                            start=self.ds.path)
            if admin['type'] == 'dataset':
                if old_ds_rpath:
                    lgr.debug(
                        'Added %s on dataset %s',
                        single_or_plural('document',
                                         'documents',
                                         idx_size - old_idx_size,
                                         include_count=True), old_ds_rpath)
                log_progress(lgr.info,
                             'autofieldidxbuild',
                             'Indexed dataset at %s',
                             old_ds_rpath,
                             update=1,
                             increment=True)
                old_idx_size = idx_size
                old_ds_rpath = admin['path']
                admin['id'] = res.get('dsid', None)

            doc.update({k: assure_unicode(v) for k, v in admin.items()})
            lgr.debug("Adding document to search index: {}".format(doc))
            # inject into index
            idx.add_document(**doc)
            idx_size += 1

        if old_ds_rpath:
            lgr.debug(
                'Added %s on dataset %s',
                single_or_plural('document',
                                 'documents',
                                 idx_size - old_idx_size,
                                 include_count=True), old_ds_rpath)

        lgr.debug("Committing index")
        idx.commit(optimize=True)
        log_progress(lgr.info, 'autofieldidxbuild',
                     'Done building search index')

        # "timestamp" the search index to allow for automatic invalidation
        with open(stamp_fname, 'w') as f:
            f.write(metadata_state)

        lgr.info('Search index contains %i documents', idx_size)
        self.idx_obj = idx_obj
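
The reuse-or-rebuild decision at the top of this method reduces to a single predicate over the stamp file. A compressed restatement, assuming metadata_state holds the current commit hash of the aggregated metadata:

import os

def index_is_current(index_dir, stamp_fname, metadata_state, force_reindex=False):
    # reuse the on-disk index only if it exists and was built from the very
    # aggregated-metadata state that is currently checked out
    if force_reindex or not (os.path.exists(index_dir) and os.path.exists(stamp_fname)):
        return False
    with open(stamp_fname) as f:
        return f.read() == metadata_state
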
Example #10
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
Example #11
def _get_search_index(index_dir, ds, force_reindex):
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = ds.repo.get_last_commit_hash(agginfo_relpath)
    stamp_fname = opj(index_dir, 'datalad_metadata_state')
    definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')

    if not force_reindex and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug('Search index contains %i documents', idx.doc_count())
            return idx
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            # TODO log this
            pass
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            lgr.warning(exc_str(e))
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise e
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    schema, definitions, per_ds_defs = _get_search_schema(ds)

    idx_obj = widx.create_in(index_dir, schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its subdatasets
    # (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    for res in query_aggregated_metadata(
            reporton=ds.config.obtain(
                'datalad.metadata.searchindex-documenttype'),
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            # TODO expose? but this would likely only affect metadata in the
            # base dataset
            merge_mode='init',
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        rpath = relpath(res['path'], start=ds.path)
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        rtype = res['type']
        meta = res.get('metadata', {})
        meta = MetadataDict(meta)
        if rtype == 'dataset':
            if old_ds_rpath:
                lgr.info(
                    'Added %s on dataset %s',
                    single_or_plural('document',
                                     'documents',
                                     idx_size - old_idx_size,
                                     include_count=True), old_ds_rpath)
            old_idx_size = idx_size
            old_ds_rpath = rpath

            # get any custom dataset mappings
            ds_defs = per_ds_defs.get(res['path'], {})
            # now we merge all reported unique content properties (flattened representation
            # of content metadata) with the main metadata set, using the 'add' strategy
            # this way any existing metadata value of a dataset itself will be amended by
            # those coming from the content. E.g. a single dataset 'license' might be turned
            # into a sequence of unique license identifiers across all dataset components
            meta.merge_add(meta.get('unique_content_properties', {}))
            meta.pop('unique_content_properties', None)
        doc_props = dict(path=rpath,
                         type=rtype,
                         **_meta2index_dict(meta, definitions, ds_defs))
        if 'parentds' in res:
            doc_props['parentds'] = relpath(res['parentds'], start=ds.path)
        _add_document(idx, **doc_props)
        idx_size += 1

    if old_ds_rpath:
        lgr.info(
            'Added %s on dataset %s',
            single_or_plural('document',
                             'documents',
                             idx_size - old_idx_size,
                             include_count=True), old_ds_rpath)

    lgr.debug("Committing index")
    idx.commit(optimize=True)

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    # dump the term/field definitions records for later introspection
    # use compressed storage, there is no point in inflating the
    # diskspace requirements
    lgr.debug("Storing definitions to %s", definitions_fname)
    with gzopen(definitions_fname, 'wb') as f:
        # TODO actually go through all, incl. compound, definitions ('@id' plus 'unit'
        # or similar) and resolve terms to URLs, if anyhow possible
        jsondump2file(definitions, f)

    lgr.info('Search index contains %i documents', idx_size)
    return idx_obj
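
gzopen and jsondump2file are DataLad-internal helpers that are not shown on this page. A hedged sketch of the compressed definitions dump they appear to perform, using only the standard library:

import gzip
import json

# assumption: definitions is a JSON-serializable dict of term definitions
with gzip.open('datalad_term_definitions.json.gz', 'wt', encoding='utf-8') as f:
    json.dump({'path': 'path name of an entity relative to the searched base dataset'}, f)
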
Example #12
def _get_search_schema(ds):
    from whoosh import fields as wf

    # haven for terms that have been found to be undefined
    # (for faster decision-making upon next encounter)
    undef = set()
    # this will harvest all discovered term definitions
    definitions = {
        '@id': 'unique identifier of an entity',
        # TODO make proper JSON-LD definition
        'path': 'path name of an entity relative to the searched base dataset',
        # TODO make proper JSON-LD definition
        'parentds': 'path of the dataset that contains an entity',
        # 'type' will not come from a metadata field, hence will not be detected
        'type': {
            '@id':
            _resolve_term(common_defs['type']['def'], {}, common_defs, undef),
            'description':
            common_defs['type']['descr']
        },
    }

    schema_fields = {
        n: wf.ID(stored=True, unique=n == '@id')
        for n in definitions
    }
    # this will contain any dataset-specific term mappings, in case we find
    # non-unique keys that are differently defined
    per_ds_defs = {}
    ds_defs = {}

    lgr.info('Scanning for metadata keys')
    # quick 1st pass over all datasets to gather the needed schema fields
    # sanitization of / should ideally be done while saving, but that would require
    # fixes in whoosh I guess
    sanitize_key = lambda k: k.replace(' ', '_').replace('/', '_')
    for res in query_aggregated_metadata(
            reporton='datasets',
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            merge_mode='init',
            recursive=True):
        ds_defs = {}
        meta = res.get('metadata', {})
        for k, v in meta.get('@context', {}).items():
            k = sanitize_key(k)
            if k not in definitions or definitions[k] == v:
                # this is new, but unique, or uniformly defined
                definitions[k] = v
            else:
                # non-unique key (across all seen datasets)
                # make unique
                # TODO we have to deal with @vocab fields in here, those
                # might be different when some aggregated metadata was
                # generated with an old version of datalad
                # in this case we should actually load the old vocabulary
                #set.add(', '.join(i for i in v) if isinstance(v, (tuple, list)) else v)
                # and perform the mapping to the current one in here
                count = 0
                uk = k
                while uk in definitions:
                    if definitions[uk] == v:
                        break  # already exists and matches
                    count += 1
                    uk = '{}_{}'.format(k, count)
                ds_defs[k] = k = uk
            definitions[k] = v
            # we register a field for any definition in the context.
            # while this has the potential to needlessly blow up the
            # index size, the only alternative would be to iterate over
            # all content metadata in this first pass too, in order to
            # do a full scan.
            if k == '@vocab' or isinstance(v, dict) and v.get(
                    'type', None) == vocabulary_id:
                continue
            schema_fields[k] = wf.TEXT(stored=True)
        if ds_defs:
            # store ds-specific mapping for the second pass that actually
            # generates the search index
            per_ds_defs[res['path']] = ds_defs

        # anything that is a direct metadata key or is reported as being a content metadata
        # key is a valid candidate for inclusion into the schema
        cand_keys = list(meta)
        cand_keys.extend(meta.get('unique_content_properties', []))
        # need a copy, we are going to reformat keys of ad-hoc defs
        final_defs = dict(definitions)
        for k in cand_keys:
            k = sanitize_key(k)
            if k in ('unique_content_properties', '@context'):
                # those are just means for something else and irrelevant
                # for searches
                continue
            # check if we have any kind of definitions for this key
            if k not in definitions:
                termdef = _resolve_term(k, definitions, common_defs, undef)
                if termdef is None:
                    # we know nothing about this key, ignore
                    lgr.debug("Ignoring term '%s', no definition found", k)
                    continue
                final_defs[k] = termdef
                # TODO treat keywords/tags separately
                schema_fields[k] = wf.TEXT(stored=True)
            else:
                if isinstance(definitions[k], dict):
                    final_defs[k] = {
                        k_ if k_ == '@id' else '{} ({})'.format(
                            k_,
                            _resolve_term(k_, definitions, common_defs,
                                          undef)):
                        _resolve_term(v, definitions, common_defs, undef)
                        if k_ in ('@id', 'unit') else v
                        for k_, v in definitions[k].items()
                        if v  # skip if value is empty
                    }

    schema = wf.Schema(**schema_fields)
    return schema, final_defs, per_ds_defs
Example #13
def extract(ds, output_directory, repository_info=None):
    if pd is None:
        lgr.error(
            "This plugin requires Pandas to be available (error follows)")
        import pandas
        return

    # collect infos about dataset and ISATAB structure for use in investigator
    # template
    info = {}
    if not exists(output_directory):
        lgr.info("creating output directory at '{}'".format(output_directory))
        os.makedirs(output_directory)

    # pull out everything we know about any file in the dataset, and the dataset
    # itself
    metadb = {
        relpath(r['path'], ds.path): r.get('metadata', {})
        for r in query_aggregated_metadata(
            'all', ds, [dict(path=ds.path, type='dataset')], 'init')
    }

    # prep for assay table info
    protocols = OrderedDict()
    for prop in assay_props:
        info[prop] = []

    # pull out essential metadata bits about the dataset itself
    # (for study description)
    dsmeta = metadb.get('.', {})
    info['name'] = dsmeta.get('shortdescription', dsmeta.get('name', 'TODO'))
    info['author'] = '\t'.join(assure_list(dsmeta.get('author', [])))
    info['keywords'] = '\t'.join(assure_list(dsmeta.get('tag', [])))
    # generate: s_study.txt
    study_df = _get_study_df(ds)
    if study_df.empty:
        # no samples, no assays, no metadataset
        return None

    _gather_protocol_parameters_from_df(study_df, protocols)
    _store_beautiful_table(study_df, output_directory, "s_study.txt")
    info['studytab_filename'] = 's_study.txt'

    deface_df = None
    # all imaging modalities recognized in BIDS
    #TODO maybe fold 'defacemask' into each modality as a derivative
    for modality in ('defacemask', 'T1w', 'T2w', 'T1map', 'T2map', 'FLAIR',
                     'FLASH', 'PD', 'PDmap', 'PDT2', 'inplaneT1', 'inplaneT2',
                     'angio', 'sbref', 'bold', 'SWImagandphase'):
        # what files do we have for this modality
        modfiles = _get_file_matches(metadb,
                                     '^sub-.*_{}\.nii\.gz$'.format(modality))
        if not len(modfiles):
            # no files found, try next
            lgr.info(
                "no files match MRI modality '{}', skipping".format(modality))
            continue

        df = _get_assay_df(metadb, modality, "Magnetic Resonance Imaging",
                           modfiles, _describe_file, repository_info)
        if df is None:
            continue
        if modality == 'defacemask':
            # rename columns to strip index
            df.columns = [c[6:] for c in df.columns]
            df.rename(columns={'Raw Data File': 'Derived Data File'},
                      inplace=True)
            df.drop(['Assay Name', 'Sample Name'] +
                    [c for c in df.columns if c.startswith('Factor')],
                    axis=1,
                    inplace=True)
            deface_df = df
            # re-prefix for merge logic compatibility below
            deface_df.columns = [
                _get_colkey(i, c) for i, c in enumerate(df.columns)
            ]
            # do not save separate, but include into the others as a derivative
            continue
        elif deface_df is not None:
            # get any factor columns, put last in final table
            factors = []
            # find where they start
            for i, c in enumerate(df.columns):
                if '_Factor Value[' in c:
                    factors = df.columns[i:]
                    break
            factor_df = df[factors]
            df.drop(factors, axis=1, inplace=True)
            # merge relevant rows from deface df (hstack), by matching assay name
            df = df.join(deface_df, rsuffix='_deface')
            df.columns = [
                c[:-7] if c.endswith('_deface') else c for c in df.columns
            ]
            # cannot have overlapping columns, we removed the factor before
            df = df.join(factor_df)
        # rename columns to strip index
        df.columns = [c[6:] for c in df.columns]
        # parse df to gather protocol info
        _gather_protocol_parameters_from_df(df, protocols)
        # store
        assay_fname = "a_mri_{}.txt".format(modality.lower())
        _store_beautiful_table(df, output_directory, assay_fname)
        info['assay_fname'].append(assay_fname)
        info['assay_techtype'].append('nuclear magnetic resonance')
        info['assay_techtype_term'].append('OBI:0000182')
        info['assay_techtype_termsrc'].append('OBI')
        info['assay_measurementtype'].append('MRI Scanner')
        info['assay_measurementtype_term'].append('ERO:MRI_Scanner')
        info['assay_measurementtype_termsrc'].append('ERO')

    # non-MRI modalities
    for modlabel, assaylabel, protoref in (('physio', 'physio',
                                            "Physiological Measurement"),
                                           ('stim', 'stimulation',
                                            "Stimulation")):
        df = _get_assay_df(
            metadb, modlabel, protoref,
            _get_file_matches(metadb, '^sub-.*_{}.tsv.gz$'.format(modlabel)),
            _describe_file, repository_info)
        if df is None:
            continue
        # rename columns to strip index
        df.columns = [c[6:] for c in df.columns]
        assay_fname = "a_{}.txt".format(assaylabel)
        _store_beautiful_table(df, output_directory, assay_fname)
        info['assay_fname'].append(assay_fname)
        # ATM we cannot say anything definitive about these
        info['assay_techtype'].append('TODO')
        info['assay_techtype_term'].append('TODO')
        info['assay_techtype_termsrc'].append('TODO')
        info['assay_measurementtype'].append(assaylabel)
        info['assay_measurementtype_term'].append('TODO')
        info['assay_measurementtype_termsrc'].append('TODO')

    # post-proc assay-props for output
    for prop in assay_props:
        info[prop] = '\t'.join(assure_list(info[prop]))

    info['protocol_name'] = '\t'.join(protocols.keys())
    for k in ('type', 'term', 'termsrc'):
        info['protocol_{}'.format(k)] = '\t'.join(
            protocol_defs.get(p, {}).get(k, 'TODO') for p in protocols)
    info['protocol_parameters'] = '\t'.join('; '.join(sorted(protocols[p]))
                                            for p in protocols)
    return info
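
_get_assay_df, _store_beautiful_table, and the other underscore helpers belong to the ISA-Tab plugin and are not reproduced on this page. As a hedged sketch of what a writer like _store_beautiful_table could amount to, assuming ISA-Tab's tab-separated layout and pandas:

import os
import pandas as pd

def store_table(df, output_directory, fname):
    # ISA-Tab study/assay tables are tab-separated text files
    os.makedirs(output_directory, exist_ok=True)
    df.to_csv(os.path.join(output_directory, fname), sep='\t', index=False)

store_table(pd.DataFrame({'Sample Name': ['sub-01'],
                          'Raw Data File': ['sub-01_T1w.nii.gz']}),
            'isatab_out', 's_study.txt')
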