Exemple #1
0
def _search_from_virgin_install(dataset, query):
    #
    # this is to be nice to newbies
    #
    exc_info = sys.exc_info()
    if dataset is None:
        if not ui.is_interactive:
            raise NoDatasetArgumentFound(
                "No DataLad dataset found. Specify a dataset to be "
                "searched, or run interactively to get assistance "
                "installing a queriable superdataset."
            )
        # none was provided so we could ask user whether he possibly wants
        # to install our beautiful mega-duper-super-dataset?
        # TODO: following logic could possibly benefit other actions.
        DEFAULT_DATASET_PATH = cfg.obtain('datalad.locations.default-dataset')
        if os.path.exists(DEFAULT_DATASET_PATH):
            default_ds = Dataset(DEFAULT_DATASET_PATH)
            if default_ds.is_installed():
                if ui.yesno(
                    title="No DataLad dataset found at current location",
                    text="Would you like to search the DataLad "
                         "superdataset at %r?"
                          % DEFAULT_DATASET_PATH):
                    pass
                else:
                    raise exc_info[1]
            else:
                raise NoDatasetArgumentFound(
                    "No DataLad dataset found at current location. "
                    "The DataLad superdataset location %r exists, "
                    "but does not contain an dataset."
                    % DEFAULT_DATASET_PATH)
        elif ui.yesno(
                title="No DataLad dataset found at current location",
                text="Would you like to install the DataLad "
                     "superdataset at %r?"
                     % DEFAULT_DATASET_PATH):
            from datalad.api import install
            default_ds = install(DEFAULT_DATASET_PATH, source='///')
            ui.message(
                "From now on you can refer to this dataset using the "
                "label '///'"
            )
        else:
            raise exc_info[1]

        lgr.info(
            "Performing search using DataLad superdataset %r",
            default_ds.path
        )
        for res in default_ds.search(query):
            yield res
        return
    else:
        raise  # this function is called within exception handling block
Exemple #2
0
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=None,
                 mode=None,
                 full_record=False,
                 show_keys=None,
                 show_query=False):
        try:
            ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        if mode is None:
            # let's get inspired by what the dataset/user think is
            # default
            mode = ds.config.obtain('datalad.search.default-mode')

        if mode == 'egrep':
            searcher = _EGrepSearch
        elif mode == 'egrepcs':
            searcher = _EGrepCSSearch
        elif mode == 'textblob':
            searcher = _BlobSearch
        elif mode == 'autofield':
            searcher = _AutofieldSearch
        else:
            raise ValueError(
                'unknown search mode "{}"'.format(mode))

        searcher = searcher(ds, force_reindex=force_reindex)

        if show_keys:
            searcher.show_keys(show_keys)
            return

        if not query:
            return

        if show_query:
            print(repr(searcher.get_query(query)))
            return

        nhits = 0
        for r in searcher(
                query,
                max_nresults=max_nresults,
                full_record=full_record):
            nhits += 1
            yield r
        if not nhits:
            lgr.info(searcher.get_nohits_msg() or 'no hits')
Exemple #3
0
def _search_from_virgin_install(dataset, query):
    #
    # this is to be nice to newbies
    #
    exc_info = sys.exc_info()
    if dataset is None:
        if not ui.is_interactive:
            raise NoDatasetArgumentFound(
                "No DataLad dataset found. Specify a dataset to be "
                "searched, or run interactively to get assistance "
                "installing a queriable superdataset.")
        # none was provided so we could ask user either he possibly wants
        # to install our beautiful mega-duper-super-dataset?
        # TODO: following logic could possibly benefit other actions.
        if os.path.exists(LOCAL_CENTRAL_PATH):
            central_ds = Dataset(LOCAL_CENTRAL_PATH)
            if central_ds.is_installed():
                if ui.yesno(
                        title="No DataLad dataset found at current location",
                        text="Would you like to search the DataLad "
                        "superdataset at %r?" % LOCAL_CENTRAL_PATH):
                    pass
                else:
                    reraise(*exc_info)
            else:
                raise NoDatasetArgumentFound(
                    "No DataLad dataset found at current location. "
                    "The DataLad superdataset location %r exists, "
                    "but does not contain an dataset." % LOCAL_CENTRAL_PATH)
        elif ui.yesno(title="No DataLad dataset found at current location",
                      text="Would you like to install the DataLad "
                      "superdataset at %r?" % LOCAL_CENTRAL_PATH):
            from datalad.api import install
            central_ds = install(LOCAL_CENTRAL_PATH, source='///')
            ui.message("From now on you can refer to this dataset using the "
                       "label '///'")
        else:
            reraise(*exc_info)

        lgr.info("Performing search using DataLad superdataset %r",
                 central_ds.path)
        for res in central_ds.search(query):
            yield res
        return
    else:
        raise
Exemple #4
0
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument,
    or based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      Or raises an exception (InsufficientArgumentsError).
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)

    assert(dataset is not None)
    lgr.debug("Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '',
        dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset
Exemple #5
0
    def __call__(match,
                 dataset=None,
                 search=None,
                 report=None,
                 report_matched=False,
                 format='custom',
                 regex=False):
        """
        Yields
        ------
        location : str
            (relative) path to the dataset
        report : dict
            fields which were requested by `report` option
        """

        lgr.debug("Initiating search for match=%r and dataset %r",
                  match, dataset)
        try:
            ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            exc_info = sys.exc_info()
            if dataset is None:
                if not ui.is_interactive:
                    raise NoDatasetArgumentFound(
                        "No DataLad dataset found. Specify a dataset to be "
                        "searched, or run interactively to get assistance "
                        "installing a queriable superdataset."
                    )
                # none was provided so we could ask user either he possibly wants
                # to install our beautiful mega-duper-super-dataset?
                # TODO: following logic could possibly benefit other actions.
                if os.path.exists(LOCAL_CENTRAL_PATH):
                    central_ds = Dataset(LOCAL_CENTRAL_PATH)
                    if central_ds.is_installed():
                        if ui.yesno(
                            title="No DataLad dataset found at current location",
                            text="Would you like to search the DataLad "
                                 "superdataset at %r?"
                                  % LOCAL_CENTRAL_PATH):
                            pass
                        else:
                            reraise(*exc_info)
                    else:
                        raise NoDatasetArgumentFound(
                            "No DataLad dataset found at current location. "
                            "The DataLad superdataset location %r exists, "
                            "but does not contain an dataset."
                            % LOCAL_CENTRAL_PATH)
                elif ui.yesno(
                        title="No DataLad dataset found at current location",
                        text="Would you like to install the DataLad "
                             "superdataset at %r?"
                             % LOCAL_CENTRAL_PATH):
                    from datalad.api import install
                    central_ds = install(LOCAL_CENTRAL_PATH, source='///')
                    ui.message(
                        "From now on you can refer to this dataset using the "
                        "label '///'"
                    )
                else:
                    reraise(*exc_info)

                lgr.info(
                    "Performing search using DataLad superdataset %r",
                    central_ds.path
                )
                for res in central_ds.search(
                        match,
                        search=search, report=report,
                        report_matched=report_matched,
                        format=format, regex=regex):
                    yield res
                return
            else:
                raise

        cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache')
        mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL)

        meta = None
        if os.path.exists(mcache_fname):
            lgr.debug("use cached metadata of '{}' from {}".format(ds, mcache_fname))
            meta, checksum = pickle.load(open(mcache_fname, 'rb'))
            # TODO add more sophisticated tests to decide when the cache is no longer valid
            if checksum != ds.repo.get_hexsha():
                # errrr, try again below
                meta = None

        # don't put in 'else', as yet to be written tests above might fail and require
        # regenerating meta data
        if meta is None:
            lgr.info("Loading and caching local meta-data... might take a few seconds")
            if not exists(cache_dir):
                os.makedirs(cache_dir)

            meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                                ignore_cache=False)
            # merge all info on datasets into a single dict per dataset
            meta = flatten_metadata_graph(meta)
            # extract graph, if any
            meta = meta.get('@graph', meta)
            # build simple queriable representation
            if not isinstance(meta, list):
                meta = [meta]

            # sort entries by location (if present)
            sort_keys = ('location', 'description', 'id')
            # note with str() instead of '%' getting encoding issues...
            meta = sorted(meta, key=lambda m: tuple("%s" % (m.get(x, ""),) for x in sort_keys))

            # use pickle to store the optimized graph in the cache
            pickle.dump(
                # graph plus checksum from what it was built
                (meta, ds.repo.get_hexsha()),
                open(mcache_fname, 'wb'))
            lgr.debug("cached meta data graph of '{}' in {}".format(ds, mcache_fname))

        if report in ('', ['']):
            report = []
        elif report and not isinstance(report, list):
            report = [report]

        match = assure_list(match)
        search = assure_list(search)
        # convert all to lower case for case insensitive matching
        search = {x.lower() for x in search}

        def get_in_matcher(m):
            """Function generator to provide closure for a specific value of m"""
            mlower = m.lower()

            def matcher(s):
                return mlower in s.lower()
            return matcher

        matchers = [
            re.compile(match_).search
            if regex
            else get_in_matcher(match_)
            for match_ in match
        ]

        # location should be reported relative to current location
        # We will assume that noone chpwd while we are yielding
        ds_path_prefix = get_path_prefix(ds.path)

        # So we could provide a useful message whenever there were not a single
        # dataset with specified `--search` properties
        observed_properties = set()

        # for every meta data set
        for mds in meta:
            hit = False
            hits = [False] * len(matchers)
            matched_fields = set()
            if not mds.get('type', mds.get('schema:type', None)) == 'Dataset':
                # we are presently only dealing with datasets
                continue
            # TODO consider the possibility of nested and context/graph dicts
            # but so far we were trying to build simple lists of dicts, as much
            # as possible
            if not isinstance(mds, dict):
                raise NotImplementedError("nested meta data is not yet supported")

            # manual loop for now
            for k, v in iteritems(mds):
                if search:
                    k_lower = k.lower()
                    if k_lower not in search:
                        if observed_properties is not None:
                            # record for providing a hint later
                            observed_properties.add(k_lower)
                        continue
                    # so we have a hit, no need to track
                    observed_properties = None
                if isinstance(v, dict) or isinstance(v, list):
                    v = text_type(v)
                for imatcher, matcher in enumerate(matchers):
                    if matcher(v):
                        hits[imatcher] = True
                        matched_fields.add(k)
                if all(hits):
                    hit = True
                    # no need to do it longer than necessary
                    if not report_matched:
                        break

            if hit:
                location = mds.get('location', '.')
                report_ = matched_fields.union(report if report else {}) \
                    if report_matched else report
                if report_ == ['*']:
                    report_dict = mds
                elif report_:
                    report_dict = {k: mds[k] for k in report_ if k in mds}
                    if report_ and not report_dict:
                        lgr.debug(
                            'meta data match for %s, but no to-be-reported '
                            'properties (%s) found. Present properties: %s',
                            location, ", ".join(report_), ", ".join(sorted(mds))
                        )
                else:
                    report_dict = {}  # it was empty but not None -- asked to
                    # not report any specific field
                if isinstance(location, (list, tuple)):
                    # could be that the same dataset installed into multiple
                    # locations. For now report them separately
                    for l in location:
                        yield opj(ds_path_prefix, l), report_dict
                else:
                    yield opj(ds_path_prefix, location), report_dict

        if search and observed_properties is not None:
            import difflib
            suggestions = {
                s: difflib.get_close_matches(s, observed_properties)
                for s in search
            }
            suggestions_str = "\n ".join(
                "%s for %s" % (", ".join(choices), s)
                for s, choices in iteritems(suggestions) if choices
            )
            lgr.warning(
                "Found no properties which matched one of the one you "
                "specified (%s).  May be you meant one among: %s.\n"
                "Suggestions:\n"
                " %s",
                ", ".join(search),
                ", ".join(observed_properties),
                suggestions_str if suggestions_str.strip() else "none"
            )
Exemple #6
0
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=20,
                 show_keys=False,
                 show_query=False):
        from whoosh import qparser as qparse

        try:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        # where does the bunny have the eggs?
        index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad',
                        'search_index')

        idx_obj = _get_search_index(index_dir, ds, force_reindex)

        if show_keys:
            definitions_fname = opj(index_dir,
                                    'datalad_term_definitions.json.gz')
            try:
                defs = jsonload(gzopen(definitions_fname))
            except Exception as e:
                lgr.warning(
                    'No term definitions found alongside search index: %s',
                    exc_str(e))
                defs = {}

            for k in idx_obj.schema.names():
                print('{}{}'.format(
                    k,
                    ' {}'.format(defs[k] if isinstance(defs[k], dict) else
                                 '({})'.format(defs[k])) if k in defs else ''))
            return

        if not query:
            return

        with idx_obj.searcher() as searcher:
            # parse the query string, default whoosh parser ATM, could be
            # tailored with plugins
            parser = qparse.MultifieldParser(idx_obj.schema.names(),
                                             idx_obj.schema)
            # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
            # upstream
            parser.add_plugin(qparse.FuzzyTermPlugin())
            parser.add_plugin(qparse.GtLtPlugin())
            # replace field defintion to allow for colons to be part of a field's name:
            parser.replace_plugin(
                qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
            # for convenience we accept any number of args-words from the
            # shell and put them together to a single string here
            querystr = ' '.join(assure_list(query))
            # this gives a formal whoosh query
            wquery = parser.parse(querystr)

            if show_query:
                print(wquery)
                return
            # perform the actual search
            hits = searcher.search(
                wquery,
                terms=True,
                limit=max_nresults if max_nresults > 0 else None)
            # cheap way to get an approximate number of hits, without an expensive
            # scoring of all items
            # disabled: unreliable estimate, often confusing
            #nhits = hits.estimated_min_length()
            # report query stats
            topstr = '{} top {}'.format(
                max_nresults, single_or_plural('match', 'matches',
                                               max_nresults))
            lgr.info('Query completed in {} sec.{}'.format(
                hits.runtime, ' Reporting {}.'.format((
                    'up to ' + topstr) if max_nresults > 0 else 'all matches')
                if not hits.is_empty() else ' No matches.'))

            if not hits:
                return

            nhits = 0
            for hit in hits:
                res = dict(
                    action='search',
                    status='ok',
                    logger=lgr,
                    refds=ds.path,
                    # normpath to avoid trailing dot
                    path=normpath(opj(ds.path, hit['path'])),
                    query_matched={
                        assure_unicode(k): assure_unicode(v) if isinstance(
                            v, unicode_srctypes) else v
                        for k, v in hit.matched_terms()
                    },
                    metadata={
                        k: v
                        for k, v in hit.fields().items()
                        if k not in ('path', 'parentds')
                    })
                if 'parentds' in hit:
                    res['parentds'] = normpath(opj(ds.path, hit['parentds']))
                yield res
                nhits += 1

            if max_nresults and nhits == max_nresults:
                lgr.info("Reached the limit of {}, there could be more which "
                         "were not reported.".format(topstr))