Example #1
def load(fname, fixup=True, **kw):
    """Load JSON from a file, possibly fixing it up if initial load attempt fails

    Parameters
    ----------
    fixup : bool
      If the initial load fails, apply a set of fixups in the hope of
      resolving issues in the JSON
    **kw
      Passed on to the underlying load (and, after fixups, loads) function
    """
    with io.open(fname, 'r', encoding='utf-8') as f:
        try:
            return jsonload(f, **kw)
        except JSONDecodeError as exc:
            if not fixup:
                raise
            lgr.warning(
                "Failed to decode content in %s: %s. Trying a few tricks",
                fname, exc_str(exc))
            # keep a reference to the exception: `exc` is unbound once the
            # except block exits, and a bare `raise` after the `with` block
            # would have no active exception to re-raise
            decode_exc = exc

    # Load the entire content and replace common "abusers": characters which
    # break JSON parsing but are otherwise harmless
    with io.open(fname, 'r', encoding='utf-8') as f:
        s_orig = s = f.read()

    for o, r in {
            u"\xa0": " ",  # non-breaking space
    }.items():
        s = s.replace(o, r)

    if s == s_orig:
        # nothing changed, so re-raise the original decode error
        raise decode_exc
    return loads(s, **kw)
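
The excerpt above relies on several module-level names that the listing does not show. A minimal sketch of one possible setup, using the standard library as a stand-in for whatever the original module actually imports (jsonload, loads, JSONDecodeError, LZMAFile, lgr and exc_str are the names the examples assume; the exc_str helper shown here is hypothetical):

import codecs
import io
import logging
from json import JSONDecodeError
from json import load as jsonload
from json import loads
from lzma import LZMAFile

lgr = logging.getLogger(__name__)


def exc_str(exc):
    # hypothetical helper: render an exception as a short string for log messages
    return '{}: {}'.format(exc.__class__.__name__, exc)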
Example #2
def load(fname, fixup=True, **kw):
    """Load JSON from a file, possibly fixing it up if initial load attempt fails

    Parameters
    ----------
    fixup : bool
      If the initial load fails, apply a set of fixups in the hope of
      resolving issues in the JSON
    **kw
      Passed on to the underlying load (and, after fixups, loads) function
    """
    with io.open(fname, 'r', encoding='utf-8') as f:
        try:
            return jsonload(f, **kw)
        except JSONDecodeError as exc:
            if not fixup:
                raise
            lgr.warning("Failed to decode content in %s: %s. Trying a few tricks", fname, exc_str(exc))

            # Load the entire content and replace common "abusers": characters
            # which break JSON parsing but are otherwise harmless
            with io.open(fname, 'r', encoding='utf-8') as f:
                s_orig = s = f.read()

            for o, r in {
                u"\xa0": " ",  # non-breaking space
            }.items():
                s = s.replace(o, r)

            if s == s_orig:
                # nothing changed, so just re-raise the original exception
                raise
            return loads(s, **kw)
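
A short usage sketch, assuming the setup shown after Example #1 and a hypothetical file name; with fixup=True (the default) a failed parse is retried after the substitutions, while fixup=False re-raises the original decode error immediately:

try:
    meta = load('metadata.json')                 # retried with fixups on failure
    strict = load('metadata.json', fixup=False)  # decode errors propagate as-is
except JSONDecodeError as exc:
    print('unparseable even after fixups:', exc)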
Example #3
def load(fname, fixup=True, compressed=None, **kw):
    """Load JSON from a file, possibly fixing it up if initial load attempt fails

    Parameters
    ----------
    fixup : bool
      If the initial load fails, apply a set of fixups in the hope of
      resolving issues in the JSON
    compressed : bool or None
      Flag whether to treat the file as XZ compressed. If None, this decision
      is made automatically based on the presence of a '.xz' extension in the
      filename
    **kw
      Passed on to the underlying load (and, after fixups, loads) function
    """
    _open = LZMAFile \
        if compressed or (compressed is None and fname.endswith('.xz')) \
        else io.open

    with _open(fname, 'rb') as f:
        try:
            jreader = codecs.getreader('utf-8')(f)
            return jsonload(jreader, **kw)
        except JSONDecodeError as exc:
            if not fixup:
                raise
            lgr.warning(
                "Failed to decode content in %s: %s. Trying a few tricks",
                fname, exc_str(exc))

            # Load the entire content and replace common "abusers": characters
            # which break JSON parsing but are otherwise harmless
            with _open(fname, 'rb') as f:
                s_orig = s = codecs.getreader('utf-8')(f).read()

            for o, r in {
                    u"\xa0": " ",  # non-breaking space
            }.items():
                s = s.replace(o, r)

            if s == s_orig:
                # nothing changed, so just re-raise the original exception
                raise
            return loads(s, **kw)
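
What this variant adds is the transparent XZ handling: the opener is chosen from the extension, the file is opened in binary mode, and the byte stream is decoded through codecs.getreader('utf-8'). A self-contained sketch of that pattern using only the standard library (the temporary file and its content are invented for the demonstration):

import codecs
import json
import lzma
import os
import tempfile

# write a small XZ-compressed JSON file to demonstrate against
path = os.path.join(tempfile.mkdtemp(), 'meta.json.xz')
with lzma.LZMAFile(path, 'wb') as f:
    f.write(json.dumps({'name': 'demo'}).encode('utf-8'))

# the same pattern as in the example: pick the opener from the extension,
# open in binary mode, and decode through a UTF-8 stream reader
_open = lzma.LZMAFile if path.endswith('.xz') else open
with _open(path, 'rb') as f:
    print(json.load(codecs.getreader('utf-8')(f)))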
Example #4
def load(fname, fixup=True, compressed=None, **kw):
    """Load JSON from a file, possibly fixing it up if initial load attempt fails

    Parameters
    ----------
    fixup : bool
      If the initial load fails, apply a set of fixups in the hope of
      resolving issues in the JSON
    compressed : bool or None
      Flag whether to treat the file as XZ compressed. If None, this decision
      is made automatically based on the presence of a '.xz' extension in the
      filename
    **kw
      Passed on to the underlying load (and, after fixups, loads) function
    """
    _open = LZMAFile \
        if compressed or (compressed is None and fname.endswith('.xz')) \
        else io.open

    with _open(fname, 'rb') as f:
        try:
            jreader = codecs.getreader('utf-8')(f)
            return jsonload(jreader, **kw)
        except JSONDecodeError as exc:
            if not fixup:
                raise
            lgr.warning("Failed to decode content in %s: %s. Trying a few tricks", fname, exc_str(exc))

            # Load the entire content and replace common "abusers": characters
            # which break JSON parsing but are otherwise harmless
            with _open(fname, 'rb') as f:
                s_orig = s = codecs.getreader('utf-8')(f).read()

            for o, r in {
                u"\xa0": " ",  # non-breaking space
            }.items():
                s = s.replace(o, r)

            if s == s_orig:
                # nothing changed, so just re-raise the original exception
                raise
            return loads(s, **kw)
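
The fixup itself is tiny: the only "abuser" handled in these examples is the non-breaking space (U+00A0), which the JSON grammar does not accept as whitespace between tokens. A standalone illustration with the standard-library json module (the broken snippet is invented for the demonstration):

import json

# a non-breaking space sits between the colon and the value
broken = u'{"title":\xa0"demo"}'
try:
    json.loads(broken)
except json.JSONDecodeError:
    fixed = broken.replace(u'\xa0', ' ')
    print(json.loads(fixed))  # {'title': 'demo'}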
Example #5
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=20,
                 show_keys=False,
                 show_query=False):
        from whoosh import qparser as qparse

        try:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        # the search index is kept inside the dataset's git directory
        index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad',
                        'search_index')

        idx_obj = _get_search_index(index_dir, ds, force_reindex)

        if show_keys:
            definitions_fname = opj(index_dir,
                                    'datalad_term_definitions.json.gz')
            try:
                defs = jsonload(gzopen(definitions_fname))
            except Exception as e:
                lgr.warning(
                    'No term definitions found alongside search index: %s',
                    exc_str(e))
                defs = {}

            # list every indexed key, followed by its definition when one is known
            for k in idx_obj.schema.names():
                print('{}{}'.format(
                    k,
                    ' {}'.format(defs[k] if isinstance(defs[k], dict) else
                                 '({})'.format(defs[k])) if k in defs else ''))
            return

        if not query:
            return

        with idx_obj.searcher() as searcher:
            # parse the query string, default whoosh parser ATM, could be
            # tailored with plugins
            parser = qparse.MultifieldParser(idx_obj.schema.names(),
                                             idx_obj.schema)
            # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
            # upstream
            parser.add_plugin(qparse.FuzzyTermPlugin())
            parser.add_plugin(qparse.GtLtPlugin())
            # replace the field definition to allow colons to be part of a field's name:
            parser.replace_plugin(
                qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
            # for convenience we accept any number of query words from the
            # shell and join them into a single string here
            querystr = ' '.join(assure_list(query))
            # this gives a formal whoosh query
            wquery = parser.parse(querystr)

            if show_query:
                print(wquery)
                return
            # perform the actual search
            hits = searcher.search(
                wquery,
                terms=True,
                limit=max_nresults if max_nresults > 0 else None)
            # cheap way to get an approximate number of hits, without an expensive
            # scoring of all items
            # disabled: unreliable estimate, often confusing
            #nhits = hits.estimated_min_length()
            # report query stats
            topstr = '{} top {}'.format(
                max_nresults,
                single_or_plural('match', 'matches', max_nresults))
            if hits.is_empty():
                report = ' No matches.'
            else:
                report = ' Reporting {}.'.format(
                    'up to ' + topstr if max_nresults > 0 else 'all matches')
            lgr.info('Query completed in {} sec.{}'.format(hits.runtime, report))

            if not hits:
                return

            nhits = 0
            for hit in hits:
                res = dict(
                    action='search',
                    status='ok',
                    logger=lgr,
                    refds=ds.path,
                    # normpath to avoid trailing dot
                    path=normpath(opj(ds.path, hit['path'])),
                    query_matched={
                        assure_unicode(k): assure_unicode(v) if isinstance(
                            v, unicode_srctypes) else v
                        for k, v in hit.matched_terms()
                    },
                    metadata={
                        k: v
                        for k, v in hit.fields().items()
                        if k not in ('path', 'parentds')
                    })
                if 'parentds' in hit:
                    res['parentds'] = normpath(opj(ds.path, hit['parentds']))
                yield res
                nhits += 1

            if max_nresults and nhits == max_nresults:
                lgr.info("Reached the limit of {}, there could be more which "
                         "were not reported.".format(topstr))