def load(fname, fixup=True, **kw):
    """Load JSON from a file, possibly fixing it up if the initial load attempt fails

    Parameters
    ----------
    fixup : bool
      If the initial load fails, apply a set of fixups in the hope of
      resolving issues in the JSON
    **kw
      Passed into the load (and loads after fixups) function
    """
    with io.open(fname, 'r', encoding='utf-8') as f:
        try:
            return jsonload(f, **kw)
        except JSONDecodeError as exc:
            if not fixup:
                raise
            lgr.warning(
                "Failed to decode content in %s: %s. Trying a few tricks",
                fname, exc_str(exc))

            # Load entire content and replace common "abusers" which break
            # JSON comprehension but in general are OK
            with io.open(fname, 'r', encoding='utf-8') as f:
                s_orig = s = f.read()

            for o, r in {
                u"\xa0": " ",  # non-breaking space
            }.items():
                s = s.replace(o, r)

            if s == s_orig:
                # we have done nothing, so just reraise previous exception
                raise
            return loads(s, **kw)
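# A minimal usage sketch for the `load` helper above. The file name
# 'metadata.json' is purely hypothetical; `load` relies on io, jsonload,
# loads, lgr, and exc_str being imported at module level elsewhere.
def _example_load_usage():
    # Default behavior: on a JSONDecodeError, character-level fixups
    # (e.g. replacing non-breaking spaces) are applied and the load retried
    data = load('metadata.json')
    # Strict behavior: re-raise the original JSONDecodeError without fixups
    strict = load('metadata.json', fixup=False)
    return data, strict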
def load(fname, fixup=True, compressed=None, **kw):
    """Load JSON from a file, possibly fixing it up if the initial load attempt fails

    Parameters
    ----------
    fixup : bool
      If the initial load fails, apply a set of fixups in the hope of
      resolving issues in the JSON
    compressed : bool or None
      Flag whether to treat the file as XZ compressed. If None, this decision
      is made automatically based on the presence of a '.xz' extension in the
      filename
    **kw
      Passed into the load (and loads after fixups) function
    """
    _open = LZMAFile \
        if compressed or compressed is None and fname.endswith('.xz') \
        else io.open

    with _open(fname, 'rb') as f:
        try:
            jreader = codecs.getreader('utf-8')(f)
            return jsonload(jreader, **kw)
        except JSONDecodeError as exc:
            if not fixup:
                raise
            lgr.warning(
                "Failed to decode content in %s: %s. Trying a few tricks",
                fname, exc_str(exc))

            # Load entire content and replace common "abusers" which break
            # JSON comprehension but in general are OK
            with _open(fname, 'rb') as f:
                s_orig = s = codecs.getreader('utf-8')(f).read()

            for o, r in {
                u"\xa0": " ",  # non-breaking space
            }.items():
                s = s.replace(o, r)

            if s == s_orig:
                # we have done nothing, so just reraise previous exception
                raise
            return loads(s, **kw)
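# A minimal usage sketch for the compression-aware `load` variant above. The
# file names are hypothetical; LZMAFile and codecs are assumed to be imported
# at module level as used by the function.
def _example_compressed_load_usage():
    plain = load('records.json')                    # no '.xz' suffix, io.open is used
    auto = load('records.json.xz')                  # '.xz' suffix, LZMAFile is chosen automatically
    forced = load('records.json', compressed=True)  # force XZ decompression regardless of the name
    return plain, auto, forced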
def __call__(query=None, dataset=None, force_reindex=False, max_nresults=20,
             show_keys=False, show_query=False):
    from whoosh import qparser as qparse

    try:
        ds = require_dataset(
            dataset, check_installed=True, purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        for r in _search_from_virgin_install(dataset, query):
            yield r
        return

    # where does the bunny have the eggs?
    index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad', 'search_index')

    idx_obj = _get_search_index(index_dir, ds, force_reindex)

    if show_keys:
        definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')
        try:
            defs = jsonload(gzopen(definitions_fname))
        except Exception as e:
            lgr.warning(
                'No term definitions found alongside search index: %s',
                exc_str(e))
            defs = {}

        for k in idx_obj.schema.names():
            print('{}{}'.format(
                k,
                ' {}'.format(
                    defs[k]
                    if isinstance(defs[k], dict)
                    else '({})'.format(defs[k]))
                if k in defs else ''))
        return

    if not query:
        return

    with idx_obj.searcher() as searcher:
        # parse the query string, default whoosh parser ATM, could be
        # tailored with plugins
        parser = qparse.MultifieldParser(
            idx_obj.schema.names(),
            idx_obj.schema)
        # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
        # upstream
        parser.add_plugin(qparse.FuzzyTermPlugin())
        parser.add_plugin(qparse.GtLtPlugin())
        # replace field definition to allow for colons to be part of a field's name:
        parser.replace_plugin(
            qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
        # for convenience we accept any number of args-words from the
        # shell and put them together to a single string here
        querystr = ' '.join(assure_list(query))
        # this gives a formal whoosh query
        wquery = parser.parse(querystr)

        if show_query:
            print(wquery)
            return

        # perform the actual search
        hits = searcher.search(
            wquery,
            terms=True,
            limit=max_nresults if max_nresults > 0 else None)
        # cheap way to get an approximate number of hits, without an expensive
        # scoring of all items
        # disabled: unreliable estimate, often confusing
        #nhits = hits.estimated_min_length()
        # report query stats
        topstr = '{} top {}'.format(
            max_nresults,
            single_or_plural('match', 'matches', max_nresults))
        lgr.info('Query completed in {} sec.{}'.format(
            hits.runtime,
            ' Reporting {}.'.format(
                ('up to ' + topstr) if max_nresults > 0 else 'all matches')
            if not hits.is_empty()
            else ' No matches.'))

        if not hits:
            return

        nhits = 0
        for hit in hits:
            res = dict(
                action='search',
                status='ok',
                logger=lgr,
                refds=ds.path,
                # normpath to avoid trailing dot
                path=normpath(opj(ds.path, hit['path'])),
                query_matched={
                    assure_unicode(k):
                    assure_unicode(v) if isinstance(v, unicode_srctypes) else v
                    for k, v in hit.matched_terms()},
                metadata={
                    k: v for k, v in hit.fields().items()
                    if k not in ('path', 'parentds')})
            if 'parentds' in hit:
                res['parentds'] = normpath(opj(ds.path, hit['parentds']))
            yield res
            nhits += 1

        if max_nresults and nhits == max_nresults:
            lgr.info("Reached the limit of {}, there could be more which "
                     "were not reported.".format(topstr))
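# A minimal, standalone sketch of the whoosh query parsing configured in
# __call__ above, using a toy schema instead of a DataLad search index. The
# field names 'name' and 'type' are hypothetical.
def _example_whoosh_query_parsing():
    from whoosh import fields
    from whoosh import qparser as qparse
    schema = fields.Schema(name=fields.TEXT(), type=fields.TEXT())
    parser = qparse.MultifieldParser(['name', 'type'], schema)
    # same plugins as in __call__: fuzzy term matching and greater/less-than operators
    parser.add_plugin(qparse.FuzzyTermPlugin())
    parser.add_plugin(qparse.GtLtPlugin())
    # 'type:dataset bids~' restricts 'dataset' to the 'type' field and fuzzily
    # matches 'bids' across all fields
    return parser.parse('type:dataset bids~')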