def matcher(self, searcher, context=None):
    if self.text == ".*":
        from whoosh.query import Every
        eq = Every(self.fieldname, boost=self.boost)
        return eq.matcher(searcher, context)
    else:
        return PatternQuery.matcher(self, searcher, context)
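# Hedged sketch (not from the original sources, assumes whoosh is installed):
# a Regex query whose pattern is ".*" is answered by an Every matcher, so it
# hits every document with a value in the field, as the rewrite above shows.
from whoosh import fields
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Regex

_schema = fields.Schema(id=fields.ID(stored=True), body=fields.TEXT)
_ix = RamStorage().create_index(_schema)
with _ix.writer() as _w:
    _w.add_document(id=u"1", body=u"alfa")
    _w.add_document(id=u"2", body=u"bravo")
with _ix.searcher() as _s:
    _hits = _s.search(Regex("body", ".*"))
    print(sorted(h["id"] for h in _hits))  # expected: ['1', '2']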
def test_merge_ranges():
    q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([NumericRange("f1", None, u("aaaaa")),
             NumericRange("f1", u("zzzzz"), None)])
    assert_equal(q.normalize(), q)

    q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("f"), u("m")))

    q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("q")))

    q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")])
    assert_equal(q.normalize(), Every("f1"))

    q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None),
            TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Term("f1", u("a")), And([Or([Every("f1")])])])
    assert_equal(q.normalize(), Every("f1"))
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [
            plugins.PhrasePlugin(),
            plugins.GroupPlugin(),
            plugins.OperatorsPlugin(
                And=r"(?<=\s)" + re.escape(gettext("AND")) + r"(?=\s)",
                Or=r"(?<=\s)" + re.escape(gettext("OR")) + r"(?=\s)",
                Not=r"(^|(?<=(\s|[()])))" + re.escape(gettext("NOT")) + r"(?=\s)",
                AndNot=None,
                AndMaybe=None,
                Require=None,
            ),
            plugins.BoostPlugin(),
        ]
        for key, value in keywords.items(multi=True):
            fields = [spec.key for spec in composer.fields.values()
                      if key in spec.scopes]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(
                fields, schema=composer.schema, plugins=text_plugins
            )
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)
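# Hedged sketch (not from the original source): with no keywords the code above
# falls back to Every(), and an And() around a single Every still matches all
# documents once normalized.
from whoosh.query import And, Every
print(And([Every()]).normalize())  # expected: an Every query over all fields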
def search(self, querytext, request, pagenum=1, maxresults=30):
    user_q = querytext and self.parser.parse(querytext) or Every()
    restricted_q = And([user_q, self.restrict_query(request)])
    result = {}
    if pagenum < 1:
        pagenum = 1
    with self.searcher() as searcher:
        hits = searcher.search(restricted_q, limit=(pagenum * maxresults) + 1)
        if querytext and hits.is_empty():
            corrected = searcher.correct_query(user_q, querytext)
            if corrected.query != user_q:
                querytext = corrected.string
                result['corrected_q'] = querytext
                restricted_q = And([corrected.query, self.restrict_query(request)])
                hits = searcher.search(restricted_q, limit=(pagenum * maxresults))
        self.prepare_search_response(result, hits, pagenum, maxresults)
    return result
def run(self, key, value, remove, query):
    if not ((key and value) or (key and remove)) or (key and value and remove):
        fatal("You need to either specify a proper key/value pair or "
              "only a key you want to delete (with -r set).")
    if not remove:
        try:
            value = literal_eval(value)
        except ValueError:
            fatal("You need to specify a valid Python literal as the argument")
    if query:
        qp = app.storage.query_parser([NAME_EXACT, ])
        q = qp.parse(query)
    else:
        q = Every()
    for current_rev in app.storage.search(q, limit=None):
        name = current_rev.meta[NAME]
        newmeta = dict(current_rev.meta)
        if remove:
            newmeta.pop(key)
            print "Processing {0!r}, removing {1}.".format(name, key)
        else:
            newmeta[key] = value
            print "Processing {0!r}, setting {1}={2!r}.".format(name, key, value)
        current_rev.item.store_revision(newmeta, current_rev.data)
def create_urls(url_folder, apps):
    ix = open_dir("whoosh_file")
    result = list(ix.searcher().search(Every('table_name')))
    f = open(url_folder + "/urls.py", "w")
    f.write(
        "from django.contrib import admin\nfrom django.urls import path\nfrom "
        + apps + ".views import *\n\nurlpatterns=[\npath('admin/', admin.site.urls),\n")
    for i in range(len(result)):
        d = ast.literal_eval(str(result[i])[5:][:-1])
        if d['view_type'] == 'base':
            b = "ListView"
            c = "/"
        if d['view_type'] == 'list':
            b = "DetailView"
            c = "/<int:pk>/"
        if d['name'] == 'home':
            f.write("path('',HomeView.as_view(),name=\"" + d['name'] + "\"),\n")
        if d['name'] != 'home':
            f.write("path('" + d['name'] + c + "'," + d['table_name'] + d['name'] + b
                    + ".as_view(),name=\"" + d['name'] + "\"),\n")
    f.write("]")
    f.close()
def dump(self):
    ix = self.storage.open_index()
    print(ix.schema)
    from whoosh.query import Every
    with ix.searcher() as searcher:
        # Use the searcher opened by the context manager rather than opening
        # a second one that would never be closed.
        for result in searcher.search(Every('object_id')):
            print(result)
def run(self, query):
    before_wiki()
    if query:
        qp = app.storage.query_parser([NAME_EXACT, ])
        q = qp.parse(query)
    else:
        q = Every()
    for current_rev in app.storage.search(q, limit=None):
        current_name = current_rev.meta[NAME]
        current_revid = current_rev.meta[REVID]
        print "Destroying historical revisions of {0!r}:".format(current_name)
        has_historical_revision = False
        for rev in current_rev.item.iter_revs():
            revid = rev.meta[REVID]
            if revid == current_revid:
                continue
            has_historical_revision = True
            name = rev.meta[NAME]
            if name == current_name:
                print " Destroying revision {0}".format(revid)
            else:
                print " Destroying revision {0} (named {1!r})".format(revid, name)
            current_rev.item.destroy_revision(revid)
        if not has_historical_revision:
            print " (no historical revisions)"
    print "Finished reducing backend."
def search(querytext, request, pagenum=1, maxresults=30, staff=False,
           scope=None, orderby='-creation_date'):
    search_engine = get_search_engine('resource')
    search_result = {}
    if pagenum < 1:
        pagenum = 1
    with search_engine.searcher() as searcher:
        parser = MultifieldParser(search_engine.default_search_fields, searcher.schema)
        user_q = querytext and parser.parse(querytext) or Every()
        user_q, search_kwargs = build_search_kwargs(user_q, request, scope, staff, orderby)
        hits = searcher.search(user_q, limit=(pagenum * maxresults) + 1, **search_kwargs)
        if querytext and hits.is_empty():
            correction_q = parser.parse(querytext)
            corrected = searcher.correct_query(correction_q, querytext)
            if corrected.query != correction_q:
                querytext = corrected.string
                search_result['corrected_q'] = querytext
                user_q, search_kwargs = build_search_kwargs(corrected.query, request,
                                                            scope, staff, orderby)
                hits = searcher.search(user_q, limit=(pagenum * maxresults), **search_kwargs)
        search_engine.prepare_search_response(search_result, hits, pagenum, maxresults)
        search_result['results'] = add_other_versions(searcher, search_result['results'],
                                                      request.user, staff)
        add_absolute_urls(search_result['results'], request)
    return search_result
def search_snippet():
    query = request.args.get('query', None)
    page = int(request.args.get('page', 1))
    if query:
        qp = MultifieldParser(["title", "content", "tag"], schema=index.schema)
        q = qp.parse(query)
    else:
        q = Every()
    response = {"results": [], "total": 0}
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=config.SEARCH_PAGINATION,
                                       sortedby="title")
        for snippet in results:
            response["results"].append({
                'id': snippet['id'],
                'title': snippet['title']
            })
        response["total_snippets"] = len(results)
        # Integer division, so the page count is not returned as a float.
        response["total_pages"] = (len(results) - 1) // config.SEARCH_PAGINATION + 1
        response["current_page"] = page
        return json.dumps(response)
    return json.dumps(response)
def build_filter_query(filters=None):
    """
    Build groupedby and filter queries based on facet specs.

    :param list filters: A list of (name, values) tuples, where values is
        itself a list.

    :return: A tuple with the Facets to perform grouping on, and the terms to
        filter on.
    """
    composer = current_app.config['KERKO_COMPOSER']
    groupedby = Facets()
    for spec in composer.facets.values():
        groupedby.add_field(spec.key, allow_overlap=spec.allow_overlap)
    terms = []
    if filters:
        for filter_key, filter_values in filters:
            spec = composer.get_facet_by_filter_key(filter_key)
            if spec:  # Ensure only valid filters.
                for v in filter_values:
                    if v == '':
                        # If trying to filter with a missing value, exclude
                        # all results with a value in the facet field.
                        terms.append(Not(Every(spec.key)))
                    else:
                        v = spec.codec.transform_for_query(v)
                        terms.append(spec.query_class(spec.key, v))
    return groupedby, And(terms)
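# Hedged sketch (not from the original source; the field names are made up):
# Not(Every(field)) is the "missing value" filter used above, matching only
# documents that have no term in that facet field.
from whoosh.query import And, Every, Not, Term
missing_value = Not(Every("facet_tag"))
print(And([Term("text", "whoosh"), missing_value]))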
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [PhrasePlugin(), GroupPlugin(), OperatorsPlugin()]
        for key, value in keywords.items(multi=True):
            fields = [
                spec.key for spec in composer.fields.values() if key in spec.scopes
            ]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(fields, schema=composer.schema,
                                      plugins=text_plugins)
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)
def search_whoosh_index(query, offset=0, limit=10, *args, **kwargs):
    ix = get_whoosh_index()
    parser = MultifieldParser(
        ['content', 'authors', 'tags', 'title', 'abstract'], ix.schema)
    # user query
    q = parser.parse(query)
    if not query:
        q = Every()
        print 'arch'
    # parse remaining args
    allow_q = And([Term(key, value) for key, value in kwargs.iteritems()])
    res = []
    count = 0
    offset = int(offset)
    limit = int(limit)
    right = offset + limit
    # restrict_q = Or([Term("path", u'%s' % d.id) for d in qs])
    # print 'query', q, allow_q, kwargs
    with ix.searcher() as searcher:
        # From WHOOSH documentation:
        # > Currently, searching for page 100 with pagelen of 10 takes the same
        #   amount of time as using Searcher.search() to find the first 1000 results
        results = searcher.search(q, filter=allow_q, limit=right, terms=True)
        count = len(results)
        for hit in list(results)[offset:]:
            res.append({
                # 'title': hit['title'],
                'short_url': hit['path'],
                'highlights': hit.highlights("content", top=5)
            })
            # @todo filter by empty highlight strings
    return {'results': res, 'count': count}
def build_search_facet_results(searcher, groups, criteria, facet_specs):
    """
    Prepare facet results for the search page.
    """
    facets = {}
    if groups:
        # Build facet results from groupings obtained with the search.
        for spec in facet_specs:
            facets[spec.key] = spec.build(groups(spec.key).items(), criteria)
    elif criteria.has_filter_search():
        # No groupings available even though facets are used. This usually means
        # that the search itself had zero results, thus no facet results either.
        # But building facet results is still desirable in order to display the
        # active filters in the search interface. To get those, we perform a
        # separate query for each active filter, but this time ignoring any
        # other search criteria.
        for filter_key in criteria.filters.keys():
            for spec in facet_specs:
                if filter_key == spec.filter_key:
                    results = searcher.search(
                        Every(),
                        filter=build_filter_query(
                            [tuple([spec.key, criteria.filters.getlist(spec.key)])]
                        ),
                        groupedby=build_groupedby_query([spec]),
                        maptype=Count,  # Not to be used, as other criteria are ignored.
                        limit=1,  # Don't care about the documents.
                    )
                    facets[spec.key] = spec.build(
                        results.groups(spec.key).items(), criteria, active_only=True
                    )
    return facets
def run_query_all(return_fields=None):
    """Perform a search query to return all items (without faceting)."""
    with open_index().searcher() as searcher:
        results = searcher.search(Every(), limit=None)
        if results:
            for hit in results:
                yield _get_fields(hit, return_fields)
    return []
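# Hedged usage sketch (assumed call site, not from the original source; the
# field names passed in return_fields are illustrative only).
for item in run_query_all(return_fields=['id', 'data']):
    print(item)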
def print_paragraph_headings_for_all_docs(self):
    """
    DEBUGGING: Prints the paragraph headings of every document in the index.
    """
    self.ix = index.open_dir(self.index_path)
    with self.ix.searcher() as s:
        for hit in s.search(Every("parent_identifier"), limit=None):
            print("Title: " + hit["title"])
            for child in s.documents(parent=hit["parent_identifier"]):
                print("\t" + str(child["paragraph_heading"]))
def get_all_x_values(
    self, filter_splits: Tuple[DataSplits, ...] = None
) -> Generator[XValue, None, None]:
    """Yields all examples in the index"""
    if filter_splits is None or len(filter_splits) == 0:
        query = Every()
    else:
        query = Or([Term("split", str(split.value)) for split in filter_splits])
    yield from (self._dict_to_example(hit.doc)
                for hit in self.backend.query(query, max_results=None, score=False))
def print_index(index):
    with index.searcher() as searcher:
        results = searcher.search(Every(), limit=None)
        doc_ids = [r["doc_id"] for r in results]
        doc_ids.sort()
        print("Index for field 'content':")
        for word in searcher.lexicon("content"):
            print(word.decode("utf-8") + ": ", end="")
            for doc in searcher.postings("content", word).all_ids():
                print(doc_ids[doc], end=" ")
            print()
def normalize(self):
    if self.start in ('', None) and self.end in (u('\uffff'), None):
        from whoosh.query import Every
        return Every(self.fieldname, boost=self.boost)
    elif self.start == self.end:
        if self.startexcl or self.endexcl:
            return qcore.NullQuery
        return terms.Term(self.fieldname, self.start, boost=self.boost)
    else:
        return TermRange(self.fieldname, self.start, self.end,
                         self.startexcl, self.endexcl, boost=self.boost)
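# Hedged sketch (not from the original source): a fully open TermRange collapses
# to Every under the normalize() method above.
from whoosh.query import TermRange
print(TermRange("f1", None, None).normalize())  # expected: Every("f1")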
def run_query_all(return_fields=None, default_terms=None):
    """Perform a search query to return all items (without faceting)."""
    with open_index('index').searcher() as searcher:
        results = searcher.search(
            Every(),
            limit=None,
            filter=And(default_terms) if default_terms else None,
        )
        if results:
            for hit in results:
                yield _get_fields(hit, return_fields)
    return []
def search(querytext, request, pagenum=1, maxresults=30, staff=False,
           scope=None, orderby='-creation_date'):
    search_engine = get_search_engine('resource')
    search_result = {}
    if pagenum < 1:
        pagenum = 1
    with search_engine.searcher() as searcher:
        fieldnames = ['description', 'vendor', 'title', 'wiring']
        query_p = QueryParser('content', searcher.schema)
        multif_p = MultifieldParser(fieldnames, searcher.schema)
        user_q = querytext and query_p.parse(querytext) or Every()
        user_q, search_kwargs = build_search_kwargs(user_q, request, scope, staff, orderby)
        hits = searcher.search(user_q, limit=(pagenum * maxresults) + 1, **search_kwargs)
        if querytext and hits.is_empty():
            patch_expand_prefix(searcher)
            correction_q = multif_p.parse(querytext)
            corrected = searcher.correct_query(correction_q, querytext)
            if corrected.query != correction_q:
                querytext = corrected.string
                search_result['corrected_q'] = querytext
                user_q = query_p.parse(querytext)
                user_q, search_kwargs = build_search_kwargs(
                    user_q, request, scope, staff, orderby)
                hits = searcher.search(user_q, limit=(pagenum * maxresults), **search_kwargs)
        search_engine.prepare_search_response(search_result, hits, pagenum, maxresults)
        search_result['results'] = add_other_versions(searcher, search_result['results'],
                                                      request.user, staff)
        add_absolute_urls(search_result['results'], request)
    return search_result
def test_missing_wildcard():
    schema = fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT, f2=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=u("1"), f1=u("alfa"), f2=u("apple"))
    w.add_document(id=u("2"), f1=u("bravo"))
    w.add_document(id=u("3"), f1=u("charlie"), f2=u("candy"))
    w.add_document(id=u("4"), f2=u("donut"))
    w.add_document(id=u("5"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(Every("id"))
        assert_equal(sorted([d['id'] for d in r]), ["1", "2", "3", "4", "5"])

        r = s.search(Every("f1"))
        assert_equal(sorted([d['id'] for d in r]), ["1", "2", "3"])

        r = s.search(Every("f2"))
        assert_equal(sorted([d['id'] for d in r]), ["1", "3", "4"])
def search():
    print(request.args)
    search = request.args.get('search')
    author = request.args.get('author')
    category = request.args.get('category')
    page = int(request.args.get('page')) if request.args.get('page') is not None else 1
    print(search)
    if search is None and author is None and category is None:
        myquery = Every()
    else:
        if search is None:
            if author is not None:
                myquery = Term('author', author)
                if category is not None:
                    myquery = myquery & Term('category', category)
            else:
                myquery = Term('category', category)
        else:
            myquery = MultifieldParser(["title", "post_content"], ix.schema,
                                       plugins=[FuzzyTermPlugin()]).parse(search)
            if author is not None:
                myquery = myquery & Term('author', author)
            if category is not None:
                myquery = myquery & Term('category', category)
    with ix.searcher() as searcher:
        results = searcher.search_page(myquery, page, pagelen=25,
                                       sortedby="date", reverse=True)
        print(results.is_last_page())
        results_json = json.dumps(
            {
                "results": [dict(i) for i in results],
                "page": page,
                "total_results": results.total
            },
            default=str)
        resp = Response(response=results_json, status=200, mimetype="application/json")
    return resp
def get(self):
    try:
        ix = open_dir("index")
        with ix.searcher() as searcher:
            results = searcher.search(Every(), limit=None)
            self.write(
                tornado.escape.json_encode([{
                    'url': r.get('url'),
                    'hash': r.get('hash', 'blank')
                } for r in results]))
            self.set_header('Content-Type', 'application/json')
    except:
        self.write(tornado.escape.json_encode([{'status': 'ERROR'}]))
        self.set_header('Content-Type', 'application/json')
def index_info(self) -> str:
    """
    Builds a string containing information about the index: the number of
    documents and the average number of paragraphs per document.

    :return: index information as a string
    """
    self.ix = index.open_dir(self.index_path)
    doc_count = 0
    searcher = self.ix.searcher()
    for hit in searcher.search(Every("parent_identifier"), limit=None):
        doc_count += 1
    searcher.close()
    out_string = "The Index contains {0} Documents\n".format(doc_count)
    paragraph_count = 0
    searcher = self.ix.searcher()
    for hit in searcher.search(Every("parent"), limit=None):
        paragraph_count += 1
    searcher.close()
    out_string += ("On Average every Document contains {0} Paragraphs".format(
        round(paragraph_count / doc_count, 2)))
    return out_string
def dump(self, options):
    for name in self.get_requested_index_names():
        print 'Schemas - %s' % name
        dir_abs = os.path.join(settings.SEARCH_INDEX_PATH, name)
        print '\t%s' % dir_abs
        from whoosh.index import open_dir
        from whoosh.query import Every
        index = open_dir(dir_abs)
        q = Every()
        with index.searcher() as searcher:
            hits = searcher.search(q, limit=None)
            for hit in hits:
                print ('\t%s' % repr(hit)).encode('ascii', 'ignore')
def test_produce_content(self):
    technique = SearchIndexing()
    index = IndexInterface('./search-index')
    index.init_writing()
    index.new_content()
    technique.produce_content('Plot', '0', 'this is a test for the search index', index)
    index.serialize_content()
    index.stop_writing()
    ix = open_dir('./search-index')
    with ix.searcher() as searcher:
        query = Every()
        results = searcher.search(query)
        self.assertEqual(results[0]['Plot0'], 'this is a test for the search index')
    index.delete_index()
def run(self, query):
    before_wiki()
    if query:
        q = And([Term(WIKINAME, app.cfg.interwikiname), Regex(NAME_EXACT, query)])
    else:
        q = Every()
    for current_rev in app.storage.search(q, limit=None):
        current_name = current_rev.meta[NAME]
        current_revid = current_rev.meta[REVID]
        print "Destroying historical revisions of {0!r}:".format(current_name)
        has_historical_revision = False
        for rev in current_rev.item.iter_revs():
            revid = rev.meta[REVID]
            if revid == current_revid:
                # fixup metadata and overwrite existing revision; modified time
                # will be updated if changed
                changed = False
                meta = dict(rev.meta)
                if REV_NUMBER in meta and meta[REV_NUMBER] > 1 or REV_NUMBER not in meta:
                    changed = True
                    meta[REV_NUMBER] = 1
                if PARENTID in meta:
                    changed = True
                    del meta[PARENTID]
                if changed:
                    current_rev.item.store_revision(meta, current_rev.data, overwrite=True)
                    print " (current rev meta data updated)"
                continue
            has_historical_revision = True
            name = rev.meta[NAME]
            if name == current_name:
                print " Destroying revision {0}".format(revid)
            else:
                print " Destroying revision {0} (named {1!r})".format(revid, name)
            current_rev.item.destroy_revision(revid)
        if not has_historical_revision:
            print " (no historical revisions)"
    print "Finished reducing backend."
def whoosh_search(self, query, searcher, index, info):
    ret = ''
    # run a whoosh search and display the hits
    # query applies to all fields in the schema
    # special query: ALL, ANY
    limit = int(self.options['limit'] or '1000000')
    if query in ['ALL', 'ANY']:
        from whoosh.query.qcore import Every
        q = Every()
    else:
        from whoosh.qparser import MultifieldParser, GtLtPlugin
        # TODO: only active columns
        term_fields = [item[0] for item in index.schema.items()]
        parser = MultifieldParser(term_fields, index.schema)
        parser.add_plugin(GtLtPlugin)
        q = parser.parse(u'%s' % query)
    if query in ['ANY']:
        limit = 1
    afield = self.options['field']
    res = searcher.search(q, limit=limit)
    vs = {}
    for hit in res:
        if afield:
            # display only the unique value in the requested field
            vs[hit[afield]] = vs.get(hit[afield], 0) + 1
        else:
            # display all field, value in this record
            for k, v in hit.iteritems():
                ret += '\t%-20s %s\n' % (k, repr(v)[0:30])
            ret += '\t' + ('-' * 20) + '\n'
    if vs:
        for v, c in vs.iteritems():
            ret += '\t%6s x %s\n' % (c, repr(v))
    info['results'] = ret
    info['result_size'] = len(res)
    ret += '\n\n%s documents found' % len(res)
    return ret
def normalize(self):
    # If there are no wildcard characters in this "wildcard", turn it into
    # a simple Term
    text = self.text
    if text == "*":
        from whoosh.query import Every
        return Every(self.fieldname, boost=self.boost)
    if "*" not in text and "?" not in text:
        # If no wildcard chars, convert to a normal term.
        return Term(self.fieldname, self.text, boost=self.boost)
    elif ("?" not in text and text.endswith("*")
          and text.find("*") == len(text) - 1):
        # If the only wildcard char is an asterisk at the end, convert to a
        # Prefix query.
        return Prefix(self.fieldname, self.text[:-1], boost=self.boost)
    else:
        return self
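# Hedged sketch (not from the original source): the three normalization outcomes
# implemented above, shown on concrete Wildcard queries.
from whoosh.query import Wildcard
print(Wildcard("f1", "*").normalize())     # expected: Every("f1")
print(Wildcard("f1", "alfa").normalize())  # expected: Term("f1", "alfa")
print(Wildcard("f1", "alf*").normalize())  # expected: Prefix("f1", "alf")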