def search(self):
    c.terms = request.GET.get('terms', '')
    c.results = []
    if len(c.terms) < 4:
        h.flash(
            _('Search queries must be at least 4 characters in length.'),
            'error'
        )
        redirect(url(controller='blog', action='index'))

    query = MultifieldParser(
        ['title', 'content', 'summary'],
        schema=index.schema
    ).parse(c.terms)
    results = index.searcher().search(query, limit=10)

    for result in results:
        terms = [v for k, v in query.all_terms() if k == 'content']
        url_kwargs = json.loads(result['url'])
        result['url'] = url(**url_kwargs)
        # The analyzer comes from the same index schema used to parse the
        # query (the original read `search.schema`, which is the enclosing
        # function, not the index).
        result['highlights'] = highlight(
            result['content'],
            terms,
            index.schema['content'].format.analyzer,
            ContextFragmenter(terms),
            HtmlFormatter(tagname='span', classname='highlight')
        )
        c.results.append(result)
    return render('search.tpl', slacks=True)
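The snippets in this collection all combine the same three objects, an analyzer, a fragmenter, and a formatter, and hand them to `whoosh.highlight.highlight()`. A minimal, self-contained sketch of that pipeline follows; the sample text and terms are made up, and the keyword arguments reflect the Whoosh 2.x signature (older releases, as in the snippet above, passed the term set to the fragmenter instead).

from whoosh.analysis import StandardAnalyzer
from whoosh.highlight import ContextFragmenter, HtmlFormatter, highlight

text = u"The quick brown fox jumps over the lazy dog. Foxes are quick."
terms = frozenset([u"fox", u"quick"])  # already-analyzed term texts

print(highlight(
    text,
    terms,
    StandardAnalyzer(),                           # tokenize the stored text
    ContextFragmenter(maxchars=60, surround=20),  # fragments around each hit
    HtmlFormatter(tagname="b"),                   # wrap matches in <b>...</b>
))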
def find(criteria, reindex=False):
    """
    Search for Azure CLI commands.

    :param str criteria: Query text to search for.
    :param bool reindex: Clear the current index and reindex the command modules.
    :return:
    :rtype: None
    """
    if reindex:
        _create_index()

    ix = _get_index()
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=schema
    )

    if 'OR' in criteria or 'AND' in criteria:
        # Looks more advanced; trust the user to make a great query.
        q = qp.parse(" ".join(criteria))
    else:
        # Help out with some ORs to provide a less restrictive search.
        q = qp.parse(" OR ".join(criteria))

    with ix.searcher() as searcher:
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
def find(cmd, criteria, reindex=False):
    from whoosh.qparser import MultifieldParser

    if reindex:
        _create_index(cmd.cli_ctx)

    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # Got a pickle error because the index was written by a different
        # Python version; recreate the index and proceed.
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)

    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema())

    if 'OR' in criteria or 'AND' in criteria:
        # Looks more advanced; trust the user to make a great query.
        q = qp.parse(" ".join(criteria))
    else:
        # Help out with some ORs to provide a less restrictive search.
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(criteria)
        q = qp.parse(expanded_query)

    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
def make_fragmenter_and_analyzer(type=None, maxchars=None, surround=None):
    type = type or 'context'
    if type == 'context':
        return ContextFragmenter(
            maxchars=maxchars or 300,
            surround=surround or 60,
        ), None
    elif type == 'sentence':
        return SentenceFragmenter(maxchars=maxchars or 300), StandardAnalyzer(
            stoplist=None)
    return None, None
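A hypothetical call site for the factory above, attaching its result to a Whoosh results object the way the other snippets here do; the `results` variable is assumed to come from a `searcher.search()` call.

# Hypothetical usage: request a sentence fragmenter, keep defaults on None.
fragmenter, analyzer = make_fragmenter_and_analyzer('sentence', maxchars=200)
if fragmenter is not None:
    results.fragmenter = fragmenter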
def search():
    start_time = time.time()
    form = request.form
    qstr = request.args.get('q')
    page = int(request.args.get('p', "1"))
    parser = MultifieldParser(['title', 'content'], schema=ix.schema,
                              group=OrGroup)  # termclass=MyFuzzyTerm
    query = parser.parse(qstr)
    notes = []

    with ix.searcher() as searcher:
        corrected = searcher.correct_query(query, qstr)
        results = searcher.search(query)
        rel = results.estimated_length()

        # Only suggest the corrected query if it would return more results.
        if corrected.string.lower() != qstr.lower():
            crs = searcher.search(corrected.query)
            if crs.estimated_length() > rel:
                notes.append("Did you mean: " + corrected.string)

        results = searcher.search_page(query, page, terms=True)
        my_cf = ContextFragmenter(maxchars=20, surround=30, charlimit=256)
        results.order = SCORE
        results.fragmenter = my_cf
        # results.formatter = HtmlFormatter()
        rsp = [{
            "url": item["url"],
            "content": item.highlights("content"),
            "title": item["title"]
        } for item in results]

    if rel == 0:
        notes.append("Sorry, no results for your query")
    else:
        elapsed_time = time.time() - start_time
        notes.append("%d results found in %.2f seconds" % (rel, elapsed_time))

    return render_template("result.html", result=rsp, query=qstr, notes=notes,
                           nextpage=page + 1,
                           urlquery=urllib.quote_plus(qstr))
def __init__(self, whoosh_index_dir='', stopwords_file='',
             cache_host='localhost', cache_port=6379, **kwargs):
    Engine.__init__(self, **kwargs)

    self.whoosh_index_dir = whoosh_index_dir
    if not self.whoosh_index_dir:
        raise EngineConnectionException(
            self.name, "'whoosh_index_dir=' keyword argument not specified")

    self.stopwords_file = stopwords_file
    if self.stopwords_file:
        # Open the stopwords file and read it into a ListReader.
        self.stopwords = ListReader(self.stopwords_file)
    else:
        raise EngineConnectionException(
            self.name, "'stopwords_file=' keyword argument not specified")

    self.scoring_model_identifier = 1
    self.scoring_model = scoring.PL2(c=10.0)

    self.__verbose = False

    try:
        self.doc_index = open_dir(self.whoosh_index_dir)
        self.reader = self.doc_index.reader()

        # By default, we use AND grouping. To change this, use the grouping
        # parameter and specify whoosh.qparser.OrGroup, etc.
        self.parser = QueryParser('content', self.doc_index.schema)

        # Objects required for document snippet generation.
        self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
        self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
        self.formatter = HtmlFormatter()
    except EmptyIndexError:
        message = "Could not open Whoosh index at '{0}'".format(
            self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)
    except OSError:
        message = ("Could not open Whoosh index at '{0}' - directory "
                   "does not exist").format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)

    # Attempt to connect to the specified Redis cache.
    self.cache = RedisConn(host=cache_host, port=cache_port)
    self.cache.connect()
def _process_results(self, raw_page, highlight=False, query_string='',
                     spelling_query=None, result_class=None):
    from haystack import connections

    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = haystack_get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(
                        index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]

            if highlight:
                sa = StemmingAnalyzer()
                formatter = WhooshHtmlFormatter('em')
                terms = [token.text for token in sa(query_string)]

                whoosh_result = whoosh_highlight(
                    additional_fields.get(self.content_field_name),
                    terms, sa, ContextFragmenter(), formatter)
                additional_fields['highlighted'] = {
                    self.content_field_name: [whoosh_result],
                }

            result = result_class(app_label, model_name,
                                  raw_result[DJANGO_ID], score,
                                  **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(
                spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(
                query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
def highlight(self, text, words):
    fragmenter = ContextFragmenter()
    formatter = HtmlFormatter()
    analyzer = self.project_schema['text'].analyzer
    return highlight(text, words, analyzer, fragmenter, formatter, top=1)
def _process_results(self, raw_page, highlight=False, query_string='',
                     spelling_query=None):
    from haystack import site

    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result['django_ct'].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(
                        index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if isinstance(index.fields[string_key], MultiValueField):
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields['django_ct']
            del additional_fields['django_id']

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import (highlight, ContextFragmenter,
                                              UppercaseFormatter)
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '')
                         for term in query_string.split()]

                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(
                        additional_fields.get(self.content_field_name),
                        terms, sa, ContextFragmenter(terms),
                        UppercaseFormatter())],
                }

            result = SearchResult(app_label, model_name,
                                  raw_result['django_id'], score,
                                  **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(
                spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(
                query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
def _process_results(
    self,
    raw_page,
    highlight=False,
    query_string="",
    spelling_query=None,
    result_class=None,
    facet_types=None,
):
    from haystack import connections

    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    facets = {}
    if facet_types:
        facets = {
            "fields": {},
            "dates": {},
            "queries": {},
        }

        for facet_fieldname in raw_page.results.facet_names():
            group = raw_page.results.groups(facet_fieldname)
            facet_type = facet_types[facet_fieldname]

            # Extract the None item for later processing, if present.
            none_item = group.pop(None, None)

            lst = facets[facet_type][facet_fieldname] = sorted(
                group.items(), key=(lambda itm: (-itm[1], itm[0])))

            if none_item is not None:
                # Inject the None item back into the results.
                none_entry = (None, none_item)
                if not lst or lst[-1][1] >= none_item:
                    lst.append(none_entry)
                else:
                    for i, value in enumerate(lst):
                        if value[1] < none_item:
                            lst.insert(i, none_entry)
                            break

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split(".")
        additional_fields = {}
        model = haystack_get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(
                        index.fields[string_key], "convert"):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(",")
                    else:
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]

            if highlight:
                sa = StemmingAnalyzer()
                formatter = WhooshHtmlFormatter("em")
                terms = [token.text for token in sa(query_string)]

                whoosh_result = whoosh_highlight(
                    additional_fields.get(self.content_field_name),
                    terms,
                    sa,
                    ContextFragmenter(),
                    formatter,
                )
                additional_fields["highlighted"] = {
                    self.content_field_name: [whoosh_result]
                }

            result = result_class(app_label, model_name,
                                  raw_result[DJANGO_ID], score,
                                  **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(
                spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(
                query_string)

    return {
        "results": results,
        "hits": hits,
        "facets": facets,
        "spelling_suggestion": spelling_suggestion,
    }
def fragment_tokens(self, text, tokens):
    return ContextFragmenter.fragment_tokens(self, text, wrap_tokens(tokens))
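A method like the one above only makes sense inside a `ContextFragmenter` subclass. A hedged sketch of the enclosing class follows; the class name is hypothetical, and `wrap_tokens` is assumed to be the snippet's own helper that adapts each token before fragmenting.

from whoosh.highlight import ContextFragmenter

class TokenWrappingFragmenter(ContextFragmenter):  # hypothetical name
    def fragment_tokens(self, text, tokens):
        # Delegate to the stock fragmenter after wrapping the token stream.
        return ContextFragmenter.fragment_tokens(self, text,
                                                 wrap_tokens(tokens))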
def __init__(self, whoosh_index_dir='', use_cache=True,
             cache_host='localhost', cache_port=6379, **kwargs):
    """
    Constructor for the engine.
    """
    Engine.__init__(self, **kwargs)

    self.whoosh_index_dir = whoosh_index_dir
    if not self.whoosh_index_dir:
        raise EngineConnectionException(
            self.name, "'whoosh_index_dir=' keyword argument not specified")

    # Only PL2 for now (for more models, add a model parameter to the
    # constructor to specify which to use!).
    self.scoring_model_identifier = 1
    self.scoring_model = scoring.PL2(c=10.0)

    try:
        self.doc_index = open_dir(self.whoosh_index_dir)
        self.reader = self.doc_index.reader()

        # By default, we use AND grouping. To change this, use the grouping
        # parameter and specify whoosh.qparser.OrGroup, etc.
        self.parser = QueryParser('content', self.doc_index.schema)

        # Objects required for document snippet generation.
        self.analyzer = self.doc_index.schema[
            self.parser.fieldname].analyzer
        self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
        self.formatter = HtmlFormatter()
    except EmptyIndexError:
        message = "Could not open Whoosh index at '{0}'".format(
            self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)
    except OSError:
        message = ("Could not open Whoosh index at '{0}' - directory "
                   "does not exist").format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)

    self.use_cache = use_cache
    if self.use_cache:
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()

        # How many additional pages to cache when required.
        self.page_cache_forward_look = 40
        # When the user is x pages from the end of the page cache,
        # cache more pages.
        self.page_cache_when = 4

        self.page_cache_controller = PageCacheController(
            cache_host=self.cache.host,
            cache_port=self.cache.port,
            whoosh_index=self.doc_index,
            scoring_model_identifier=self.scoring_model_identifier,
            parser=self.parser,
            analyzer=self.analyzer,
            fragmenter=self.fragmenter,
            formatter=self.formatter,
            cache_forward_look=self.page_cache_forward_look)
def _process_results(
    self,
    raw_page,
    highlight=False,
    query_string="",
    spelling_query=None,
    result_class=None,
):
    from haystack import connections

    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    facets = {}
    if len(raw_page.results.facet_names()):
        facets = {
            'fields': {},
            'dates': {},
            'queries': {},
        }

        for facet_fieldname in raw_page.results.facet_names():
            # Split up the list and filter out None names so the entries
            # can be sorted under Python 3 without a TypeError.
            facet_items = []
            facet_none = []
            for name, value in raw_page.results.groups(
                    facet_fieldname).items():
                if name is not None:
                    facet_items.append((name, len(value)))
                else:
                    facet_none.append((name, len(value)))
            facet_items.sort(key=operator.itemgetter(1, 0), reverse=True)
            facets['fields'][facet_fieldname] = facet_items + facet_none

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split(".")
        additional_fields = {}
        model = haystack_get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(
                        index.fields[string_key], "convert"):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(",")
                    else:
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]

            if highlight:
                sa = StemmingAnalyzer()
                formatter = WhooshHtmlFormatter("em")
                terms = [token.text for token in sa(query_string)]

                whoosh_result = whoosh_highlight(
                    additional_fields.get(self.content_field_name),
                    terms,
                    sa,
                    ContextFragmenter(),
                    formatter,
                )
                additional_fields["highlighted"] = {
                    self.content_field_name: [whoosh_result]
                }

            result = result_class(app_label, model_name,
                                  raw_result[DJANGO_ID], score,
                                  **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(
                spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(
                query_string)

    return {
        "results": results,
        "hits": hits,
        "facets": facets,
        "spelling_suggestion": spelling_suggestion,
    }
if n > max_limit:
    n = max_limit

ranked_results = sorted(doc_scores.iteritems(),
                        key=operator.itemgetter(1), reverse=True)
print "{0} {1} {2}".format(query_num, len(ranked_results), n)

if n > 0:
    for rank in range(n):
        trec_docid = reader.stored_fields(ranked_results[rank][0])['docid']
        score_formatted = "{0:.6f}".format(ranked_results[rank][1])
        output_file.write("{0} Q0 {1} {2} {3} Exp{4}".format(
            query_num, trec_docid, (rank + 1), score_formatted, os.linesep))

        content = reader.stored_fields(ranked_results[rank][0])['content']

        if isinstance(whoosh_query, str):
            termz = [unicode(whoosh_query)]
        else:
            termz = [text for fieldname, text in whoosh_query.all_terms()
                     if fieldname == 'content']

        from whoosh.highlight import highlight
        analyzer = ix.schema['content'].analyzer
        fragmenter = ContextFragmenter()
        formatter = HtmlFormatter()
        # print highlight(content, termz, analyzer, fragmenter, formatter)

ix.close()
input_file.close()
output_file.close()
def __init__(self, markwiki_home):
    self.index_dir = os.path.join(markwiki_home, 'search')
    self._context_fragmenter = ContextFragmenter(maxchars=300, surround=55)

    # Whoosh convention prefers 'ix' for index.
    self._ix = None
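A hedged sketch of how a fragmenter stored like this is typically applied at query time, once `self._ix` has been opened; the `query` variable, the 'content' field name, and the surrounding method are assumptions, not part of the original snippet.

with self._ix.searcher() as searcher:
    results = searcher.search(query)
    results.fragmenter = self._context_fragmenter
    for hit in results:
        print(hit.highlights('content'))  # uses the fragmenter set above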
def _process_results(self, raw_page, highlight=False, query_string='',
                     spelling_query=None, result_class=None):
    from haystack import connections

    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = apps.get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                # Original condition:
                # if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                # Patched to exclude BooleanField because it's broken:
                # https://github.com/toastdriven/django-haystack/issues/382
                if (string_key in index.fields and callable(
                        getattr(index.fields[string_key], 'convert', None))
                        and not isinstance(index.fields[string_key],
                                           BooleanField)):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import (highlight, ContextFragmenter,
                                              UppercaseFormatter)
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '')
                         for term in query_string.split()]
                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(
                        additional_fields.get(self.content_field_name),
                        terms, sa, ContextFragmenter(terms),
                        UppercaseFormatter())],
                }

            result = result_class(app_label, model_name,
                                  raw_result[DJANGO_ID], score,
                                  **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(
                spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(
                query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
# INDEX SCHEMA DEFINITION
SCHEMA = Schema(
    fileid=ID(unique=True),
    owner=TEXT(),
    repository=TEXT(stored=True),
    path=TEXT(stored=True),
    content=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    modtime=STORED(),
    extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span',
                          between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    parents=TEXT(),
    added=TEXT(),
    removed=TEXT(),
def run_query(query, index):
    """
    Queries the index for data with the given text query.

    @param query The text query to perform on the indexed data
    @return A list of HTML string snippets to return
    """
    # Create a searcher object for this index.
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents.
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema,
                                    fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string.
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" and "title"
    # fields to the spell checker.
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the terms that were found in the query string. This data can
    # be used for highlighting the results.
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short. (Filtering into a new list avoids the
    # original bug of calling remove() while iterating, which skips elements.)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself.
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting.
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts
    # by 'context' in the content.
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which dictates how to highlight the excerpts. In
    # this case, we want to use HTML to highlight the results.
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting them.
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result.
        results.append({
            'content': highlight(search_result['content'], search_terms,
                                 analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker.
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search.
    return results, search_terms, suggestions, result_count
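One caveat on this last snippet: `whoosh.spelling.SpellChecker` is the old pre-2.5 spelling API, which was deprecated and later removed from Whoosh 2.x. A hedged sketch of the modern equivalent, using `Searcher.correct_query` the same way the Flask `search()` example earlier in this collection does:

with index.searcher() as searcher:
    corrected = searcher.correct_query(query_object, query)
    if corrected.query != query_object:
        print("Did you mean: " + corrected.string)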