Example #1
    def search(self):
        c.terms = request.GET.get('terms', '')
        c.results = []
        if len(c.terms) < 4:
            h.flash(
                _('Search queries must be at least 4 characters in length.'),
                'error'
            )
            redirect(url(controller='blog', action='index'))

        query = MultifieldParser(
            ['title', 'content', 'summary'],
            schema=index.schema
        ).parse(c.terms)
        results = index.searcher().search(query, limit=10)
        for result in results:
            terms = [v for k, v in query.all_terms() if k == 'content']
            url_kwargs = json.loads(result['url'])
            result['url'] = url(**url_kwargs)
            result['highlights'] = highlight(
                result['content'],
                terms,
                index.schema['content'].format.analyzer,
                ContextFragmenter(terms),
                HtmlFormatter(tagname='span', classname='highlight')
            )
            c.results.append(result)
        return render('search.tpl', slacks=True)
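Example #1 uses the older Whoosh 1.x conventions (the standalone highlight() helper, field.format.analyzer, and a ContextFragmenter constructed from a term set). For comparison, below is a minimal, self-contained sketch of the same highlighting flow with the Whoosh 2.x Results-based API; the schema, document, and query terms are invented for illustration.

import tempfile

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import MultifieldParser, OrGroup
from whoosh.highlight import ContextFragmenter, HtmlFormatter

# Fields must be stored=True for hit.highlights() to reach their text.
schema = Schema(url=ID(stored=True),
                title=TEXT(stored=True),
                content=TEXT(stored=True))

ix = create_in(tempfile.mkdtemp(), schema)
writer = ix.writer()
writer.add_document(url=u"/a", title=u"Whoosh highlighting",
                    content=u"ContextFragmenter shows matched terms in context.")
writer.commit()

parser = MultifieldParser(["title", "content"], schema=ix.schema, group=OrGroup)
query = parser.parse(u"context terms")

with ix.searcher() as searcher:
    results = searcher.search(query, limit=10)
    # In Whoosh 2.x the fragmenter and formatter are attached to the
    # Results object instead of being passed to a highlight() call.
    results.fragmenter = ContextFragmenter(maxchars=200, surround=40)
    results.formatter = HtmlFormatter(tagname="span", classname="highlight")
    for hit in results:
        print(hit["title"], hit.highlights("content"))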
Example #2
def find(criteria, reindex=False):
    """
    Search for Azure CLI commands
    :param str criteria: Query text to search for.
    :param bool reindex: Clear the current index and reindex the command modules.
    :return:
    :rtype: None
    """
    if reindex:
        _create_index()

    ix = _get_index()
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=schema
    )

    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        q = qp.parse(" OR ".join(criteria))

    with ix.searcher() as searcher:
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
Example #3
def find(cmd, criteria, reindex=False):
    from whoosh.qparser import MultifieldParser
    if reindex:
        _create_index(cmd.cli_ctx)

    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # got a pickle error because the index was written by a different python version
        # recreate the index and proceed
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)

    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema())

    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(' '.join(criteria))
        q = qp.parse(expanded_query)

    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
Example #4
def make_fragmenter_and_analyzer(type=None, maxchars=None, surround=None):
    type = type or 'context'
    if type == 'context':
        return ContextFragmenter(
            maxchars=maxchars or 300,
            surround=surround or 60,
        ), None
    elif type == 'sentence':
        return SentenceFragmenter(maxchars=maxchars or 300), StandardAnalyzer(
            stoplist=None)
    return None, None
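A hedged usage sketch for the factory above: request the sentence fragmenter and pass the returned pair straight into whoosh.highlight.highlight(). The sample text and search term are invented, and make_fragmenter_and_analyzer (with its SentenceFragmenter and StandardAnalyzer imports) is assumed to be in scope from Example #4.

from whoosh.analysis import StandardAnalyzer
from whoosh.highlight import HtmlFormatter, highlight

fragmenter, analyzer = make_fragmenter_and_analyzer('sentence', maxchars=200)
# The 'context' branch returns analyzer=None, so fall back to a default.
analyzer = analyzer or StandardAnalyzer()
print(highlight(u"Whoosh splits text into fragments. Each fragment is scored.",
                frozenset([u"fragments"]), analyzer, fragmenter,
                HtmlFormatter()))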
Example #5
def search():
    start_time = time.time()
    form = request.form
    qstr = request.args.get('q')
    page = int(request.args.get('p', "1"))
    parser = MultifieldParser(['title', 'content'],
                              schema=ix.schema,
                              group=OrGroup)
    #termclass=MyFuzzyTerm)
    query = parser.parse(qstr)
    notes = []
    with ix.searcher() as searcher:
        corrected = searcher.correct_query(query, qstr)
        results = searcher.search(query)
        rel = results.estimated_length()
        if corrected.string.lower() != qstr.lower():
            crs = searcher.search(corrected.query)
            if crs.estimated_length() > rel:
                notes.append("Did you mean: " + corrected.string)
        results = searcher.search_page(query, page, terms=True)
        my_cf = ContextFragmenter(maxchars=20, surround=30, charlimit=256)
        results.order = SCORE
        results.fragmenter = my_cf
        # results.formatter = HtmlFormatter()
        rsp = [{
            "url": item["url"],
            "content": item.highlights("content"),
            "title": item["title"]
        } for item in results]
    # return json.dumps(rsp)
    # print(json.dumps(rsp))
    if rel == 0:
        notes.append("Sorry, no result for your query")
    else:
        elapsed_time = time.time() - start_time
        notes.append("%d results found in %.2f seconds" % (rel, elapsed_time))
    return render_template("result.html",
                           result=rsp,
                           query=qstr,
                           notes=notes,
                           nextpage=page + 1,
                           urlquery=urllib.quote_plus(qstr))
Example #6
    def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        self.stopwords_file = stopwords_file
        if self.stopwords_file:
            self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
        else:
            raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)
        
        self.__verbose = False

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            # By default, QueryParser uses AND grouping; pass the group
            # parameter (e.g. whoosh.qparser.OrGroup) to change this.
            self.parser = QueryParser('content', self.doc_index.schema)

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        # Attempt to connect to the specified Redis cache.
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()
Example #7
    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name), terms,
                        sa, ContextFragmenter(), formatter)
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Example #8
 def highlight(self, text, words):
     fragmenter = ContextFragmenter()
     formatter = HtmlFormatter()
     analyzer = self.project_schema['text'].analyzer
     return highlight(text, words, analyzer, fragmenter, formatter, top=1)
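Example #8 is a thin wrapper around whoosh.highlight.highlight(); called standalone, the same top=1 call looks like this (a minimal sketch with invented text and terms, assuming Whoosh 2.x):

from whoosh.analysis import StandardAnalyzer
from whoosh.highlight import ContextFragmenter, HtmlFormatter, highlight

text = u"The quick brown fox jumps over the lazy dog."
terms = frozenset([u"fox", u"dog"])
# top=1 keeps only the single best-scoring fragment, as in Example #8.
print(highlight(text, terms, StandardAnalyzer(), ContextFragmenter(),
                HtmlFormatter(), top=1))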
Example #9
 def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
     from haystack import site
     results = []
     
     # It's important to grab the hits first before slicing. Otherwise, this
     # can cause pagination failures.
     hits = len(raw_page)
     
     facets = {}
     spelling_suggestion = None
     indexed_models = site.get_indexed_models()
     
     for doc_offset, raw_result in enumerate(raw_page):
         score = raw_page.score(doc_offset) or 0
         app_label, model_name = raw_result['django_ct'].split('.')
         additional_fields = {}
         model = get_model(app_label, model_name)
         
         if model and model in indexed_models:
             for key, value in raw_result.items():
                 index = site.get_index(model)
                 string_key = str(key)
                 
                 if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                     # Special-cased due to the nature of KEYWORD fields.
                     if isinstance(index.fields[string_key], MultiValueField):
                         if value is None or len(value) == 0:
                             additional_fields[string_key] = []
                         else:
                             additional_fields[string_key] = value.split(',')
                     else:
                         additional_fields[string_key] = index.fields[string_key].convert(value)
                 else:
                     additional_fields[string_key] = self._to_python(value)
             
             del additional_fields['django_ct']
             del additional_fields['django_id']
             
             if highlight:
                 from whoosh import analysis
                 from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                 sa = analysis.StemmingAnalyzer()
                 terms = [term.replace('*', '') for term in query_string.split()]
                 
                 additional_fields['highlighted'] = {
                     self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                 }
             
             result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
             results.append(result)
         else:
             hits -= 1
     
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
         if spelling_query:
             spelling_suggestion = self.create_spelling_suggestion(spelling_query)
         else:
             spelling_suggestion = self.create_spelling_suggestion(query_string)
     
     return {
         'results': results,
         'hits': hits,
         'facets': facets,
         'spelling_suggestion': spelling_suggestion,
     }
Example #10
    def _process_results(
        self,
        raw_page,
        highlight=False,
        query_string="",
        spelling_query=None,
        result_class=None,
        facet_types=None,
    ):
        from haystack import connections

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        facets = {}

        if facet_types:
            facets = {
                "fields": {},
                "dates": {},
                "queries": {},
            }
            for facet_fieldname in raw_page.results.facet_names():
                group = raw_page.results.groups(facet_fieldname)
                facet_type = facet_types[facet_fieldname]

                # Extract None item for later processing, if present.
                none_item = group.pop(None, None)

                lst = facets[facet_type][facet_fieldname] = sorted(
                    group.items(), key=(lambda itm: (-itm[1], itm[0])))

                if none_item is not None:
                    # Inject None item back into the results.
                    none_entry = (None, none_item)
                    if not lst or lst[-1][1] >= none_item:
                        lst.append(none_entry)
                    else:
                        for i, value in enumerate(lst):
                            if value[1] < none_item:
                                lst.insert(i, none_entry)
                                break

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], "convert"):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ",")
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter("em")
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter,
                    )
                    additional_fields["highlighted"] = {
                        self.content_field_name: [whoosh_result]
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            "results": results,
            "hits": hits,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
        }
Example #11
 def fragment_tokens(self, text, tokens):
     return ContextFragmenter.fragment_tokens(self, text, wrap_tokens(tokens))
Example #12
    def __init__(self,
                 whoosh_index_dir='',
                 use_cache=True,
                 cache_host='localhost',
                 cache_port=6379,
                 **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(
                self.name,
                "'whoosh_index_dir=' keyword argument not specified")

        #  Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            # By default, QueryParser uses AND grouping; pass the group
            # parameter (e.g. whoosh.qparser.OrGroup) to change this.
            self.parser = QueryParser('content', self.doc_index.schema)

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[
                self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look)
Example #13
    def _process_results(
        self,
        raw_page,
        highlight=False,
        query_string="",
        spelling_query=None,
        result_class=None,
    ):
        from haystack import connections

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()
        
        facets = {}

        if len(raw_page.results.facet_names()):
            facets = {
                'fields': {},
                'dates': {},
                'queries': {},
            }
            for facet_fieldname in raw_page.results.facet_names():
                # split up the list and filter out None-names so we can
                # sort them in python3 without getting a type error
                facet_items = []
                facet_none = []
                for name, value in raw_page.results.groups(facet_fieldname).items():
                    if name is not None:
                        facet_items.append((name, len(value)))
                    else:
                        facet_none.append((name, len(value)))
                facet_items.sort(key=operator.itemgetter(1, 0), reverse=True)
                facets['fields'][facet_fieldname] = facet_items + facet_none

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                        index.fields[string_key], "convert"
                    ):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(",")
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key
                            ].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter("em")
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter,
                    )
                    additional_fields["highlighted"] = {
                        self.content_field_name: [whoosh_result]
                    }

                result = result_class(
                    app_label,
                    model_name,
                    raw_result[DJANGO_ID],
                    score,
                    **additional_fields
                )
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            "results": results,
            "hits": hits,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
        }
Example #14
		if n > max_limit:
			n = max_limit
		
		ranked_results = sorted(doc_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
		
		print "{0}  {1}  {2}".format(query_num, len(ranked_results), n)
		
		if n > 0:
			for rank in range(n):
				trec_docid = reader.stored_fields(ranked_results[rank][0])['docid']
				score_formatted = "{0:.6f}".format(ranked_results[rank][1])
				output_file.write("{0} Q0 {1} {2} {3} Exp{4}".format(query_num, trec_docid, (rank + 1), score_formatted, os.linesep))
				
				content = reader.stored_fields(ranked_results[rank][0])['content']
				
				if isinstance(whoosh_query, str):
					termz = [unicode(whoosh_query)]
				else:
					termz = [text for fieldname, text in whoosh_query.all_terms() if fieldname == 'content']
				
				from whoosh.highlight import highlight
				
				analyzer = ix.schema['content'].analyzer
				fragmenter = ContextFragmenter()
				formatter = HtmlFormatter()
				
				#print highlight(content, termz, analyzer, fragmenter, formatter)

ix.close()
input_file.close()
output_file.close()
Example #15
 def __init__(self, markwiki_home):
     self.index_dir = os.path.join(markwiki_home, 'search')
     self._context_fragmenter = ContextFragmenter(maxchars=300, surround=55)
     # Whoosh convention prefers 'ix' for index.
     self._ix = None
Example #16
    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = apps.get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    # Was: if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Patched to exclude BooleanField because it's broken:
                    # https://github.com/toastdriven/django-haystack/issues/382
                    if (string_key in index.fields and callable(
                            getattr(index.fields[string_key], 'convert', None))
                            and not isinstance(index.fields[string_key],
                                               BooleanField)):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [
                        term.replace('*', '') for term in query_string.split()
                    ]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms, sa, ContextFragmenter(terms),
                                UppercaseFormatter())
                        ],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Example #17
#INDEX SCHEMA DEFINITION
SCHEMA = Schema(fileid=ID(unique=True),
                owner=TEXT(),
                repository=TEXT(stored=True),
                path=TEXT(stored=True),
                content=FieldType(format=Characters(),
                                  analyzer=ANALYZER,
                                  scorable=True,
                                  stored=True),
                modtime=STORED(),
                extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    parents=TEXT(),
    added=TEXT(),
    removed=TEXT(),
Example #18
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return         A list of HTML snippet strings
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'],
                                    schema=index_schema,
                                    fieldboosts=field_boosts,
                                    group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short. Build a new list instead of calling
    # remove() while iterating, which would skip elements.
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content':
            highlight(search_result['content'], search_terms, analyzer,
                      fragmenter, formatter),
            'url':
            search_result['url'],
            'title':
            search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
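Example #18 relies on the SpellChecker class from Whoosh 1.x, which later releases removed. In Whoosh 2.x the equivalent suggestions come from per-field correctors built on the searcher; here is a self-contained sketch with an invented schema and document.

from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

schema = Schema(content=TEXT(stored=True))
ix = RamStorage().create_index(schema)
writer = ix.writer()
writer.add_document(content=u"searching and highlighting with whoosh")
writer.commit()

with ix.searcher() as searcher:
    # The corrector draws candidates from the terms indexed in 'content'.
    corrector = searcher.corrector("content")
    print(corrector.suggest(u"serching", limit=3))  # likely [u'searching']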