Example #1
0
    def search(self, query: str, frequencies: bool = False) -> list:
        """
        Get the stored ``doc_i`` value of every document matching the query.

        Note that the values returned are read from the ``doc_i`` stored field
        of each hit, not whoosh's internal document numbers.

        :param query: The whoosh query string
        :param frequencies: If true, return pairs of (doc_i, frequency) rather than only doc_i
        :return: list of doc_i values, or list of (doc_i, frequency) tuples if frequencies is True
        """
        with self.index.searcher(weighting=scoring.Frequency) as searcher:
            # One parser is enough: it only depends on the field name and schema.
            parser = QueryParser("text", self.index.schema)

            if not frequencies:
                hits = searcher.search(parser.parse(query), limit=None, scored=False, sortedby=None)
                return [hit['doc_i'] for hit in hits]

            # searcher.search was observed to count every individual occurrence
            # of the terms of a phrase ("term1 term2") once the phrase occurs at
            # least once. For frequencies we therefore walk the matcher of each
            # sub-query ourselves and accumulate per span.
            # (There may be a setting that fixes this in searcher.search directly.)
            totals = defaultdict(float)
            reader = searcher.reader()

            for subquery in divide_query(query):
                matcher = parser.parse(subquery).matcher(searcher)

                while matcher.is_active():
                    doc_i = reader.stored_fields(matcher.id())['doc_i']
                    boosts = boostdict(matcher)
                    for span in matcher.spans():
                        # Spans without an entry in the boost mapping count once.
                        totals[doc_i] += boosts[span] if span in boosts else 1
                    matcher.next()

            return list(totals.items())