Ejemplo n.º 1
0
 def __init__(self, lucene_results, searcher, compressor, query):
     """Keep the search collaborators and set up hit highlighting.

     lucene_results -- object exposing ``totalHits`` (read here).
     searcher       -- stored as ``self.searcher``.
     compressor     -- stored as ``self.compressor``.
     query          -- query handed to ``lucene.QueryScorer`` to drive the
                       highlighter.
     """
     self.lucene_results = lucene_results
     self.searcher = searcher
     self.compressor = compressor

     # Highlighter scored against the query that produced these results.
     self.highlighter = lucene.Highlighter(lucene.QueryScorer(query))
     self.analyzer = BlogCorpusAnalyzer()
     # Fragment size of 10000 -- presumably large enough to keep a whole
     # document in a single fragment; TODO confirm.
     self.highlighter.setTextFragmenter(lucene.SimpleFragmenter(10000))

     # Reported length is the hit count capped at MAX_RESULTS.
     self.length = min(lucene_results.totalHits, MAX_RESULTS)
Ejemplo n.º 2
0
class SearchResults(object):
    """Sequence-like wrapper around a Lucene result set.

    Supports ``len()``, integer indexing and slicing. Each item is a
    ``Result`` built from the highlighted text of the matching document.
    """

    def __init__(self, lucene_results, searcher, compressor, query):
        """Store the collaborators and configure the highlighter.

        lucene_results -- object exposing ``totalHits`` and ``scoreDocs``.
        searcher       -- used to load documents via ``searcher.doc(id)``.
        compressor     -- object with ``decompress()``, or None when the
                          text is read from the plain 'contents' field.
        query          -- query passed to ``lucene.QueryScorer``.
        """
        self.lucene_results = lucene_results
        self.searcher = searcher
        self.compressor = compressor
        scorer = lucene.QueryScorer(query)
        self.highlighter = lucene.Highlighter(scorer)
        self.analyzer = BlogCorpusAnalyzer()
        # Fragment size of 10000 -- presumably large enough to keep a whole
        # document in a single fragment; TODO confirm.
        fragmenter = lucene.SimpleFragmenter(10000)
        self.highlighter.setTextFragmenter(fragmenter)
        # Never report more hits than the MAX_RESULTS cap.
        self.length = min(self.lucene_results.totalHits, MAX_RESULTS)

    def __len__(self):
        """Return the number of results (hit count capped at MAX_RESULTS)."""
        return self.length

    def uncompress_contents(self, doc):
        """Return the decompressed value of the document's 'compressed' field."""
        unicode_str = doc.getField('compressed').stringValue()
        # hack, because not indexed as a binary field. fix that
        # NOTE(review): chr(ord(x)) over the encoded value only works where
        # iterating it yields 1-character strings (Python 2 str) -- confirm
        # before running under Python 3.
        normal_str = ''.join(chr(ord(x)) for x in unicode_str.encode('utf8'))
        return self.compressor.decompress(normal_str)

    @staticmethod
    def _parse_highlighted(highlighted):
        """Split a highlighted fragment into words and the first marked run.

        Tokens are whitespace separated and may carry '@'-separated fields;
        a token that starts with '<B>' counts as highlighted.

        Returns ``(words, start, end)`` where ``start``/``end`` are the
        inclusive token indexes of the first contiguous run of highlighted
        tokens, or None when no token is highlighted.
        """
        actual_words = []
        highlighted_words = []
        for index, word in enumerate(highlighted.split()):
            fields = word.split('@')
            wordform = fields[0]
            # An empty or bare '<B>' first field means the word form sits in
            # the second field; if there is none, keep the first field
            # instead of raising IndexError.
            if wordform in ('', '<B>') and len(fields) > 1:
                wordform = fields[1]
            if word.startswith('<B>'):
                highlighted_words.append(index)
            actual_words.append(wordform.replace('<B>', ''))

        if not highlighted_words:
            return None

        # Take only the first contiguous stretch of highlighted words.
        start = end = highlighted_words[0]
        for index in highlighted_words[1:]:
            if index != end + 1:
                break
            end = index
        return actual_words, start, end

    def get_doc(self, number):
        """Build the ``Result`` for the hit at position ``number``.

        Loads the document, highlights its text against the query and
        returns ``Result(words, start, end)``. When nothing can be
        highlighted, the placeholder
        ``Result(['ERROR', 'ERROR', 'ERROR'], 1, 2)`` is returned instead.
        """
        scoredoc = self.lucene_results.scoreDocs[number]
        doc = self.searcher.doc(scoredoc.doc)
        if self.compressor is not None:
            contents = self.uncompress_contents(doc)
        else:
            contents = doc.getField('contents').stringValue()

        tokenStream = self.analyzer.tokenStream('f',
                lucene.StringReader(contents))
        highlighted = self.highlighter.getBestFragment(tokenStream, contents)

        parsed = None
        if highlighted is not None:
            parsed = self._parse_highlighted(highlighted)

        # TODO: fix this ridiculous thing
        # Covers both "no fragment" and "fragment without any token starting
        # with <B>"; the latter used to raise IndexError.
        if parsed is None:
            return Result(['ERROR', 'ERROR', 'ERROR'], 1, 2)

        actual_words, start, end = parsed
        return Result(actual_words, start, end)

    def __getitem__(self, item):
        """Return one ``Result`` (int index) or a list of them (slice).

        Slices follow normal sequence semantics: missing bounds, negative
        indexes and a step are resolved against ``len(self)``, and
        out-of-range bounds are clamped. Placeholder results (those with
        'ERROR' in their ``before`` attribute) are dropped from slices.
        """
        if isinstance(item, slice):
            results = []
            # slice.indices() resolves None/negative bounds and the step.
            for i in range(*item.indices(self.length)):
                d = self.get_doc(i)
                if 'ERROR' not in d.before:
                    results.append(d)
            return results
        return self.get_doc(item)