# (Imports assumed from earlier in the file: `import lucene`,
# `from java.io import StringReader`, `from org.apache.lucene.index import Term`,
# `from org.apache.lucene.search import TermQuery`.)

    def extract(self, analyzer, text):
        """Tokenize `text` with `analyzer` and return the terms as a list."""
        tokenStream = analyzer.tokenStream('content', StringReader(text))
        termAttr = tokenStream.addAttribute(
            lucene.findClass('org/apache/lucene/analysis/tokenattributes/CharTermAttribute'))
        keywords = []
        tokenStream.reset()
        while tokenStream.incrementToken():
            keywords.append(termAttr.toString())
        tokenStream.end()
        tokenStream.close()
        return keywords
    @classmethod
    def debug(cls, analyzer, text):
        """Tokenize `text` and return the terms as a '[t1] [t2] ...' debug string."""
        tokenStream = analyzer.tokenStream('debug', StringReader(text))
        termAttr = tokenStream.addAttribute(
            lucene.findClass('org/apache/lucene/analysis/tokenattributes/CharTermAttribute'))
        tokens = ''
        tokenStream.reset()
        while tokenStream.incrementToken():
            tokens += '[%s] ' % termAttr.toString()
        tokenStream.end()
        tokenStream.close()
        return tokens
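    # Usage sketch (hypothetical, not from the original file): tokenize a
    # string with a concrete analyzer. Assumes lucene.initVM() has already
    # run and that `indexer` is an instance of this class.
    #
    #   from org.apache.lucene.analysis.standard import StandardAnalyzer
    #   print(indexer.extract(StandardAnalyzer(), 'The Quick Brown Fox'))
    #   print(indexer.debug(StandardAnalyzer(), 'The Quick Brown Fox'))
    #   # terms come back lower-cased; stop-word removal depends on the
    #   # Lucene version's StandardAnalyzer defaults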
    def search_interactive_history(self):
        """Interactively query the 'history' field; returns the last doc shown."""
        doc = None
        while True:
            print()
            print('Hit enter with no input to quit.')
            command = input('Query:')
            if command == '':
                return doc
            print()
            print('Searching for: {}'.format(command))
            query = TermQuery(Term('history', command))
            hits = self._searcher_ngram.search(query, 1000).scoreDocs
            print('{} total matching documents.'.format(len(hits)))
            for i, hit in enumerate(hits):
                doc = self._searcher_ngram.doc(hit.doc)
                print('doc {}:'.format(i))
                for f in doc:  # iterate the document's stored fields
                    print(type(f))
                    print('\t{}'.format(f.toString()))
                # print('text:', doc.get('text'))
                if i >= 30:  # show at most the first few dozen hits
                    break


if __name__ == '__main__':
    try:
        # findClass raises if the JVM is not running yet; use it as a probe
        # so initVM is only called once.
        lucene.findClass('org/apache/lucene/document/Document')
    except Exception:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene {}'.format(lucene.VERSION))
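# Hypothetical continuation (an assumption; the original section ends after
# printing the version): open an existing on-disk index and start the
# interactive loop. The class name `Searcher`, the 'index' path, and the
# `_searcher_ngram` wiring are illustrative only.
#
#   from java.nio.file import Paths
#   from org.apache.lucene.index import DirectoryReader
#   from org.apache.lucene.search import IndexSearcher
#   from org.apache.lucene.store import FSDirectory
#
#   reader = DirectoryReader.open(FSDirectory.open(Paths.get('index')))
#   searcher = Searcher()                      # the class defined above
#   searcher._searcher_ngram = IndexSearcher(reader)
#   searcher.search_interactive_history()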