Ejemplo n.º 1
0
	def extract(self, analyzer, str):
		"""Tokenize a string with the given analyzer and return its terms.

		analyzer -- Lucene analyzer used to build the token stream
		            (the stream is requested for the field name 'content').
		str      -- text to analyze; wrapped in a StringReader.

		Returns a list with the string form of every token the stream
		emits, in emission order (empty list when there are no tokens).
		"""
		tokenStream = analyzer.tokenStream('content', StringReader(str))

		keywords = []
		try:
			termAttr = tokenStream.addAttribute(lucene.findClass('org/apache/lucene/analysis/tokenattributes/CharTermAttribute'))
			tokenStream.reset()
			while tokenStream.incrementToken():
				keywords.append(termAttr.toString())
			# Signal end-of-stream as required by the TokenStream workflow.
			tokenStream.end()
		finally:
			# Always release the stream, even when analysis fails; an
			# unclosed stream can make the next tokenStream() call on the
			# same analyzer fail.
			tokenStream.close()

		return keywords
Ejemplo n.º 2
0
	def debug(cls, analyzer, str):
		"""Return a printable dump of the tokens an analyzer produces.

		analyzer -- Lucene analyzer used to build the token stream
		            (the stream is requested for the field name 'debug').
		str      -- text to analyze; wrapped in a StringReader.

		Returns one string in which every token appears as '[token] '
		(bracketed, followed by a single space), in emission order.
		An input that yields no tokens gives the empty string.
		"""
		tokenStream = analyzer.tokenStream('debug', StringReader(str))

		pieces = []
		try:
			termAttr = tokenStream.addAttribute(lucene.findClass('org/apache/lucene/analysis/tokenattributes/CharTermAttribute'))
			tokenStream.reset()
			while tokenStream.incrementToken():
				pieces.append('[%s] ' % termAttr.toString())
			# Signal end-of-stream as required by the TokenStream workflow.
			tokenStream.end()
		finally:
			# Always release the stream, even when analysis fails; an
			# unclosed stream can make the next tokenStream() call on the
			# same analyzer fail.
			tokenStream.close()

		# Join once instead of concatenating inside the loop.
		return ''.join(pieces)
Ejemplo n.º 3
0
    def search_interactive_history(self):
        """Run an interactive term-search loop over the 'history' field.

        Repeatedly prompts for a query on standard input, runs it as a
        single-term TermQuery against ``self._searcher_ngram`` (requesting
        at most 1000 hits) and prints the fields of the leading hits.
        Entering an empty line ends the loop.

        Returns the last document that was fetched for display, or None
        if no document was fetched before the user quit.
        """
        doc = None
        while True:
            print()
            print('Hit enter with no input to quit.')
            command = raw_input('Query:')
            if not command:
                return doc
            print()
            # Echo the query; the format string previously had no
            # placeholder, so the entered text was silently dropped.
            print('Searching for: {}'.format(command))
            query = TermQuery(Term('history', command))
            hits = self._searcher_ngram.search(query, 1000).scoreDocs
            print('{} total matching documents.'.format(len(hits)))
            for i, hit in enumerate(hits):
                doc = self._searcher_ngram.doc(hit.doc)
                print('doc {}:'.format(i))
                for f in doc:
                    print(type(f))
                    print('\t{}'.format(f.toString().encode('utf-8')))
                # Stop after the hit at index 30 has been printed.
                if i >= 30:
                    break

if __name__ == '__main__':
    # Probe for a Lucene class and start the JVM only when the lookup
    # fails (presumably because the VM has not been initialised yet --
    # confirm against the PyLucene version in use).
    try:
        lucene.findClass('org/apache/lucene/document/Document')
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed here.
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene {}'.format(lucene.VERSION))