def getTermVectors(route):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # mirrors the analyzer used at indexing time; not needed for reading
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    ls = []
    for doc in range(ireader.numDocs()):
        vector = FreqVector()
        vector.vector = []
        vector.freqs = []
        norm = 0.0
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                # log-scaled term frequency
                tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                t = Term("content", term)
                # inverse document frequency; float() guards against
                # integer division under Python 2
                idf = math.log(float(ireader.numDocs()) / ireader.docFreq(t))
                vector.vector.append(text)
                vector.freqs.append(tf * idf)
                norm += (tf * idf) * (tf * idf)
            ls.append((vector, math.sqrt(norm)))
        else:
            ls.append((vector, 0))
    return ls
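# A minimal usage sketch built on getTermVectors above: cosine similarity
# between two indexed documents' tf-idf vectors. The index path and the
# document ids are illustrative assumptions, not part of the original code.
def cosineSimilarity(route, docA, docB):
    vectors = getTermVectors(route)
    (va, normA), (vb, normB) = vectors[docA], vectors[docB]
    if normA == 0 or normB == 0:
        return 0.0
    weightsA = dict(zip(va.vector, va.freqs))
    # dot product over the terms the two vectors share
    dot = sum(weightsA.get(term, 0.0) * w
              for term, w in zip(vb.vector, vb.freqs))
    return dot / (normA * normB)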
def getMostFrequentTermNoStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # mirrors the analyzer used at indexing time; not needed for reading
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                # keep the most frequent term not already in the query
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def getMostFrequentTermStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # English and Spanish stop words, plus a few web-noise tokens
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "more", "http", "html", "of", "on", "or",
        "such", "that", "the", "their", "then", "there", "these", "they",
        "this", "to", "was", "will", "with", "el", "la", "lo", "los",
        "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                # encode to match the byte strings used in the query terms
                text = term.utf8ToString().encode('UTF-8')
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
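# A hedged sketch of naive query expansion using the helper above: repeatedly
# pull in the most frequent index term not already present in the query.
# `route`, the starting query, and the round count are illustrative.
def expandQuery(route, query, rounds=3):
    expanded = list(query)
    for _ in range(rounds):
        term = getMostFrequentTermStopwords(route, expanded)
        if not term:  # helper returns "" when nothing new is found
            break
        expanded.append(term)
    return expanded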
def testStop(self):
    a = StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
    self._assertAnalyzesTo(a, "foo bar FOO BAR",
                           ["foo", "bar", "foo", "bar"])
    self._assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                           ["foo", "bar", "foo", "bar"])
def createIndexStopwords(texts, route, rebuild):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # English and Spanish stop words
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "of", "on", "or", "such", "that", "the",
        "their", "then", "there", "these", "they", "this", "to", "was",
        "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
        "sin", "que", "es", "de", "en", "por", "y"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        doc.add(Field("docName", str(key), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
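# A minimal usage sketch for the indexer above; the document dict and the
# index path are illustrative assumptions. initVM() must run once per
# process before getVMEnv()/attachCurrentThread() can succeed.
if __name__ == '__main__':
    import lucene
    lucene.initVM()
    texts = {1: "the quick brown fox", 2: "el zorro y el cuervo"}
    createIndexStopwords(texts, "res/index_stop", rebuild=True)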
def testStopList(self):
    stopWords = ["good", "test", "analyzer"]
    stopWordsSet = StopFilter.makeStopSet(stopWords)
    newStop = StopAnalyzer(stopWordsSet)
    reader = StringReader("This is a good test of the english stop analyzer")
    stream = newStop.tokenStream("test", reader)
    self.assert_(stream is not None)
    stream.reset()
    termAtt = stream.getAttribute(CharTermAttribute.class_)
    while stream.incrementToken():
        text = termAtt.toString()
        self.assert_(text not in stopWordsSet)
def testStop(self):
    a = StopAnalyzer()
    self._assertAnalyzesTo(a, "foo bar FOO BAR",
                           ["foo", "bar", "foo", "bar"])
    self._assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                           ["foo", "bar", "foo", "bar"])
def testStop(self):
    a = StopAnalyzer(Version.LUCENE_CURRENT)
    self._assertAnalyzesTo(a, "foo bar FOO BAR",
                           ["foo", "bar", "foo", "bar"])
    self._assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                           ["foo", "bar", "foo", "bar"])
def createIndexNoStopwords(texts, route, rebuild):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # empty stop set: StopAnalyzer only tokenizes, nothing is filtered
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        doc.add(Field("docName", "doc", Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
def __init__(self):
    # alternatives tried before settling on a file-based stop list:
    # self.analyzer = StandardAnalyzer()
    # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
    # self.analyzer = PersianAnalyzer()
    self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
    self.config = IndexWriterConfig(self.analyzer)
    self.index = RAMDirectory()
    self.w = IndexWriter(self.index, self.config)
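# A hedged companion sketch for the writer configured above: one way to add
# a document to the in-memory index. The method name and field names are
# assumptions for illustration, not part of the original class.
def add_document(self, doc_id, body):
    doc = Document()
    doc.add(StringField("id", doc_id, Field.Store.YES))  # exact-match id
    doc.add(TextField("body", body, Field.Store.YES))    # analyzed text
    self.w.addDocument(doc)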
def testStopListPositions(self):
    stopWords = ["good", "test", "analyzer"]
    stopWordsSet = StopFilter.makeStopSet(stopWords)
    newStop = StopAnalyzer(stopWordsSet)
    reader = StringReader(
        "This is a good test of the english stop analyzer with positions")
    # position increments greater than 1 skip over the removed stop words
    expectedIncr = [1, 1, 1, 3, 1, 1, 1, 2, 1]
    stream = newStop.tokenStream("test", reader)
    self.assert_(stream is not None)
    stream.reset()
    i = 0
    termAtt = stream.getAttribute(CharTermAttribute.class_)
    posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)
    while stream.incrementToken():
        text = termAtt.toString()
        self.assert_(text not in stopWordsSet)
        self.assertEqual(expectedIncr[i], posIncrAtt.getPositionIncrement())
        i += 1
def testPhraseQueryWithStopAnalyzer(self):
    writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
    doc = Document()
    doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()
    searcher = self.getSearcher()
    # valid exact phrase query
    query = PhraseQuery()
    query.add(Term("field", "stop"))
    query.add(Term("field", "words"))
    scoreDocs = searcher.search(query, None, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
def testPhraseQueryWithStopAnalyzer(self):
    writer = self.getWriter(
        analyzer=StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET))
    doc = Document()
    doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()
    searcher = self.getSearcher()
    # valid exact phrase query
    builder = PhraseQuery.Builder()
    builder.add(Term("field", "stop"))
    builder.add(Term("field", "words"))
    scoreDocs = searcher.search(builder.build(), 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
def getFreqVectorFromText(self, text):
    # Initialization of the Java Virtual Machine with Lucene
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = "res/index"
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(indexDir))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    # the throwaway single-document index is always rebuilt
    rebuild = True
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    doc = Document()
    doc.add(Field("docName", 'url', Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
    doc.add(Field("content", text, Field.Store.YES,
                  Field.Index.ANALYZED, Field.TermVector.YES))
    iwriter.addDocument(doc)
    iwriter.close()
    ireader = IndexReader.open(directory)
    freqVector = []
    docVector = ireader.getTermVector(0, "content")
    termsEnum = docVector.iterator(None)
    for term in BytesRefIterator.cast_(termsEnum):
        termText = term.utf8ToString()  # avoid shadowing the `text` argument
        freq = termsEnum.totalTermFreq()
        freqVector.append((termText, freq))
    freqVector = sorted(freqVector, key=itemgetter(1), reverse=True)
    self.vector = list()
    self.freqs = list()
    for el in freqVector:
        self.vector.append(el[0])
        self.freqs.append(el[1])
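# A hedged usage sketch; the enclosing class name (FreqVector here) is an
# assumption -- the original only shows the method body.
fv = FreqVector()
fv.getFreqVectorFromText("to be or not to be that is the question")
print(list(zip(fv.vector, fv.freqs)))  # terms sorted by descending frequency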
class StopAnalyzerTestCase(unittest.TestCase):
    """
    Unit tests ported from Java Lucene
    """

    def setUp(self):
        self.stop = StopAnalyzer()
        self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET

    def testDefaults(self):
        self.assert_(self.stop is not None)
        reader = StringReader("This is a test of the english stop analyzer")
        stream = self.stop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()
        termAtt = stream.getAttribute(CharTermAttribute.class_)
        while stream.incrementToken():
            self.assert_(termAtt.toString() not in self.invalidTokens)

    def testStopList(self):
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)
        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer")
        stream = newStop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()
        termAtt = stream.getAttribute(CharTermAttribute.class_)
        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)

    def testStopListPositions(self):
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)
        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader(
            "This is a good test of the english stop analyzer with positions")
        expectedIncr = [1, 1, 1, 3, 1, 1, 1, 2, 1]
        stream = newStop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()
        i = 0
        termAtt = stream.getAttribute(CharTermAttribute.class_)
        posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)
        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)
            self.assertEqual(expectedIncr[i], posIncrAtt.getPositionIncrement())
            i += 1
def setUp(self):
    self.stop = StopAnalyzer(Version.LUCENE_CURRENT)
    self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET
def setUp(self):
    self.stop = StopAnalyzer()
    self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET
def parseQuery(myQuery):
    # parse against an empty default field so toString() yields bare terms
    parser = QueryParser("", StopAnalyzer())
    parsedQuery = parser.parse(myQuery)
    myQueryTerms = parsedQuery.toString().split(" ")
    return myQueryTerms
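# A minimal usage sketch: with StopAnalyzer in the parser, terms come back
# lower-cased and stop-word-free. The query string is illustrative.
terms = parseQuery("The Quick Brown Fox")
# -> something like ['quick', 'brown', 'fox'], depending on the stop set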