def getMostFrequentTermNoStopwords(route, query):
    """Return the most frequent term in the "content" field of the index at
    *route* that does not occur in *query*.

    No stop-word filtering is applied here; the index at *route* is assumed
    to have been built without stop words (see createIndexNoStopwords).

    :param route: filesystem path of the Lucene index directory
    :param query: string of already-used terms to exclude (substring test)
    :return: the winning term text, or "" if the index has no term vectors
    """
    # Attach the current Python thread to the running JVM before any
    # PyLucene call.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Dead analyzer/stop-set construction from the original was removed:
    # nothing below used it.
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    try:
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is None:
                continue
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                # Collection-wide frequency of this term across all docs.
                freq = ireader.totalTermFreq(Term("content", term))
                # NOTE(review): `text not in query` is a substring test on
                # the query string, not a word-level test — presumably
                # intentional; confirm with callers.
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    finally:
        # The original leaked the reader; always release index resources.
        ireader.close()
    return currentTerm
def getMostFrequentTermStopwords(route, query):
    """Return the most frequent term in the "content" field of the index at
    *route* that does not occur in *query*.

    The "Stopwords" variant reads an index that was built WITH stop-word
    filtering (see createIndexStopwords); no additional filtering happens
    at read time.

    :param route: filesystem path of the Lucene index directory
    :param query: string of already-used terms to exclude (substring test)
    :return: the winning term text (UTF-8 encoded), or "" if none found
    """
    # Attach the current Python thread to the running JVM before any
    # PyLucene call.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # The original built a StopAnalyzer from an English/Spanish stop-word
    # list here but never used it — removed as dead code; stop-word
    # handling happens at index-build time.
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    try:
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is None:
                continue
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                # Encoded to UTF-8 bytes, matching the original behavior.
                text = term.utf8ToString().encode('UTF-8')
                freq = ireader.totalTermFreq(Term("content", term))
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    finally:
        # The original leaked the reader; always release index resources.
        ireader.close()
    return currentTerm
def testStopWords(self):
    """Constructing a StandardTokenizer wrapped in a StopFilter must not raise."""
    try:
        result = StandardTokenizer(Version.LUCENE_CURRENT, self.reader)
        result = StopFilter(Version.LUCENE_CURRENT, result, self.stop_set)
    except Exception as e:
        # `except Exception, e:` is Python-2-only syntax; `as e` works on
        # Python 2.6+ and Python 3.
        self.fail(str(e))
def getTermVectors(route):
    """Build a tf-idf weighted term vector for every document in the index.

    :param route: filesystem path of the Lucene index directory
    :return: list of (FreqVector, euclidean_norm) tuples, one per document;
             documents without a stored term vector yield an empty
             FreqVector with norm 0.
    """
    # Attach the current Python thread to the running JVM.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Dead analyzer/stop-set construction from the original was removed:
    # nothing below used it.
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    ls = []
    try:
        numDocs = ireader.numDocs()
        for doc in range(numDocs):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []
            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    # Sublinear tf: 1 + log2(raw term frequency).
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    # BUG FIX: force true division. Under Python 2 the
                    # original numDocs / docFreq was INTEGER division,
                    # truncating the ratio (e.g. 1.9 -> 1, so idf = 0).
                    idf = math.log(float(numDocs) / ireader.docFreq(t))
                    weight = tf * idf
                    vector.vector.append(text)
                    vector.freqs.append(weight)
                    norm += weight * weight
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
    finally:
        # The original leaked the reader; always release index resources.
        ireader.close()
    return ls
def createIndexNoStopwords(texts, route, rebuild):
    """Index the given texts at *route* without stop-word filtering.

    :param texts: mapping key -> document text
    :param route: filesystem path of the Lucene index directory
    :param rebuild: if True, recreate the index from scratch; otherwise
                    append to an existing one
    """
    # Attach the current Python thread to the running JVM.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Empty stop-word set: analysis lowercases/tokenizes but removes nothing.
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    try:
        for key in texts:
            doc = Document()
            # NOTE(review): every document gets the constant name "doc",
            # unlike createIndexStopwords which stores the key — confirm
            # this is intentional.
            doc.add(Field("docName", "doc", Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            # Term vectors are stored so tf-idf can be computed later.
            doc.add(Field("content", texts[key], Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)
    finally:
        # Close even if addDocument raises, so the index is not left locked.
        iwriter.close()
def getTermVectors(route):
    """Build a tf-idf weighted term vector for every document in the index.

    :param route: filesystem path of the Lucene index directory
    :return: list of (FreqVector, euclidean_norm) tuples, one per document;
             documents without a stored term vector yield an empty
             FreqVector with norm 0.
    """
    # Attach the current Python thread to the running JVM.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Dead analyzer/stop-set construction from the original was removed:
    # nothing below used it.
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    ls = []
    try:
        numDocs = ireader.numDocs()
        for doc in range(numDocs):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []
            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    # Sublinear tf: 1 + log2(raw term frequency).
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    # BUG FIX: force true division. Under Python 2 the
                    # original numDocs / docFreq was INTEGER division,
                    # truncating the ratio (e.g. 1.9 -> 1, so idf = 0).
                    idf = math.log(float(numDocs) / ireader.docFreq(t))
                    weight = tf * idf
                    vector.vector.append(text)
                    vector.freqs.append(weight)
                    norm += weight * weight
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
    finally:
        # The original leaked the reader; always release index resources.
        ireader.close()
    return ls
def createIndexStopwords(texts, route, rebuild):
    """Index the given texts at *route* with English+Spanish stop-word
    filtering applied by the analyzer.

    :param texts: mapping key -> document text; str(key) is stored as the
                  document name
    :param route: filesystem path of the Lucene index directory
    :param rebuild: if True, recreate the index from scratch; otherwise
                    append to an existing one
    """
    # Attach the current Python thread to the running JVM.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Mixed English and Spanish stop words ("los" appears twice in the
    # original; a set absorbs the duplicate).
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "of", "on", "or", "such", "that", "the",
        "their", "then", "there", "these", "they", "this", "to", "was",
        "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
        "sin", "que", "es", "de", "en", "por", "y", "los"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    try:
        for key in texts:
            doc = Document()
            # str(key) is the idiomatic spelling of key.__str__().
            doc.add(Field("docName", str(key), Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            # Term vectors are stored so tf-idf can be computed later.
            doc.add(Field("content", texts[key], Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)
    finally:
        # Close even if addDocument raises, so the index is not left locked.
        iwriter.close()
def createComponents(self, fieldName, reader):
    """Build the analysis chain: StandardTokenizer -> StandardFilter ->
    LowerCaseFilter -> PorterStemFilter -> StopFilter (English stop set).

    :param fieldName: field being analyzed (unused; same chain for all)
    :param reader: java.io.Reader supplying the raw text
    :return: TokenStreamComponents wrapping the tokenizer and final filter
    """
    source = StandardTokenizer(Version.LUCENE_CURRENT, reader)
    # Renamed from `filter` to avoid shadowing the Python builtin.
    stream = StandardFilter(Version.LUCENE_CURRENT, source)
    stream = LowerCaseFilter(Version.LUCENE_CURRENT, stream)
    stream = PorterStemFilter(stream)
    stream = StopFilter(Version.LUCENE_CURRENT, stream,
                        StopAnalyzer.ENGLISH_STOP_WORDS_SET)
    return self.TokenStreamComponents(source, stream)
def preprocess(text):
    """Tokenize and stop the input text.

    Lowercases *text*, runs it through StandardTokenizer + StopFilter
    (English stop words), and returns the surviving tokens joined by
    single spaces.
    """
    ts = StandardTokenizer(Lucene.get_version(), StringReader(text.lower()))
    ts = StopFilter(Lucene.get_version(), ts,
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET)
    string_builder = StringBuilder()
    ts.reset()
    char_term_attr = ts.addAttribute(CharTermAttribute.class_)
    try:
        while ts.incrementToken():
            if string_builder.length() > 0:
                string_builder.append(" ")
            string_builder.append(char_term_attr.toString())
        # Lucene TokenStream contract: end() after consuming all tokens.
        ts.end()
    finally:
        # The original never closed the stream, leaking analyzer resources.
        ts.close()
    return string_builder.toString()
def testStopList(self):
    """Every token surviving a custom StopAnalyzer must lie outside its stop set."""
    stop_words = ["good", "test", "analyzer"]
    stop_set = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stop_words)
    analyzer = StopAnalyzer(Version.LUCENE_40, stop_set)
    stream = analyzer.tokenStream(
        "test",
        StringReader("This is a good test of the english stop analyzer"))
    self.assert_(stream is not None)
    stream.reset()
    term_attr = stream.getAttribute(CharTermAttribute.class_)
    while stream.incrementToken():
        self.assert_(term_attr.toString() not in stop_set)
def getFreqVectorFromText(self, text):
    """Index *text* as a single document in a scratch index and populate
    self.vector / self.freqs with its terms sorted by descending frequency.

    :param text: raw document text to analyze
    Side effects: rewrites the on-disk index at res/index; sets
    self.vector (term strings) and self.freqs (parallel frequencies).
    """
    # Initialization of Java Virtual Machine with Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = "res/index"
    # Empty stop-word set: keep every token.
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(indexDir))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    # Always rebuild: the scratch index holds exactly this one document
    # (the original's `rebuild = True` if/else was a dead branch).
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iwriter = IndexWriter(directory, conf)
    try:
        doc = Document()
        doc.add(Field("docName", 'url', Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("content", text, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
    finally:
        # Close even on failure so the index is not left write-locked.
        iwriter.close()
    ireader = IndexReader.open(directory)
    try:
        freqVector = []
        docVector = ireader.getTermVector(0, "content")
        termsEnum = docVector.iterator(None)
        for term in BytesRefIterator.cast_(termsEnum):
            # Distinct name: the original reused `text`, clobbering the
            # parameter inside the loop.
            token = term.utf8ToString()
            freqVector.append((token, termsEnum.totalTermFreq()))
    finally:
        # The original leaked the reader.
        ireader.close()
    freqVector.sort(key=itemgetter(1), reverse=True)
    self.vector = [entry[0] for entry in freqVector]
    self.freqs = [entry[1] for entry in freqVector]
def testStopListPositions(self):
    """Position increments must reflect the gaps left by removed stop words."""
    stop_words = ["good", "test", "analyzer"]
    stop_set = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stop_words)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stop_set)
    reader = StringReader(
        "This is a good test of the english stop analyzer with positions")
    # One expected increment per surviving token; values > 1 mark the
    # positions where stop words were dropped.
    expected_increments = [1, 1, 1, 3, 1, 1, 1, 2, 1]
    stream = analyzer.tokenStream("test", reader)
    self.assert_(stream is not None)
    stream.reset()
    term_attr = stream.getAttribute(CharTermAttribute.class_)
    pos_attr = stream.addAttribute(PositionIncrementAttribute.class_)
    index = 0
    while stream.incrementToken():
        self.assert_(term_attr.toString() not in stop_set)
        self.assertEqual(expected_increments[index],
                         pos_attr.getPositionIncrement())
        index += 1
def createIndexNoStopwords(texts, route, rebuild):
    """Index the given texts at *route* without stop-word filtering.

    :param texts: mapping key -> document text
    :param route: filesystem path of the Lucene index directory
    :param rebuild: if True, recreate the index from scratch; otherwise
                    append to an existing one
    """
    # Attach the current Python thread to the running JVM.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Empty stop-word set: analysis lowercases/tokenizes but removes nothing.
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    try:
        for key in texts:
            doc = Document()
            # NOTE(review): every document gets the constant name "doc",
            # unlike createIndexStopwords which stores the key — confirm
            # this is intentional.
            doc.add(Field("docName", "doc", Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            # Term vectors are stored so tf-idf can be computed later.
            doc.add(Field("content", texts[key], Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)
    finally:
        # Close even if addDocument raises, so the index is not left locked.
        iwriter.close()
def createIndexStopwords(texts, route, rebuild):
    """Index the given texts at *route* with English+Spanish stop-word
    filtering applied by the analyzer.

    :param texts: mapping key -> document text; str(key) is stored as the
                  document name
    :param route: filesystem path of the Lucene index directory
    :param rebuild: if True, recreate the index from scratch; otherwise
                    append to an existing one
    """
    # Attach the current Python thread to the running JVM.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Mixed English and Spanish stop words ("los" appears twice in the
    # original; a set absorbs the duplicate).
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "of", "on", "or", "such", "that", "the",
        "their", "then", "there", "these", "they", "this", "to", "was",
        "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
        "sin", "que", "es", "de", "en", "por", "y", "los",
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    try:
        for key in texts:
            doc = Document()
            # str(key) is the idiomatic spelling of key.__str__().
            doc.add(Field("docName", str(key), Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            # Term vectors are stored so tf-idf can be computed later.
            doc.add(Field("content", texts[key], Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)
    finally:
        # Close even if addDocument raises, so the index is not left locked.
        iwriter.close()
def setUp(self):
    """Prepare the stop-word set and a small reader shared by the tests."""
    self.stop_set = StopFilter.makeStopSet(
        Version.LUCENE_CURRENT, ['the', 'and', 's'])
    self.reader = StringReader('foo')
def getMostFrequentTermStopwords(route, query):
    """Return the most frequent term in the "content" field of the index at
    *route* that does not occur in *query*.

    The "Stopwords" variant reads an index that was built WITH stop-word
    filtering (see createIndexStopwords); no additional filtering happens
    at read time.

    :param route: filesystem path of the Lucene index directory
    :param query: string of already-used terms to exclude (substring test)
    :return: the winning term text (UTF-8 encoded), or "" if none found
    """
    # Attach the current Python thread to the running JVM before any
    # PyLucene call.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # The original built a StopAnalyzer from an English/Spanish stop-word
    # list here but never used it — removed as dead code; stop-word
    # handling happens at index-build time.
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    try:
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is None:
                continue
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                # Encoded to UTF-8 bytes, matching the original behavior.
                text = term.utf8ToString().encode("UTF-8")
                freq = ireader.totalTermFreq(Term("content", term))
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    finally:
        # The original leaked the reader; always release index resources.
        ireader.close()
    return currentTerm