def createIndexNoStopwords(texts, route, rebuild):
    """Build a Lucene index over *texts* without filtering any stopwords.

    texts   -- mapping from key to document content string
    route   -- filesystem path of the index directory
    rebuild -- when true, wipe and recreate the index; otherwise append
    """
    # Attach this thread to the already-running JVM before using Lucene.
    lucene.getVMEnv().attachCurrentThread()

    # An empty stop set makes the StopAnalyzer keep every token.
    empty_stop_set = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, empty_stop_set)

    index_dir = SimpleFSDirectory(File(route))
    config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    mode = (IndexWriterConfig.OpenMode.CREATE
            if rebuild else IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    config.setOpenMode(mode)

    writer = IndexWriter(index_dir, config)
    for key in texts:
        document = Document()
        # NOTE(review): every document gets the constant name "doc" here,
        # while createIndexStopwords stores the key — confirm intended.
        document.add(Field("docName", "doc",
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(Field("content", texts[key],
                           Field.Store.YES, Field.Index.ANALYZED,
                           Field.TermVector.YES))
        writer.addDocument(document)
    writer.close()
def getMostFrequentTermStopwords(route, query):
    """Return the most frequent indexed term that is not already in *query*.

    Scans the "content" term vector of every document in the index at
    *route* and tracks the single term with the highest total frequency
    across the whole index, skipping terms already present in *query*.
    Returns "" when the index is empty or every term occurs in *query*.
    """
    # Attach this thread to the running JVM before touching Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # English + Spanish stopword list ("los" appears twice; harmless in a set).
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "more", "http", "html", "of", "on", "or",
        "such", "that", "the", "their", "then", "there", "these", "they",
        "this", "to", "was", "will", "with", "el", "la", "lo", "los",
        "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
        "los"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    # NOTE(review): the analyzer is built but never used below — term
    # vectors are read directly from the index reader.
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                # assumes Python 2 string semantics: encode() yields a
                # byte string comparable against *query* — TODO confirm
                text = term.utf8ToString().encode('UTF-8')
                t = Term("content", term)
                # Total frequency of the term across the entire index.
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def getTermVectors(route):
    """Build a TF-IDF weighted FreqVector, plus its Euclidean norm, for
    every document in the index at *route*.

    Returns a list of (FreqVector, norm) tuples, one per document; a
    document without a stored term vector contributes an empty vector
    with norm 0.
    """
    lucene.getVMEnv().attachCurrentThread()
    # Empty stop set: the analyzer keeps every token (unused for reading).
    no_stop = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, no_stop)
    reader = IndexReader.open(SimpleFSDirectory(File(route)))
    result = []
    for doc_id in range(reader.numDocs()):
        freq_vector = FreqVector()
        freq_vector.vector = []
        freq_vector.freqs = []
        terms = reader.getTermVector(doc_id, "content")
        if terms is None:
            # No stored term vector for this document.
            result.append((freq_vector, 0))
            continue
        norm_sq = 0.0
        terms_enum = terms.iterator(None)
        for term in BytesRefIterator.cast_(terms_enum):
            word = term.utf8ToString()
            # Log-scaled term frequency within this document.
            tf = 1 + math.log(terms_enum.totalTermFreq(), 2)
            idf = math.log(reader.numDocs() /
                           reader.docFreq(Term("content", term)))
            weight = tf * idf
            freq_vector.vector.append(word)
            freq_vector.freqs.append(weight)
            norm_sq += weight * weight
        result.append((freq_vector, math.sqrt(norm_sq)))
    return result
def createIndexStopwords(texts, route, rebuild):
    """Build a Lucene index over *texts* with an English+Spanish stopword list.

    texts   -- mapping from key to document content string; the key is
               stored as the document's "docName"
    route   -- filesystem path of the index directory
    rebuild -- when true, wipe and recreate the index; otherwise append
    """
    # Attach this thread to the already-running JVM before using Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # English + Spanish stopword list ("los" appears twice; harmless in a set).
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "of", "on", "or", "such", "that", "the",
        "their", "then", "there", "these", "they", "this", "to", "was",
        "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
        "sin", "que", "es", "de", "en", "por", "y", "los"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        # str(key) instead of key.__str__(): idiomatic, same result.
        doc.add(Field("docName", str(key),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key],
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
def getMostFrequentTermNoStopwords(route, query):
    """Return the most frequent indexed term that does not occur in *query*.

    The index at *route* is scanned via per-document term vectors; the
    term with the highest total index-wide frequency wins. Returns ""
    when nothing qualifies.
    """
    lucene.getVMEnv().attachCurrentThread()
    # Empty stop set: the analyzer keeps every token (unused for reading).
    no_stop = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, no_stop)
    reader = IndexReader.open(SimpleFSDirectory(File(route)))
    best_term = ""
    best_freq = 0
    for doc_id in range(reader.numDocs()):
        term_vector = reader.getTermVector(doc_id, "content")
        if term_vector is None:
            continue
        terms_enum = term_vector.iterator(None)
        for term in BytesRefIterator.cast_(terms_enum):
            token = term.utf8ToString()
            # Total frequency of this term across the whole index.
            freq = reader.totalTermFreq(Term("content", term))
            if freq > best_freq and token not in query:
                best_term = token
                best_freq = freq
    return best_term
def getTermVectors(route):
    """Build a TF-IDF weighted FreqVector, plus its Euclidean norm, for
    every document in the index at *route*.

    Returns a list of (FreqVector, norm) tuples, one per document; a
    document without a stored term vector yields an empty vector with
    norm 0.  NOTE(review): duplicates an earlier definition of the same
    name in this file.
    """
    # Attach this thread to the running JVM before touching Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Empty stop set: the analyzer keeps every token (unused for reading).
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    ls = []
    for doc in range(ireader.numDocs()):
        vector = FreqVector()
        vector.vector = []
        vector.freqs = []
        norm = 0.0
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                # Log-scaled term frequency within this document.
                tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                t = Term("content", term)
                # assumes Python 2 integer division here — TODO confirm
                idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                vector.vector.append(text)
                vector.freqs.append(tf * idf)
                # Accumulate squared weights for the Euclidean norm.
                norm += (tf * idf) * (tf * idf)
            ls.append((vector, math.sqrt(norm)))
        else:
            # No stored term vector: empty vector with zero norm.
            ls.append((vector, 0))
    return ls
def getMostFrequentTermNoStopwords(route, query):
    """Return the most frequent indexed term that is not already in *query*.

    Scans the "content" term vector of every document in the index at
    *route* and keeps the single term with the highest total index-wide
    frequency, skipping terms that occur in *query*.  Returns "" when
    nothing qualifies.  NOTE(review): duplicates an earlier definition
    of the same name in this file.
    """
    # Attach this thread to the running JVM before touching Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Empty stop set: the analyzer keeps every token (unused for reading).
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                t = Term("content", term)
                # Total frequency of the term across the entire index.
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def testStopList(self):
    """A StopAnalyzer built from a custom stop set must emit none of its
    stopwords when tokenizing.
    """
    stopWords = ["good", "test", "analyzer"]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    newStop = StopAnalyzer(Version.LUCENE_40, stopWordsSet)
    reader = StringReader("This is a good test of the english stop analyzer")
    stream = newStop.tokenStream("test", reader)
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(stream is not None)
    stream.reset()
    termAtt = stream.getAttribute(CharTermAttribute.class_)
    while stream.incrementToken():
        text = termAtt.toString()
        # No emitted token may be one of the configured stopwords.
        self.assertTrue(text not in stopWordsSet)
def getFreqVectorFromText(self, text):
    """Index *text* as a single document and load its term-frequency
    vector, sorted by descending frequency, into self.vector (terms)
    and self.freqs (counts).
    """
    # Initialization of Java Virtual Machine with Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = "res/index"
    # Empty stop set: keep every token.
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(indexDir))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    # The index is always rebuilt from scratch (the original 'rebuild'
    # flag was hard-coded True, making the append branch unreachable).
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iwriter = IndexWriter(directory, conf)
    doc = Document()
    doc.add(Field("docName", 'url',
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("content", text,
                  Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.YES))
    iwriter.addDocument(doc)
    iwriter.close()

    # Read back the single document's term vector (doc id 0).
    ireader = IndexReader.open(directory)
    freqVector = []
    docVector = ireader.getTermVector(0, "content")
    termsEnum = docVector.iterator(None)
    for term in BytesRefIterator.cast_(termsEnum):
        # Renamed from 'text' to stop shadowing the parameter.
        termText = term.utf8ToString()
        freq = termsEnum.totalTermFreq()
        freqVector.append((termText, freq))
    # Most frequent terms first.
    freqVector = sorted(freqVector, key=itemgetter(1), reverse=True)
    self.vector = [t for t, _ in freqVector]
    self.freqs = [f for _, f in freqVector]
def testStopListPositions(self):
    """Removed stopwords must be reflected as position-increment gaps in
    the surviving tokens, and no stopword may be emitted.
    """
    stopWords = ["good", "test", "analyzer"]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    newStop = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    reader = StringReader(
        "This is a good test of the english stop analyzer with positions")
    # Expected position increment for each emitted token; values > 1
    # mark gaps left by removed stopwords.
    expectedIncr = [1, 1, 1, 3, 1, 1, 1, 2, 1]
    stream = newStop.tokenStream("test", reader)
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(stream is not None)
    stream.reset()
    i = 0
    termAtt = stream.getAttribute(CharTermAttribute.class_)
    posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)
    while stream.incrementToken():
        text = termAtt.toString()
        self.assertTrue(text not in stopWordsSet)
        self.assertEqual(expectedIncr[i], posIncrAtt.getPositionIncrement())
        i += 1
def createIndexNoStopwords(texts, route, rebuild):
    """Build a Lucene index over *texts* without filtering any stopwords.

    texts   -- mapping from key to document content string
    route   -- filesystem path of the index directory
    rebuild -- when true, wipe and recreate the index; otherwise append

    NOTE(review): duplicates an earlier definition of the same name in
    this file.
    """
    # Attach this thread to the running JVM before touching Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # Empty stop set: the analyzer keeps every token.
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        # NOTE(review): stores the constant name "doc" for every document,
        # unlike the stopword variant which stores the key — confirm intended.
        doc.add(Field("docName", "doc",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key],
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
def setUp(self):
    """Create the shared fixtures: a small stop set and a string reader."""
    self.stop_set = StopFilter.makeStopSet(
        Version.LUCENE_CURRENT, ['the', 'and', 's'])
    self.reader = StringReader('foo')
def setUp(self):
    """Build the fixture: a three-word stop set plus a dummy reader."""
    words = ['the', 'and', 's']
    self.stop_set = StopFilter.makeStopSet(Version.LUCENE_CURRENT, words)
    self.reader = StringReader('foo')
def createIndexStopwords(texts, route, rebuild):
    """Build a Lucene index over *texts* with an English+Spanish stopword list.

    texts   -- mapping from key to document content string; the key is
               stored as the document's "docName"
    route   -- filesystem path of the index directory
    rebuild -- when true, wipe and recreate the index; otherwise append

    NOTE(review): duplicates an earlier definition of the same name in
    this file.
    """
    # Attach this thread to the running JVM before touching Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # English + Spanish stopword list ("los" appears twice; harmless in a set).
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "of", "on", "or", "such", "that", "the",
        "their", "then", "there", "these", "they", "this", "to", "was",
        "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
        "sin", "que", "es", "de", "en", "por", "y", "los",
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        # key.__str__() is the key coerced to str (str(key) would be idiomatic).
        doc.add(Field("docName", key.__str__(),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key],
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
def getMostFrequentTermStopwords(route, query):
    """Return the highest-frequency indexed term that does not occur in
    *query*; "" when nothing qualifies.
    """
    lucene.getVMEnv().attachCurrentThread()
    # English + Spanish stopword list ("los" appears twice; harmless in a set).
    stop_words = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "more", "http", "html", "of", "on", "or",
        "such", "that", "the", "their", "then", "there", "these", "they",
        "this", "to", "was", "will", "with", "el", "la", "lo", "los",
        "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
        "los",
    ]
    stop_set = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stop_words)
    # Built for parity with the indexing path; not used for reading.
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stop_set)
    reader = IndexReader.open(SimpleFSDirectory(File(route)))
    best_term = ""
    best_freq = 0
    for doc_id in range(reader.numDocs()):
        term_vector = reader.getTermVector(doc_id, "content")
        if term_vector is None:
            continue
        terms_enum = term_vector.iterator(None)
        for term in BytesRefIterator.cast_(terms_enum):
            token = term.utf8ToString().encode("UTF-8")
            # Total frequency of this term across the whole index.
            freq = reader.totalTermFreq(Term("content", term))
            if freq > best_freq and token not in query:
                best_term = token
                best_freq = freq
    return best_term