Example #1
0
    def getMostFrequentTermNoStopwords(route, query):
        """Return the most frequent "content" term that is not in the query.

        Walks the "content" term vector of every document in the index stored
        at ``route`` and keeps the term with the highest index-wide total
        frequency, skipping every term for which ``term in query`` is true.

        Args:
            route: Filesystem path of the Lucene index directory.
            query: Terms to exclude; only the ``in`` operator is applied to it.

        Returns:
            The text of the winning term, or "" when no term qualifies.
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        directory = SimpleFSDirectory(File(route))
        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        try:
            for doc in range(ireader.numDocs()):
                terms = ireader.getTermVector(doc, "content")
                if terms is None:
                    # Document has no term vector for the field.
                    continue
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    freq = ireader.totalTermFreq(Term("content", term))
                    # Strict ">" keeps the first term seen on ties.
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        finally:
            # Release the index files even when iteration fails.
            ireader.close()
        return currentTerm
Example #2
0
    def getMostFrequentTermStopwords(route, query):
        """Return the top "content" term, skipping stop words and query terms.

        Walks the "content" term vector of every document in the index stored
        at ``route`` and keeps the term with the highest index-wide total
        frequency. Terms found in the stop-word list below, or for which
        ``term in query`` is true, are never selected.

        Args:
            route: Filesystem path of the Lucene index directory.
            query: Terms to exclude; only the ``in`` operator is applied to it.

        Returns:
            The UTF-8 encoded text of the winning term, or "" when no term
            qualifies.
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        # Previously this list was only wrapped in an analyzer that was never
        # consulted, so it had no effect; it is now applied to candidates.
        stopWords = frozenset([
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "more", "http", "html", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "el", "la", "lo", "los",
            "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
        ])

        directory = SimpleFSDirectory(File(route))
        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        try:
            for doc in range(ireader.numDocs()):
                terms = ireader.getTermVector(doc, "content")
                if terms is None:
                    # Document has no term vector for the field.
                    continue
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    decoded = term.utf8ToString()
                    if decoded in stopWords:
                        continue
                    text = decoded.encode('UTF-8')
                    freq = ireader.totalTermFreq(Term("content", term))
                    # Strict ">" keeps the first term seen on ties.
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        finally:
            # Release the index files even when iteration fails.
            ireader.close()

        return currentTerm
Example #3
0
    def testStopWords(self):
        """Check that a StopFilter can be stacked on a StandardTokenizer.

        Only construction is exercised; no tokens are consumed. Any exception
        raised while building the chain is reported as a test failure.
        """
        try:
            tokenizer = StandardTokenizer(Version.LUCENE_CURRENT, self.reader)
            StopFilter(Version.LUCENE_CURRENT, tokenizer, self.stop_set)
        except Exception as e:
            self.fail(str(e))
Example #4
0
    def getTermVectors(route):
        """Build a TF-IDF weighted term vector for every document in the index.

        For each term of a document's "content" term vector the weight is
        ``(1 + log2(tf)) * ln(numDocs / docFreq)``.

        Args:
            route: Filesystem path of the Lucene index directory.

        Returns:
            A list with one ``(FreqVector, norm)`` tuple per document, where
            ``FreqVector.vector`` holds the term texts, ``FreqVector.freqs``
            the matching weights and ``norm`` the Euclidean norm of the
            weights. Documents without a term vector get an empty
            ``FreqVector`` and a norm of 0.
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        directory = SimpleFSDirectory(File(route))
        ireader = IndexReader.open(directory)
        ls = []
        try:
            # Loop invariant; float so the IDF ratio is not truncated by
            # integer division on Python 2.
            numDocs = ireader.numDocs()
            totalDocs = float(numDocs)

            for doc in range(numDocs):
                vector = FreqVector()
                vector.vector = []
                vector.freqs = []

                terms = ireader.getTermVector(doc, "content")
                if terms is None:
                    ls.append((vector, 0))
                    continue

                norm = 0.0
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    # Frequency reported by the term-vector enum.
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    idf = math.log(
                        totalDocs / ireader.docFreq(Term("content", term)))
                    weight = tf * idf
                    vector.vector.append(text)
                    vector.freqs.append(weight)
                    norm += weight * weight
                ls.append((vector, math.sqrt(norm)))
        finally:
            # Release the index files even when iteration fails.
            ireader.close()
        return ls
Example #5
0
    def createIndexNoStopwords(texts, route, rebuild):
        """Index every text in ``texts`` without removing any stop words.

        Each value of ``texts`` becomes one document whose "content" field is
        stored, analyzed and indexed with term vectors.

        Args:
            texts: Mapping whose values are the document texts.
            route: Filesystem path of the Lucene index directory.
            rebuild: When true the index is recreated from scratch; otherwise
                documents are appended to an existing index (or a new one is
                created).
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        # An empty stop set makes the analyzer keep every token.
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)
        try:
            for key in texts:
                doc = Document()
                # NOTE(review): every document gets the constant name "doc",
                # whereas createIndexStopwords stores the key - confirm this
                # is intended.
                doc.add(
                    Field("docName", "doc", Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                doc.add(
                    Field("content", texts[key], Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
                iwriter.addDocument(doc)
        finally:
            # Always close so the index write lock is released on errors too.
            iwriter.close()
Example #6
0
    def getTermVectors(route):
        """Build a TF-IDF weighted term vector for every document in the index.

        For each term of a document's "content" term vector the weight is
        ``(1 + log2(tf)) * ln(numDocs / docFreq)``.

        Args:
            route: Filesystem path of the Lucene index directory.

        Returns:
            A list with one ``(FreqVector, norm)`` tuple per document, where
            ``FreqVector.vector`` holds the term texts, ``FreqVector.freqs``
            the matching weights and ``norm`` the Euclidean norm of the
            weights. Documents without a term vector get an empty
            ``FreqVector`` and a norm of 0.
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        directory = SimpleFSDirectory(File(route))
        ireader = IndexReader.open(directory)
        ls = []
        try:
            # Loop invariant; float so the IDF ratio is not truncated by
            # integer division on Python 2.
            numDocs = ireader.numDocs()
            totalDocs = float(numDocs)

            for doc in range(numDocs):
                vector = FreqVector()
                vector.vector = []
                vector.freqs = []

                terms = ireader.getTermVector(doc, "content")
                if terms is None:
                    ls.append((vector, 0))
                    continue

                norm = 0.0
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    # Frequency reported by the term-vector enum.
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    idf = math.log(
                        totalDocs / ireader.docFreq(Term("content", term)))
                    weight = tf * idf
                    vector.vector.append(text)
                    vector.freqs.append(weight)
                    norm += weight * weight
                ls.append((vector, math.sqrt(norm)))
        finally:
            # Release the index files even when iteration fails.
            ireader.close()
        return ls
Example #7
0
    def createIndexStopwords(texts, route, rebuild):
        """Index every text in ``texts``, dropping English/Spanish stop words.

        Each entry of ``texts`` becomes one document: the key is stored as
        "docName" and the value is stored, analyzed and indexed with term
        vectors as "content".

        Args:
            texts: Mapping from document name to document text.
            route: Filesystem path of the Lucene index directory.
            rebuild: When true the index is recreated from scratch; otherwise
                documents are appended to an existing index (or a new one is
                created).
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "of", "on", "or", "such", "that", "the",
            "their", "then", "there", "these", "they", "this", "to", "was",
            "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
            "sin", "que", "es", "de", "en", "por", "y",
        ]
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)
        try:
            for key in texts:
                doc = Document()
                doc.add(
                    Field("docName", str(key), Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                doc.add(
                    Field("content", texts[key], Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
                iwriter.addDocument(doc)
        finally:
            # Always close so the index write lock is released on errors too.
            iwriter.close()
Example #8
0
    def getMostFrequentTermNoStopwords(route, query):
        """Return the most frequent "content" term that is not in the query.

        Walks the "content" term vector of every document in the index stored
        at ``route`` and keeps the term with the highest index-wide total
        frequency, skipping every term for which ``term in query`` is true.

        Args:
            route: Filesystem path of the Lucene index directory.
            query: Terms to exclude; only the ``in`` operator is applied to it.

        Returns:
            The text of the winning term, or "" when no term qualifies.
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        directory = SimpleFSDirectory(File(route))
        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        try:
            for doc in range(ireader.numDocs()):
                terms = ireader.getTermVector(doc, "content")
                if terms is None:
                    # Document has no term vector for the field.
                    continue
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    freq = ireader.totalTermFreq(Term("content", term))
                    # Strict ">" keeps the first term seen on ties.
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        finally:
            # Release the index files even when iteration fails.
            ireader.close()
        return currentTerm
Example #9
0
 def createComponents(self, fieldName, reader):
     """Assemble the analysis chain for a field.

     The chain is: standard tokenizer -> standard filter -> lower-casing ->
     Porter stemming -> English stop-word removal. ``fieldName`` is not
     consulted; every field gets the same chain.
     """
     version = Version.LUCENE_CURRENT
     tokenizer = StandardTokenizer(version, reader)

     normalized = LowerCaseFilter(version, StandardFilter(version, tokenizer))
     stemmed = PorterStemFilter(normalized)
     # NOTE(review): the stop filter receives already-stemmed tokens; confirm
     # that filtering after stemming is intended.
     stopped = StopFilter(version, stemmed,
                          StopAnalyzer.ENGLISH_STOP_WORDS_SET)

     return self.TokenStreamComponents(tokenizer, stopped)
Example #10
0
 def preprocess(text):
     """Tokenize and stop the input text.

     The text is lower-cased, split with a StandardTokenizer and filtered
     against the English stop-word set.

     Args:
         text: The raw input string.

     Returns:
         The surviving tokens joined by single spaces ("" when none remain).
     """
     ts = StandardTokenizer(Lucene.get_version(), StringReader(text.lower()))
     ts = StopFilter(Lucene.get_version(), ts,
                     StopAnalyzer.ENGLISH_STOP_WORDS_SET)
     char_term_attr = ts.addAttribute(CharTermAttribute.class_)

     tokens = []
     try:
         ts.reset()
         while ts.incrementToken():
             tokens.append(char_term_attr.toString())
         ts.end()
     finally:
         # Finish the TokenStream life cycle so the reader is released.
         ts.close()
     return " ".join(tokens)
Example #11
0
    def testStopList(self):
        """Check that a StopAnalyzer with a custom list emits no stop word."""
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)

        newStop = StopAnalyzer(Version.LUCENE_40, stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer")
        stream = newStop.tokenStream("test", reader)
        self.assertIsNotNone(stream)
        stream.reset()

        termAtt = stream.getAttribute(CharTermAttribute.class_)

        # Every token that survives analysis must be outside the stop set.
        while stream.incrementToken():
            text = termAtt.toString()
            self.assertNotIn(text, stopWordsSet)
Example #12
0
    def getFreqVectorFromText(self, text, indexDir="res/index"):
        """Fill ``self.vector`` / ``self.freqs`` with the term counts of ``text``.

        The text is written as the only document of a freshly created index,
        its "content" term vector is read back, and the terms are stored in
        ``self.vector`` ordered by descending frequency with the matching
        counts in ``self.freqs``. Both lists are empty when the text yields
        no term vector.

        Args:
            text: The text to analyze.
            indexDir: Directory of the scratch index; it is recreated on
                every call.
        """
        # Attach this thread to the Java Virtual Machine running Lucene.
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        # An empty stop set makes the analyzer keep every token.
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(indexDir))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        # Always start from an empty index so that document 0 is this text.
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        iwriter = IndexWriter(directory, conf)
        try:
            doc = Document()
            doc.add(Field("docName", 'url', Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            doc.add(Field("content", text, Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)
        finally:
            # Always close so the index write lock is released on errors too.
            iwriter.close()

        freqVector = []
        ireader = IndexReader.open(directory)
        try:
            docVector = ireader.getTermVector(0, "content")
            # No term vector is available when the text produced no terms.
            if docVector is not None:
                termsEnum = docVector.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    freqVector.append(
                        (term.utf8ToString(), termsEnum.totalTermFreq()))
        finally:
            ireader.close()

        # Stable sort: terms with equal counts keep their enumeration order.
        freqVector.sort(key=itemgetter(1), reverse=True)
        self.vector = [termText for termText, _ in freqVector]
        self.freqs = [freq for _, freq in freqVector]
Example #13
0
    def testStopListPositions(self):
        """Check position increments left behind by removed stop words.

        Each surviving token must not be a stop word and must report the
        expected position increment (greater than 1 right after one or more
        removed words).
        """
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)

        newStop = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer with positions")
        expectedIncr = [ 1,   1, 1,          3, 1,  1,      1,            2,   1]
        stream = newStop.tokenStream("test", reader)
        self.assertIsNotNone(stream)
        stream.reset()

        i = 0
        termAtt = stream.getAttribute(CharTermAttribute.class_)
        posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)

        while stream.incrementToken():
            text = termAtt.toString()
            self.assertNotIn(text, stopWordsSet)
            self.assertEqual(expectedIncr[i],
                             posIncrAtt.getPositionIncrement())
            i += 1

        # Guard against a stream that ends early and skips the checks above.
        self.assertEqual(len(expectedIncr), i)
Example #14
0
    def createIndexNoStopwords(texts, route, rebuild):
        """Index every text in ``texts`` without removing any stop words.

        Each value of ``texts`` becomes one document whose "content" field is
        stored, analyzed and indexed with term vectors.

        Args:
            texts: Mapping whose values are the document texts.
            route: Filesystem path of the Lucene index directory.
            rebuild: When true the index is recreated from scratch; otherwise
                documents are appended to an existing index (or a new one is
                created).
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        # An empty stop set makes the analyzer keep every token.
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)
        try:
            for key in texts:
                doc = Document()
                # NOTE(review): every document gets the constant name "doc",
                # whereas createIndexStopwords stores the key - confirm this
                # is intended.
                doc.add(Field("docName", "doc", Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("content", texts[key], Field.Store.YES,
                              Field.Index.ANALYZED, Field.TermVector.YES))
                iwriter.addDocument(doc)
        finally:
            # Always close so the index write lock is released on errors too.
            iwriter.close()
Example #15
0
    def createIndexStopwords(texts, route, rebuild):
        """Index every text in ``texts``, dropping English/Spanish stop words.

        Each entry of ``texts`` becomes one document: the key is stored as
        "docName" and the value is stored, analyzed and indexed with term
        vectors as "content".

        Args:
            texts: Mapping from document name to document text.
            route: Filesystem path of the Lucene index directory.
            rebuild: When true the index is recreated from scratch; otherwise
                documents are appended to an existing index (or a new one is
                created).
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "of", "on", "or", "such", "that", "the",
            "their", "then", "there", "these", "they", "this", "to", "was",
            "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
            "sin", "que", "es", "de", "en", "por", "y",
        ]
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)
        try:
            for key in texts:
                doc = Document()
                doc.add(Field("docName", str(key), Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                doc.add(Field("content", texts[key], Field.Store.YES,
                              Field.Index.ANALYZED, Field.TermVector.YES))
                iwriter.addDocument(doc)
        finally:
            # Always close so the index write lock is released on errors too.
            iwriter.close()
Example #16
0
    def setUp(self):
        """Create the fixtures: a small stop-word set and a reader over 'foo'."""
        words = ['the', 'and', 's']

        self.reader = StringReader('foo')
        self.stop_set = StopFilter.makeStopSet(Version.LUCENE_CURRENT, words)
Example #17
0
    def getMostFrequentTermStopwords(route, query):
        """Return the top "content" term, skipping stop words and query terms.

        Walks the "content" term vector of every document in the index stored
        at ``route`` and keeps the term with the highest index-wide total
        frequency. Terms found in the stop-word list below, or for which
        ``term in query`` is true, are never selected.

        Args:
            route: Filesystem path of the Lucene index directory.
            query: Terms to exclude; only the ``in`` operator is applied to it.

        Returns:
            The UTF-8 encoded text of the winning term, or "" when no term
            qualifies.
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        # Previously this list was only wrapped in an analyzer that was never
        # consulted, so it had no effect; it is now applied to candidates.
        stopWords = frozenset([
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "more", "http", "html", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "el", "la", "lo", "los",
            "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
        ])

        directory = SimpleFSDirectory(File(route))
        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        try:
            for doc in range(ireader.numDocs()):
                terms = ireader.getTermVector(doc, "content")
                if terms is None:
                    # Document has no term vector for the field.
                    continue
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    decoded = term.utf8ToString()
                    if decoded in stopWords:
                        continue
                    text = decoded.encode("UTF-8")
                    freq = ireader.totalTermFreq(Term("content", term))
                    # Strict ">" keeps the first term seen on ties.
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        finally:
            # Release the index files even when iteration fails.
            ireader.close()

        return currentTerm
    def setUp(self):
        """Create the fixtures: a small stop-word set and a reader over 'foo'."""
        words = ['the', 'and', 's']

        self.reader = StringReader('foo')
        self.stop_set = StopFilter.makeStopSet(Version.LUCENE_CURRENT, words)