Example #1
    def getTermVectors(route):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)
        ls = []
        for doc in range(ireader.numDocs()):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []

            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    # log2-scaled term frequency, then idf; float() avoids
                    # Python 2 integer division truncating the ratio
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    idf = math.log(float(ireader.numDocs()) / ireader.docFreq(t))
                    vector.vector.append(text)
                    vector.freqs.append(tf * idf)
                    norm += (tf * idf) * (tf * idf)
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
        return ls
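A minimal usage sketch for the function above (illustrative only: it assumes lucene.initVM() was called at startup, that "res/index" holds an index whose "content" field stores term vectors, and that cosine similarity is the intended use of the returned (vector, norm) pairs):

    # Compare the first two indexed documents by cosine similarity.
    vectors = getTermVectors("res/index")
    (v1, norm1), (v2, norm2) = vectors[0], vectors[1]
    weights = dict(zip(v1.vector, v1.freqs))
    dot = sum(w * weights.get(t, 0.0) for t, w in zip(v2.vector, v2.freqs))
    if norm1 and norm2:
        print("cosine similarity: %.4f" % (dot / (norm1 * norm2)))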
Example #2
    def getMostFrequentTermNoStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        return currentTerm
Example #3
    def getMostFrequentTermStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "more", "http", "html", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "el", "la", "lo", "los",
            "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y"
        ]

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString().encode('UTF-8')
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq

        return currentTerm
Example #4
    def testStop(self):

        a = StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
        self._assertAnalyzesTo(a, "foo bar FOO BAR",
                               ["foo", "bar", "foo", "bar"])
        self._assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                               ["foo", "bar", "foo", "bar"])
Example #5
    def createIndexStopwords(texts, route, rebuild):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "of", "on", "or", "such", "that", "the",
            "their", "then", "there", "these", "they", "this", "to", "was",
            "will", "with", "el", "la", "lo", "los", "las", "ante", "con",
            "sin", "que", "es", "de", "en", "por", "y"
        ]
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        for key in texts:
            doc = Document()
            doc.add(
                Field("docName", key.__str__(), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)

        iwriter.close()
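A hedged end-to-end sketch combining this indexer with the helper from Example #3 (the texts and path are illustrative; lucene.initVM() must run once per process before any other Lucene call):

    lucene.initVM()
    texts = {1: "the quick brown fox", 2: "el zorro salta sin miedo"}
    createIndexStopwords(texts, "res/index", rebuild=True)
    # StopAnalyzer drops "the"/"el" at index time, so a content word wins.
    print(getMostFrequentTermStopwords("res/index", query=""))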
Example #6
    def testStopList(self):

        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer")
        stream = newStop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()

        termAtt = stream.getAttribute(CharTermAttribute.class_)

        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)
Example #7
    def testStop(self):

        a = StopAnalyzer()
        self._assertAnalyzesTo(a, "foo bar FOO BAR",
                               ["foo", "bar", "foo", "bar"])
        self._assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                               ["foo", "bar", "foo", "bar"])
Example #8
    def testStop(self):

        a = StopAnalyzer(Version.LUCENE_CURRENT)
        self._assertAnalyzesTo(a, "foo bar FOO BAR",
                               ["foo", "bar", "foo", "bar"])
        self._assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                               ["foo", "bar", "foo", "bar"])
Example #9
    def createIndexNoStopwords(texts, route, rebuild):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        for key in texts:
            doc = Document()
            doc.add(
                Field("docName", "doc", Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
            iwriter.addDocument(doc)

        iwriter.close()
Example #10
    def __init__(self):
        # self.analyzer = StandardAnalyzer()
        # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
        # self.analyzer = PersianAnalyzer()
        self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
        self.config = IndexWriterConfig(self.analyzer)
        self.index = RAMDirectory()
        self.w = IndexWriter(self.index, self.config)
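A hedged usage sketch for this constructor (the class name Indexer is hypothetical, as is the field layout; RAMDirectory keeps the whole index in heap memory, which suits tests better than large corpora):

    indexer = Indexer()  # hypothetical name for the class defining __init__ above
    doc = Document()
    doc.add(TextField("content", "sample text to index", Field.Store.YES))
    indexer.w.addDocument(doc)
    indexer.w.commit()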
Example #11
    def testStopListPositions(self):

        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer with positions")
        expectedIncr = [ 1,   1, 1,          3, 1,  1,      1,            2,   1]
        stream = newStop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()

        i = 0
        termAtt = stream.getAttribute(CharTermAttribute.class_)
        posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)

        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)
            self.assertEqual(expectedIncr[i],
                             posIncrAtt.getPositionIncrement())
            i += 1
Example #12
    def testPhraseQueryWithStopAnalyzer(self):

        writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        query = PhraseQuery()
        query.add(Term("field", "stop"))
        query.add(Term("field", "words"))
        scoreDocs = searcher.search(query, None, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
Example #13
    def testPhraseQueryWithStopAnalyzer(self):

        writer = self.getWriter(analyzer=StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        builder = PhraseQuery.Builder()
        builder.add(Term("field", "stop"))
        builder.add(Term("field", "words"))
        scoreDocs = searcher.search(builder.build(), 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
Example #14
    def getFreqVectorFromText(self, text):
        # Initialization of Java Virtual Machine with Lucene
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        indexDir = "res/index"

        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)

        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(indexDir))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        rebuild = True
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        doc = Document()
        doc.add(Field("docName", 'url', Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("content", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
        iwriter.close()

        ireader = IndexReader.open(directory)

        freqVector = []
        docVector = ireader.getTermVector(0, "content")

        termsEnum = docVector.iterator(None)
        for term in BytesRefIterator.cast_(termsEnum):
            text = term.utf8ToString()
            freq = termsEnum.totalTermFreq()
            freqVector.append((text, freq))

        freqVector = sorted(freqVector, key=itemgetter(1), reverse=True)
        self.vector = list()
        self.freqs = list()
        for el in freqVector:
            self.vector.append(el[0])
            self.freqs.append(el[1])
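A short usage sketch (assuming this method belongs to the FreqVector class seen in Example #1, which is an inference from the self.vector and self.freqs attributes it sets):

    fv = FreqVector()
    fv.getFreqVectorFromText("to be or not to be that is the question")
    # vector and freqs are now sorted by descending term frequency
    print(list(zip(fv.vector, fv.freqs))[:3])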
Example #15
class StopAnalyzerTestCase(unittest.TestCase):
    """
    Unit tests ported from Java Lucene
    """

    def setUp(self):

        self.stop = StopAnalyzer()
        self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET

    def testDefaults(self):

        self.assert_(self.stop is not None)
        reader = StringReader("This is a test of the english stop analyzer")
        stream = self.stop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()

        termAtt = stream.getAttribute(CharTermAttribute.class_)

        while stream.incrementToken():
            self.assert_(termAtt.toString() not in self.invalidTokens)

    def testStopList(self):

        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer")
        stream = newStop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()

        termAtt = stream.getAttribute(CharTermAttribute.class_)

        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)

    def testStopListPositions(self):

        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer with positions")
        expectedIncr = [ 1,   1, 1,          3, 1,  1,      1,            2,   1]
        stream = newStop.tokenStream("test", reader)
        self.assert_(stream is not None)
        stream.reset()

        i = 0
        termAtt = stream.getAttribute(CharTermAttribute.class_)
        posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)

        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)
            self.assertEqual(expectedIncr[i],
                             posIncrAtt.getPositionIncrement())
            i += 1
Example #16
    def setUp(self):

        self.stop = StopAnalyzer(Version.LUCENE_CURRENT)
        self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET
Example #17
    def setUp(self):

        self.stop = StopAnalyzer()
        self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET
Example #18
def parseQuery(myQuery):
    parser = QueryParser("", StopAnalyzer())
    parsedQuery = parser.parse(myQuery)
    myQueryTerms = parsedQuery.toString().split(" ")
    return myQueryTerms
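For illustration, a hedged usage example (the exact output depends on the StopAnalyzer default stop set and on Query.toString() formatting):

    terms = parseQuery("the quick brown fox")
    print(terms)  # e.g. ['quick', 'brown', 'fox'] -- "the" is dropped as a stop word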