Example #1
def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall)
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)


    # print str(vocab_size) # size of vocabulary
    # print str(vocabulary.getDocCount()) # #docs that have at least one term for title field
    # print str(vocabulary.getSumTotalTermFreq()) # #tokens
    # print str(vocabulary.getSumDocFreq()) # #postings

    reader.close()
    return duration, vocab_size
Example #2
    def get_doc_termvector(self, lucene_doc_id, field):
        """Outputs the document term vector as a generator."""
        terms = self.reader.getTermVector(lucene_doc_id, field)
        if terms:
            termenum = terms.iterator(None)
            for bytesref in BytesRefIterator.cast_(termenum):
                yield bytesref.utf8ToString(), termenum
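The generator yields (term, TermsEnum) pairs, so per-document frequencies can be read off the enum while iterating. A minimal consumer sketch (hypothetical helper; `searcher` stands for whatever object defines get_doc_termvector):

def doc_termfreqs(searcher, lucene_doc_id, field):
    # collapse the generator into a {term: frequency} dict
    return dict((term, int(enum.totalTermFreq()))
                for term, enum in searcher.get_doc_termvector(lucene_doc_id, field))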
Example #3
    def getTermVectors(route):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)
        ls = []
        for doc in range(ireader.numDocs()):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []

            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    # float() avoids Python 2 integer floor division in the ratio
                    idf = math.log(float(ireader.numDocs()) / ireader.docFreq(t))
                    vector.vector.append(text)
                    vector.freqs.append(tf * idf)
                    norm += (tf * idf) * (tf * idf)
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
        return ls
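Each entry of `ls` pairs a FreqVector with its Euclidean norm, which is exactly what cosine scoring needs. A standalone sketch (the helper name and arguments are illustrative, not part of the original project):

def cosine(entry_a, entry_b):
    # cosine similarity between two (FreqVector, norm) entries from getTermVectors
    (vec_a, norm_a), (vec_b, norm_b) = entry_a, entry_b
    if norm_a == 0 or norm_b == 0:
        return 0.0
    weights_b = dict(zip(vec_b.vector, vec_b.freqs))
    dot = sum(w * weights_b.get(t, 0.0) for t, w in zip(vec_a.vector, vec_a.freqs))
    return dot / (norm_a * norm_b)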
Example #4
    def getMostFrequentTermNoStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        return currentTerm
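Usage sketch (the index path and query string are illustrative values): the returned term is a natural candidate for query expansion.

expansion = getMostFrequentTermNoStopwords("res/index", "barcelona weather")
expanded_query = "barcelona weather " + expansion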
Example #5
File: tirza.py Project: komax/tirza
def read_vectors():
    # assumes module-level `reader`, `FIELD_NAMES` and `vectors` defined elsewhere in tirza.py
    for doc in range(0, reader.numDocs()):
        for fieldName in FIELD_NAMES:
            terms = reader.getTermVector(doc, fieldName)
            if terms:
                termsEnum = terms.iterator(None)
                vectors[fieldName][doc] = \
                    set(term.utf8ToString() for term in BytesRefIterator.cast_(termsEnum))
Example #6
    def get_coll_termvector(self, field):
        """Returns the collection term vector for the given field."""
        self.open_reader()
        fields = MultiFields.getFields(self.reader)
        if fields is not None:
            terms = fields.terms(field)
            if terms:
                termenum = terms.iterator(None)
                for bytesref in BytesRefIterator.cast_(termenum):
                    yield bytesref.utf8ToString(), termenum
Example #7
    def test_bug1842(self):

        reader = self.getReader()
        searcher = self.getSearcher()
        q = TermQuery(Term("id", '1'))
        topDocs = searcher.search(q, 50)

        termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
        terms = []
        freqs = []
        termsEnum = termvec.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            terms.append(term.utf8ToString())
            freqs.append(termsEnum.totalTermFreq())
        terms.sort()
        self.assertEqual(terms, ['blah', 'gesundheit'])
        self.assertEqual(freqs, [3, 1])
Example #8
    def test_FieldEnumeration(self):
        self.test_indexDocument()

        store = self.openStore()
        writer = None
        try:
            analyzer = self.getAnalyzer()

            writer = self.getWriter(store, analyzer, False)
            doc = Document()
            doc.add(Field("title", "value of testing",
                          TextField.TYPE_STORED))
            doc.add(Field("docid", str(2),
                          StringField.TYPE_NOT_STORED))
            doc.add(Field("owner", "unittester",
                          StringField.TYPE_STORED))
            doc.add(Field("search_name", "wisdom",
                          StoredField.TYPE))
            doc.add(Field("meta_words", "rabbits are beautiful",
                          TextField.TYPE_NOT_STORED))

            writer.addDocument(doc)

            doc = Document()
            doc.add(Field("owner", "unittester",
                          StringField.TYPE_NOT_STORED))
            doc.add(Field("search_name", "wisdom",
                          StoredField.TYPE))
            doc.add(Field("meta_words", "rabbits are beautiful",
                          TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)
        finally:
            self.closeStore(store, writer)

        store = self.openStore()
        reader = None
        try:
            reader = DirectoryReader.open(store)
            term_enum = MultiFields.getTerms(reader, "docid").iterator()
            docids = [term.utf8ToString()
                      for term in BytesRefIterator.cast_(term_enum)]
            self.assertEqual(len(docids), 2)
        finally:
            self.closeStore(store, reader)
Example #9
    def getFreqVectorFromText(self, text):
        # Attach the current thread to the (already initialized) Lucene JVM
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        indexDir = "res/index"

        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)

        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(indexDir))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        rebuild = True
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        doc = Document()
        doc.add(Field("docName", 'url', Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("content", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
        iwriter.close()

        ireader = IndexReader.open(directory)

        freqVector = []
        docVector = ireader.getTermVector(0, "content")

        termsEnum = docVector.iterator(None)
        for term in BytesRefIterator.cast_(termsEnum):
            term_text = term.utf8ToString()
            freq = termsEnum.totalTermFreq()
            freqVector.append((term_text, freq))

        freqVector = sorted(freqVector, key=itemgetter(1), reverse=True)
        self.vector = list()
        self.freqs = list()
        for el in freqVector:
            self.vector.append(el[0])
            self.freqs.append(el[1])
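A possible call sequence (sketch; `fv` stands for whatever object carries this method):

fv.getFreqVectorFromText("the quick brown fox jumps over the lazy dog")
for term, freq in zip(fv.vector, fv.freqs):
    print term, freq  # highest-frequency terms first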
Example #10
    def get_doc_termfreqs_all_fields(self, lucene_doc_id):
        """
        Returns term frequency for all fields in the given document.

        :param lucene_doc_id: Lucene document ID
        :return: dictionary {field: {term: freq, ...}, ...}
        """
        doc_termfreqs = {}
        vectors = self.reader.getTermVectors(lucene_doc_id)
        if vectors:
            for field in vectors.iterator():
                doc_termfreqs[field] = {}
                terms = vectors.terms(field)
                if terms:
                    termenum = terms.iterator(None)
                    for bytesref in BytesRefIterator.cast_(termenum):
                        doc_termfreqs[field][bytesref.utf8ToString()] = int(termenum.totalTermFreq())
                    # print doc_termfreqs[field]  # debug output
        return doc_termfreqs
Example #11
    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms
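Usage sketch (field name and prefix are illustrative; `lucene_index` stands for the object defining termsForField):

for doc_freq, term in lucene_index.termsForField("title", prefix="lu", limit=5):
    print term, doc_freq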
Example #12
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()

    # sometimes .size() doesn't return the correct size, in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    reader.close()
    return duration, vocab_size
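Given the docstring, a possible driver loop (a sketch; the directories and analyzer choices are illustrative assumptions):

if __name__ == '__main__':
    # compare two analyzer configurations on the same corpus
    for analyzer in [StandardAnalyzer(Version.LUCENE_CURRENT),
                     WhitespaceAnalyzer(Version.LUCENE_CURRENT)]:
        duration, vocab_size = evaluate_index('data', 'index', analyzer)
        print '%s: %.2fs, %d unique title terms' % (type(analyzer).__name__, duration, vocab_size)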
Example #13
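This snippet starts mid-stream: `directory`, `iwriter`, and `ft` come from setup the listing cut off. A minimal reconstruction for a Lucene 6.x-era PyLucene (matching the no-argument `tv.iterator()` below; Example #15 is the same walk-through against the older 4.x API) might be:

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import DirectoryReader, IndexOptions, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.util import BytesRefIterator

directory = RAMDirectory()
iwriter = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)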
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator()

    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.postings(None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()

        print 'term:', term.utf8ToString()
        print '  freq:', freq

        for i in xrange(freq):
            print "  pos:", dpEnum.nextPosition()
            print "  off: %i-%i" %(dpEnum.startOffset(), dpEnum.endOffset())
    print
Example #14
    def search_query_with_relevance_feedback(self,
                                             query,
                                             feedback_qrels,
                                             num_returns=50,
                                             add_num=1):
        query_text = query["description"]
        print(query_text)
        query_text = " ".join(tokenizer.tokenize(query_text))
        query_text = self.remove_stopwords(query_text.lower())
        print(query_text)
        query_number = query["Number"]
        qrel_doc_ids = [
            qrel["docno"] for qrel in feedback_qrels
            if qrel["qid"] == query_number
        ]
        final_list = []
        term_tf_idf = {}
        doc_count = len(qrel_doc_ids)
        for qrel_doc_id in qrel_doc_ids:
            initial_hit = self.feedback_searcher.search(
                TermQuery(Term(".U", qrel_doc_id)), 1).scoreDocs
            if len(initial_hit) == 0:
                continue
            assert len(initial_hit) == 1
            termVector = self.reader.getTermVector(initial_hit[0].doc, "text")
            terms_enum = termVector.iterator()
            termsref = BytesRefIterator.cast_(terms_enum)
            N_terms = 0
            term_idf = {}
            term_freq = {}
            term_list = []
            while termsref.next():
                termval = TermsEnum.cast_(termsref)
                termText = termval.term().utf8ToString()
                if termText in self.stopwords:
                    continue
                tc = termval.totalTermFreq()
                if termText in term_freq:
                    term_freq[termText] += tc
                else:
                    term_freq[termText] = tc
                if termText in term_idf:
                    term_idf[termText] += 1
                else:
                    term_idf[termText] = 1
                if termText not in term_list:
                    term_list.append(termText)
                N_terms = N_terms + 1

            for term in term_list:
                if term in term_tf_idf:
                    term_tf_idf[term] += term_freq[term] / N_terms * (
                        1 + math.log(doc_count / (term_idf[term] + 1)))
                else:
                    term_tf_idf[term] = term_freq[term] / N_terms * (
                        1 + math.log(doc_count / (term_idf[term] + 1)))

        sorted_tf_idf = sorted(term_tf_idf.items(),
                               key=lambda x: x[1],
                               reverse=True)
        for each in sorted_tf_idf:
            if (each[0] not in self.stopwords
                    and not str(each[0]).isnumeric()
                    and each[0] not in query_text.split(" ")):
                final_list.append(each[0])
        print(final_list[:add_num])
        query_text = query_text + " " + " ".join(final_list[:add_num])
        query_text = " ".join(query_text.split(" "))
        print(query_text)
        query_search = self.parser.parse(query_text)
        results = self.searcher.search(query_search, num_returns)
        hits = results.scoreDocs
        trec_results = []
        for rank, hit in enumerate(hits):
            doc = self.searcher.doc(hit.doc)
            trec_result = {
                "QueryID": query["Number"],
                "Q0": "Q0",
                "DocID": doc.get(".U"),
                "Rank": str(rank + 1),
                "Score": str(hit.score),
                "RunID": self.similarity
            }
            trec_results.append(trec_result)
        return trec_results
Example #15
ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be indexed"
]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator(None)

    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.docsAndPositions(None, None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()

        print 'term:', term.utf8ToString()
        print '  freq:', freq

        for i in xrange(freq):
            print "  pos:", dpEnum.nextPosition()
            print "  off: %i-%i" % (dpEnum.startOffset(), dpEnum.endOffset())
    print
Example #16
    def getMostFrequentTermStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = [
            "a",
            "an",
            "and",
            "are",
            "as",
            "at",
            "be",
            "but",
            "by",
            "for",
            "if",
            "no",
            "not",
            "more",
            "http",
            "html",
            "of",
            "on",
            "or",
            "such",
            "that",
            "the",
            "their",
            "then",
            "there",
            "these",
            "they",
            "this",
            "to",
            "was",
            "will",
            "with",
            "el",
            "la",
            "lo",
            "los",
            "las",
            "ante",
            "con",
            "sin",
            "que",
            "es",
            "de",
            "en",
            "por",
            "y",
            "los",
        ]

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString().encode("UTF-8")
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq

        return currentTerm