Example #1
def evaluate_index(index_dir, context, analyzer):
    # measure the overall indexing time; ideally the elapsed time of
    # each index_document call would also be measured separately
    start = time.time()
    Indexer(index_dir, context, analyzer)
    end = time.time()
    duration = end - start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()

    # size() returns -1 when the statistic is unavailable; in that case
    # count the terms manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    # print(vocab_size)                        # size of the vocabulary
    # print(vocabulary.getDocCount())          # docs with at least one term in 'title'
    # print(vocabulary.getSumTotalTermFreq())  # number of tokens
    # print(vocabulary.getSumDocFreq())        # number of postings

    reader.close()
    return duration, vocab_size
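Note on newer versions: MultiFields.getTerms was deprecated in Lucene 8 and removed in Lucene 9 in favor of MultiTerms.getTerms. If these snippets fail on a recent PyLucene, the equivalent call is a one-line swap (a sketch, assuming Lucene 8+):

from org.apache.lucene.index import MultiTerms

vocabulary = MultiTerms.getTerms(reader, 'title')  # replaces MultiFields.getTerms(reader, 'title')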
Example #2
def index_scan():
    print("Scanning the index")
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    fields = MultiFields.getFields(reader)
    for field in fields:
        terms = MultiFields.getTerms(reader, field)
        print(field, "->", terms)
    reader.close()
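The Terms object printed above carries per-field statistics as well. A small sketch of a more informative scan loop, using only methods from the Lucene Terms API (reader and fields are the names from the snippet):

for field in fields:
    terms = MultiFields.getTerms(reader, field)
    if terms is not None:
        # aggregate statistics stored per field
        print(field, "unique terms:", terms.size(),
              "docs:", terms.getDocCount(),
              "tokens:", terms.getSumTotalTermFreq())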
Example #3
def get_terms(indexReader, field='text'):
    """
    Gets all terms in an index.

    :param indexReader: IndexReader object for your index
    :param field: document field from which terms should be collected
    :return: list of terms (strings)
    """
    terms = []
    multiterms = MultiFields.getTerms(indexReader, field)
    termit = multiterms.iterator()
    # PyLucene does not surface Java inheritance, so the TermsEnum has
    # to be cast to a BytesRefIterator explicitly
    it = BytesRefIterator.cast_(termit)
    term = it.next()
    while term:
        terms.append(term.utf8ToString())
        term = it.next()
    return terms
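A usage sketch, assuming an existing index directory (the path "index/" is only illustrative) and an initialized PyLucene VM:

directory = FSDirectory.open(File("index/").toPath())
reader = DirectoryReader.open(directory)
terms = get_terms(reader, field='text')
print(len(terms), "unique terms; first few:", terms[:5])
reader.close()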
Example #4
    def test_FieldEnumeration(self):
        self.test_indexDocument()

        store = self.openStore()
        writer = None
        try:
            analyzer = self.getAnalyzer()

            writer = self.getWriter(store, analyzer, False)
            doc = Document()
            doc.add(Field("title", "value of testing",
                          TextField.TYPE_STORED))
            doc.add(Field("docid", str(2),
                          StringField.TYPE_NOT_STORED))
            doc.add(Field("owner", "unittester",
                          StringField.TYPE_STORED))
            doc.add(Field("search_name", "wisdom",
                          StoredField.TYPE))
            doc.add(Field("meta_words", "rabbits are beautiful",
                          TextField.TYPE_NOT_STORED))

            writer.addDocument(doc)

            doc = Document()
            doc.add(Field("owner", "unittester",
                          StringField.TYPE_NOT_STORED))
            doc.add(Field("search_name", "wisdom",
                          StoredField.TYPE))
            doc.add(Field("meta_words", "rabbits are beautiful",
                          TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)
        finally:
            self.closeStore(store, writer)

        store = self.openStore()
        reader = None
        try:
            reader = DirectoryReader.open(store)
            term_enum = MultiFields.getTerms(reader, "docid").iterator()
            docids = [term.utf8ToString()
                      for term in BytesRefIterator.cast_(term_enum)]
            self.assertEqual(len(docids), 2)
        finally:
            self.closeStore(store, reader)
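The mix of TYPE_STORED and TYPE_NOT_STORED above matters at retrieval time: only stored field values can be read back from a document. A minimal sketch, assuming the reader opened in the test (doc id 0 is illustrative):

searcher = IndexSearcher(reader)
doc = searcher.doc(0)     # stored fields of one document
print(doc.get("title"))   # stored field: the value comes back
print(doc.get("docid"))   # not stored: returns None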
Example #5
    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(
            self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            # seekCeil positions the enum at the first term >= prefix;
            # if it runs past the last term there is nothing to collect
            if iterator.seekCeil(BytesRef(prefix)) == TermsEnum.SeekStatus.END:
                return terms
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms
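A usage sketch; composer stands for whatever object carries this method and the _indexAndTaxonomy attribute it expects (the name is hypothetical):

top = composer.termsForField("title", prefix="luc", limit=5)
for docFreq, term in top:
    print(term, "occurs in", docFreq, "documents")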
Example #6
    def get_dicts(self, reader, tweets, num_docs):
        """ Investigate the index by constructing a term dict (term -> id)
        and an idf dict (term -> idf value). """

        terms_dict = {}
        idf_dict = {}
        terms_ctr = 0

        # iterate over each term in the index
        term_enum = MultiFields.getTerms(reader, "contents").iterator(None)
        for bytes_ref in util.BytesRefIterator.cast_(term_enum):
            s = bytes_ref.utf8ToString()
            terms_dict[s] = terms_ctr
            terms_ctr += 1

            # count occurrences of this term among the tweets and calculate
            # its idf; assumes every indexed term occurs in at least one
            # tweet text, otherwise the division below raises ZeroDivisionError
            doc_presence_ctr = 0
            for tweet in tweets:
                if s in tweet[1].text.lower():
                    doc_presence_ctr += 1

            idf_dict[s] = log(float(num_docs) / doc_presence_ctr, 10)

        return terms_dict, idf_dict
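The substring scan above only approximates document frequency; Lucene stores the exact value in the postings. A sketch of the same idf computed from index statistics via IndexReader.docFreq (variable names follow the snippet):

from math import log
from org.apache.lucene.index import Term

df = reader.docFreq(Term("contents", s))   # exact document frequency
idf = log(float(num_docs) / df, 10) if df else 0.0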
Example #7
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.time()
    Indexer(data_dir, store_dir, analyzer)
    end = time.time()
    duration = end - start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()

    # size() returns -1 when the statistic is unavailable; in that case
    # we have to count the terms manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    reader.close()
    return duration, vocab_size
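A usage sketch comparing two analyzer configurations; the data and index paths are illustrative, and Indexer is the class assumed by the snippet:

from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer

for analyzer in (StandardAnalyzer(), WhitespaceAnalyzer()):
    duration, vocab_size = evaluate_index("data/", "index/", analyzer)
    print(type(analyzer).__name__, "indexed in %.2fs," % duration,
          vocab_size, "unique title terms")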
Example #8
# this snippet relies on a user-defined IndexFiles class (not shown);
# the remaining imports are spelled out so the fragment is self-contained
import os
import sys
import timeit
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader, MultiFields
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import BytesRef, BytesRefIterator

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    start_time = timeit.default_timer()
    try:
        index = IndexFiles(os.path.dirname(sys.argv[0]), StandardAnalyzer())

        index_reader = DirectoryReader.open(index.store)
        # get vocab size

        terms = MultiFields.getTerms(index_reader, 'contents')
        termEnum = terms.iterator()
        vocabCounter = 0
        for term in BytesRefIterator.cast_(termEnum):
            vocabCounter += 1
        print("Number of docs:", index_reader.numDocs())
        print("Vocab size:", vocabCounter)

        # print min, max, mean
        querystr = 'بازار بزرگ تهران'
        print("Query: ", querystr)
        q = QueryParser("contents", index.analyzer).parse(querystr)
        hitsPerPage = 20
        searcher = IndexSearcher(index_reader)
        docs = searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs