def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall)
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # Terms.size() returns -1 when the statistic is unavailable;
    # fall back to counting the terms manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    # print str(vocab_size)                        # size of vocabulary
    # print str(vocabulary.getDocCount())          # #docs that have at least one term for title field
    # print str(vocabulary.getSumTotalTermFreq())  # #tokens
    # print str(vocabulary.getSumDocFreq())        # #postings
    reader.close()
    return duration, vocab_size
def index_scan():
    print("Scanning the index")
    # pdb.set_trace()
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    fields = MultiFields.getFields(reader)
    for field in fields:
        term = MultiFields.getTerms(reader, field)
        print(field, "->", term)
def get_terms(indexReader, field='text'):
    """
    Gets all terms in an index.

    :param indexReader: IndexReader object of your index
    :param field: document field from which terms should be counted
    :return: list of terms (strings)
    """
    terms = []
    multiterms = MultiFields.getTerms(indexReader, field)
    termit = multiterms.iterator()
    # Inheritance apparently doesn't work in PyLucene...
    it = BytesRefIterator.cast_(termit)
    term = it.next()
    while term:
        terms.append(term.utf8ToString())
        term = it.next()
    return terms
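A minimal sketch of how get_terms might be called; the index location "index/" is a hypothetical placeholder, and the imports assume a PyLucene build where Terms.iterator() takes no argument (Lucene 7+), matching the snippet above.

import lucene
from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory

lucene.initVM()
directory = FSDirectory.open(Paths.get("index/"))  # hypothetical index location
reader = DirectoryReader.open(directory)
print(get_terms(reader, field='text'))  # prints every term indexed under 'text'
reader.close()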
def test_FieldEnumeration(self):
    self.test_indexDocument()

    store = self.openStore()
    writer = None
    try:
        analyzer = self.getAnalyzer()
        writer = self.getWriter(store, analyzer, False)

        doc = Document()
        doc.add(Field("title", "value of testing", TextField.TYPE_STORED))
        doc.add(Field("docid", str(2), StringField.TYPE_NOT_STORED))
        doc.add(Field("owner", "unittester", StringField.TYPE_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("owner", "unittester", StringField.TYPE_NOT_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)
    finally:
        self.closeStore(store, writer)

    store = self.openStore()
    reader = None
    try:
        reader = DirectoryReader.open(store)
        term_enum = MultiFields.getTerms(reader, "docid").iterator()
        docids = [term.utf8ToString()
                  for term in BytesRefIterator.cast_(term_enum)]
        self.assertEqual(len(docids), 2)
    finally:
        self.closeStore(store, reader)
def termsForField(self, field, prefix=None, limit=10, **kwargs):
    convert = lambda term: term.utf8ToString()
    terms = []
    termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
    if termsEnum is None:
        return terms
    iterator = termsEnum.iterator(None)
    if prefix:
        iterator.seekCeil(BytesRef(prefix))
        terms.append((iterator.docFreq(), convert(iterator.term())))
    bytesIterator = BytesRefIterator.cast_(iterator)
    try:
        while len(terms) < limit:
            term = convert(bytesIterator.next())
            if prefix and not term.startswith(prefix):
                break
            terms.append((iterator.docFreq(), term))
    except StopIteration:
        pass
    return terms
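The seekCeil call above is what anchors the prefix scan: it positions the TermsEnum at the smallest indexed term >= the prefix. A stripped-down sketch of the same pattern follows; it assumes a Lucene 4.x-era reader (iterator(None)) and a hypothetical 'title' field, and its termination behavior mirrors the snippet above (next() raising StopIteration at the end of the enum).

from org.apache.lucene.index import MultiFields, TermsEnum
from org.apache.lucene.util import BytesRef, BytesRefIterator

def terms_with_prefix(reader, prefix, field='title', limit=10):
    terms_enum = MultiFields.getTerms(reader, field).iterator(None)
    # jump to the first term >= prefix; END means no such term exists
    if terms_enum.seekCeil(BytesRef(prefix)) == TermsEnum.SeekStatus.END:
        return []
    first = terms_enum.term().utf8ToString()
    if not first.startswith(prefix):
        return []  # seekCeil landed past all terms sharing the prefix
    results = [first]
    it = BytesRefIterator.cast_(terms_enum)  # same PyLucene cast as above
    try:
        while len(results) < limit:
            term = it.next().utf8ToString()
            if not term.startswith(prefix):
                break
            results.append(term)
    except StopIteration:
        pass
    return results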
def get_dicts(self, reader, tweets, num_docs):
    """
    Investigate the index by constructing a term dict (term, id)
    and an idf dict (term, idf_val).
    """
    terms_dict = {}
    idf_dict = {}
    terms_ctr = 0
    # iterate over each term in the index
    term_enum = MultiFields.getTerms(reader, "contents").iterator(None)
    for bytes_ref in util.BytesRefIterator.cast_(term_enum):
        s = bytes_ref.utf8ToString()
        terms_dict[s] = terms_ctr
        terms_ctr += 1
        # count occurrences of this term in the index and calculate idf
        doc_presence_ctr = 0
        for tweet in tweets:
            if s in tweet[1].text.lower():
                doc_presence_ctr += 1
        idf_dict[s] = log(float(num_docs) / doc_presence_ctr, 10)
    return terms_dict, idf_dict
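One caveat in the snippet above: the substring test s in tweet[1].text.lower() can overcount (a term "art" matches "start") and may divide by zero when an analyzed term no longer appears verbatim in any tweet. The index already stores document frequencies, so a hedged alternative sketch, under the same Lucene 4.x API assumptions (the function name get_dicts_from_index is introduced here for illustration):

from math import log
from org.apache.lucene.index import MultiFields
from org.apache.lucene.util import BytesRefIterator

def get_dicts_from_index(reader, num_docs):
    # build (term -> id) and (term -> idf) maps straight from index statistics
    terms_dict, idf_dict = {}, {}
    term_enum = MultiFields.getTerms(reader, "contents").iterator(None)
    for i, bytes_ref in enumerate(BytesRefIterator.cast_(term_enum)):
        s = bytes_ref.utf8ToString()
        terms_dict[s] = i
        # docFreq() of the enum's current term replaces the manual scan
        idf_dict[s] = log(float(num_docs) / term_enum.docFreq(), 10)
    return terms_dict, idf_dict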
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # Terms.size() returns -1 when the size is unavailable; in that case
    # we have to count the terms manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    reader.close()
    return duration, vocab_size
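A small driver sketch for evaluate_index; the corpus and index paths and the analyzer choices are illustrative assumptions (the no-argument analyzer constructors require Lucene >= 4.10).

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer

# hypothetical locations; point these at your corpus and index
for analyzer in (StandardAnalyzer(), WhitespaceAnalyzer()):
    duration, vocab_size = evaluate_index('data/', 'index/', analyzer)
    print('%s: %.2f s, %d unique terms' %
          (type(analyzer).__name__, duration, vocab_size))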
from org.apache.lucene.index import MultiFields
from org.apache.lucene.util import BytesRef, BytesRefIterator
import timeit

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    start_time = timeit.default_timer()
    try:
        index = IndexFiles(os.path.dirname(sys.argv[0]), StandardAnalyzer())
        index_reader = DirectoryReader.open(index.store)

        # get vocab size
        terms = MultiFields.getTerms(index_reader, 'contents')
        termEnum = terms.iterator()
        vocabCounter = 0
        for term in BytesRefIterator.cast_(termEnum):
            vocabCounter += 1
        print("Number of docs:", index_reader.numDocs())
        print("Vocab size:", vocabCounter)

        # print min, max, mean
        querystr = 'بازار بزرگ تهران'  # Persian for "Grand Bazaar of Tehran"
        print("Query: ", querystr)
        q = QueryParser("contents", index.analyzer).parse(querystr)
        hitsPerPage = 20
        searcher = IndexSearcher(index_reader)
        docs = searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs
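The fragment above is truncated before the hits are consumed. A typical continuation, hedged since this fragment does not show which fields the documents store, would walk the score docs and fetch each one:

        # continuation inside the same try-block (assumed, not from the original)
        for hit in hits:
            hitDoc = searcher.doc(hit.doc)  # stored fields, if any were indexed
            print('doc id %d, score %.3f' % (hit.doc, hit.score))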