def evaluate_index(index_dir, context, analyzer):
    # evaluate indexing time (overall); we should also measure the
    # elapsed time of each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # size() returns -1 when the codec does not track it; count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    # print str(vocab_size)                        # size of vocabulary
    # print str(vocabulary.getDocCount())          # docs with at least one term in the title field
    # print str(vocabulary.getSumTotalTermFreq())  # number of tokens
    # print str(vocabulary.getSumDocFreq())        # number of postings
    reader.close()
    return duration, vocab_size
def get_doc_termvector(self, lucene_doc_id, field):
    """Outputs the document term vector as a generator."""
    terms = self.reader.getTermVector(lucene_doc_id, field)
    if terms:
        termenum = terms.iterator(None)
        for bytesref in BytesRefIterator.cast_(termenum):
            yield bytesref.utf8ToString(), termenum
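# A minimal usage sketch, not from the original source: collapse the generator
# above into a {term: within-document frequency} dict. Assumes the field was
# indexed with term vectors enabled and that `self.reader` is open.
def get_doc_termfreqs(self, lucene_doc_id, field):
    termfreqs = {}
    for term, termenum in self.get_doc_termvector(lucene_doc_id, field):
        # totalTermFreq() on a single-document term vector is the in-doc count
        termfreqs[term] = int(termenum.totalTermFreq())
    return termfreqs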
def getTermVectors(route):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    ls = []
    for doc in range(ireader.numDocs()):
        vector = FreqVector()
        vector.vector = []
        vector.freqs = []
        norm = 0.0
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                t = Term("content", term)
                # float() avoids Python 2 integer division truncating the ratio
                idf = math.log(float(ireader.numDocs()) / ireader.docFreq(t))
                vector.vector.append(text)
                vector.freqs.append(tf * idf)
                norm += (tf * idf) * (tf * idf)
            ls.append((vector, math.sqrt(norm)))
        else:
            ls.append((vector, 0))
    return ls
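# A hedged sketch, not from the original source: cosine similarity between two
# of the (FreqVector, norm) pairs returned by getTermVectors above.
def cosineSimilarity(entry_a, entry_b):
    (vec_a, norm_a), (vec_b, norm_b) = entry_a, entry_b
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # map terms of b to their tf-idf weights for O(1) lookup
    weights_b = dict(zip(vec_b.vector, vec_b.freqs))
    dot = sum(w * weights_b.get(t, 0.0)
              for t, w in zip(vec_a.vector, vec_a.freqs))
    return dot / (norm_a * norm_b)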
def getMostFrequentTermNoStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def read_vectors():
    for doc in range(0, reader.numDocs()):
        for fieldName in FIELD_NAMES:
            terms = reader.getTermVector(doc, fieldName)
            if terms:
                termsEnum = terms.iterator(None)
                vectors[fieldName][doc] = set(
                    term.utf8ToString()
                    for term in BytesRefIterator.cast_(termsEnum))
def get_coll_termvector(self, field):
    """Returns the collection term vector for the given field."""
    self.open_reader()
    fields = MultiFields.getFields(self.reader)
    if fields is not None:
        terms = fields.terms(field)
        if terms:
            termenum = terms.iterator(None)
            for bytesref in BytesRefIterator.cast_(termenum):
                yield bytesref.utf8ToString(), termenum
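# A minimal sketch, not from the original source: turn the collection term
# vector above into a collection language model by normalizing each term's
# totalTermFreq() by the field's token count.
def get_coll_termprobs(self, field):
    probs = {}
    # getSumTotalTermFreq() can be -1 for some codecs; assumed valid here
    total = float(MultiFields.getTerms(self.reader, field).getSumTotalTermFreq())
    for term, termenum in self.get_coll_termvector(field):
        probs[term] = termenum.totalTermFreq() / total
    return probs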
def test_bug1842(self):
    reader = self.getReader()
    searcher = self.getSearcher()
    q = TermQuery(Term("id", '1'))
    topDocs = searcher.search(q, 50)
    termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
    terms = []
    freqs = []
    termsEnum = termvec.iterator()
    for term in BytesRefIterator.cast_(termsEnum):
        terms.append(term.utf8ToString())
        freqs.append(termsEnum.totalTermFreq())
    terms.sort()
    self.assert_(terms == ['blah', 'gesundheit'])
    self.assert_(freqs == [3, 1])
def test_FieldEnumeration(self):
    self.test_indexDocument()
    store = self.openStore()
    writer = None
    try:
        analyzer = self.getAnalyzer()
        writer = self.getWriter(store, analyzer, False)

        doc = Document()
        doc.add(Field("title", "value of testing", TextField.TYPE_STORED))
        doc.add(Field("docid", str(2), StringField.TYPE_NOT_STORED))
        doc.add(Field("owner", "unittester", StringField.TYPE_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("owner", "unittester", StringField.TYPE_NOT_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)
    finally:
        self.closeStore(store, writer)

    store = self.openStore()
    reader = None
    try:
        reader = DirectoryReader.open(store)
        term_enum = MultiFields.getTerms(reader, "docid").iterator()
        docids = [term.utf8ToString()
                  for term in BytesRefIterator.cast_(term_enum)]
        self.assertEqual(len(docids), 2)
    finally:
        self.closeStore(store, reader)
def getFreqVectorFromText(self, text):
    # Initialization of the Java Virtual Machine with Lucene
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = "res/index"
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(indexDir))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    rebuild = True
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    doc = Document()
    doc.add(Field("docName", 'url', Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("content", text, Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.YES))
    iwriter.addDocument(doc)
    iwriter.close()

    ireader = IndexReader.open(directory)
    freqVector = []
    docVector = ireader.getTermVector(0, "content")
    termsEnum = docVector.iterator(None)
    for term in BytesRefIterator.cast_(termsEnum):
        text = term.utf8ToString()
        freq = termsEnum.totalTermFreq()
        freqVector.append((text, freq))
    freqVector = sorted(freqVector, key=itemgetter(1), reverse=True)
    self.vector = list()
    self.freqs = list()
    for el in freqVector:
        self.vector.append(el[0])
        self.freqs.append(el[1])
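# Hedged usage, assuming this method lives on the same FreqVector class used
# in getTermVectors above (an assumption, not confirmed by the source):
fv = FreqVector()
fv.getFreqVectorFromText("raw text whose term frequencies we want")
print zip(fv.vector, fv.freqs)[:10]  # ten most frequent terms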
def get_doc_termfreqs_all_fields(self, lucene_doc_id):
    """
    Returns term frequencies for all fields in the given document.

    :param lucene_doc_id: Lucene document ID
    :return: dictionary {field: {term: freq, ...}, ...}
    """
    doc_termfreqs = {}
    vectors = self.reader.getTermVectors(lucene_doc_id)
    if vectors:
        for field in vectors.iterator():
            doc_termfreqs[field] = {}
            terms = vectors.terms(field)
            if terms:
                termenum = terms.iterator(None)
                for bytesref in BytesRefIterator.cast_(termenum):
                    doc_termfreqs[field][bytesref.utf8ToString()] = \
                        int(termenum.totalTermFreq())
            print doc_termfreqs[field]
    return doc_termfreqs
def termsForField(self, field, prefix=None, limit=10, **kwargs):
    convert = lambda term: term.utf8ToString()
    terms = []
    termsEnum = MultiFields.getTerms(
        self._indexAndTaxonomy.searcher.getIndexReader(), field)
    if termsEnum is None:
        return terms
    iterator = termsEnum.iterator(None)
    if prefix:
        iterator.seekCeil(BytesRef(prefix))
        terms.append((iterator.docFreq(), convert(iterator.term())))
    bytesIterator = BytesRefIterator.cast_(iterator)
    try:
        while len(terms) < limit:
            term = convert(bytesIterator.next())
            if prefix and not term.startswith(prefix):
                break
            terms.append((iterator.docFreq(), term))
    except StopIteration:
        pass
    return terms
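# A hedged usage sketch (`index` and the field name are assumed): list up to
# ten 'title' terms starting with "lu", each paired with its document frequency.
for docFreq, term in index.termsForField('title', prefix='lu', limit=10):
    print docFreq, term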
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different analyzer
    configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # sometimes .size() doesn't return the correct size, in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    reader.close()
    return duration, vocab_size
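# A minimal driver sketch, not from the original source: `Indexer`, the
# directory paths, and the analyzer choices are assumptions.
if __name__ == '__main__':
    for name, analyzer in [
            ('standard', StandardAnalyzer(Version.LUCENE_CURRENT)),
            ('whitespace', WhitespaceAnalyzer(Version.LUCENE_CURRENT))]:
        duration, vocab_size = evaluate_index('data/', 'index_' + name, analyzer)
        print '%s: %.2fs, %d terms' % (name, duration, vocab_size)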
# Assumed setup for this excerpt (ft, iwriter, and directory are otherwise
# undefined); a sketch, not the original code:
# directory = RAMDirectory()
# iwriter = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
# ft = FieldType(TextField.TYPE_STORED)
# ft.setStoreTermVectors(True)
# ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)
iwriter.commit()
iwriter.close()

ireader = DirectoryReader.open(directory)
for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator()
    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.postings(None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()
        print 'term:', term.utf8ToString()
        print '  freq:', freq
        for i in xrange(freq):
            print "  pos:", dpEnum.nextPosition()
            print "  off: %i-%i" % (dpEnum.startOffset(), dpEnum.endOffset())
        print
def search_query_with_relevance_feedback(self, query, feedback_qrels,
                                         num_returns=50, add_num=1):
    query_text = query["description"]
    print(query_text)
    query_text = " ".join(tokenizer.tokenize(query_text))
    query_text = self.remove_stopwords(query_text.lower())
    print(query_text)
    query_number = query["Number"]
    qrel_doc_ids = [qrel["docno"] for qrel in feedback_qrels
                    if qrel["qid"] == query_number]
    final_list = []
    term_tf_idf = {}
    doc_count = len(qrel_doc_ids)
    for qrel_doc_id in qrel_doc_ids:
        initial_hit = self.feedback_searcher.search(
            TermQuery(Term(".U", qrel_doc_id)), 1).scoreDocs
        if len(initial_hit) == 0:
            continue
        assert len(initial_hit) == 1
        termVector = self.reader.getTermVector(initial_hit[0].doc, "text")
        terms_enum = termVector.iterator()
        termsref = BytesRefIterator.cast_(terms_enum)
        N_terms = 0
        term_idf = {}
        term_freq = {}
        term_list = []
        while termsref.next():
            termval = TermsEnum.cast_(termsref)
            termText = termval.term().utf8ToString()
            if termText in self.stopwords:
                continue
            tc = termval.totalTermFreq()
            if termText in term_freq:
                term_freq[termText] += tc
            else:
                term_freq[termText] = tc
            if termText in term_idf:
                term_idf[termText] += 1
            else:
                term_idf[termText] = 1
            if termText not in term_list:
                term_list.append(termText)
            N_terms = N_terms + 1
        for term in term_list:
            if term in term_tf_idf:
                term_tf_idf[term] += term_freq[term] / N_terms * (
                    1 + math.log(doc_count / (term_idf[term] + 1)))
            else:
                term_tf_idf[term] = term_freq[term] / N_terms * (
                    1 + math.log(doc_count / (term_idf[term] + 1)))
    sorted_tf_idf = sorted(term_tf_idf.items(), key=lambda x: x[1],
                           reverse=True)
    for each in sorted_tf_idf:
        if (each[0] not in self.stopwords
                and not str(each[0]).isnumeric()
                and each[0] not in query_text.split(" ")):
            final_list.append(each[0])
    print(final_list[:add_num])
    query_text = query_text + " " + " ".join(final_list[:add_num])
    query_text = " ".join(query_text.split(" "))
    print(query_text)
    query_search = self.parser.parse(query_text)
    results = self.searcher.search(query_search, num_returns)
    hits = results.scoreDocs
    trec_results = []
    for rank, hit in enumerate(hits):
        doc = self.searcher.doc(hit.doc)
        trec_result = {
            "QueryID": query["Number"],
            "Q0": "Q0",
            "DocID": doc.get(".U"),
            "Rank": str(rank + 1),
            "Score": str(hit.score),
            "RunID": self.similarity,
        }
        trec_results.append(trec_result)
    return trec_results
# ft, iwriter, and directory: same assumed setup as in the previous snippet;
# this variant uses the older docsAndPositions/iterator(None) API.
ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be indexed"
]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)
iwriter.commit()
iwriter.close()

ireader = DirectoryReader.open(directory)
for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator(None)
    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.docsAndPositions(None, None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()
        print 'term:', term.utf8ToString()
        print '  freq:', freq
        for i in xrange(freq):
            print "  pos:", dpEnum.nextPosition()
            print "  off: %i-%i" % (dpEnum.startOffset(), dpEnum.endOffset())
        print
def getMostFrequentTermStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # English and Spanish stopwords ("los" appeared twice in the original list)
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "no", "not", "more", "http", "html", "of", "on", "or",
        "such", "that", "the", "their", "then", "there", "these", "they",
        "this", "to", "was", "will", "with", "el", "la", "lo", "los",
        "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString().encode("UTF-8")
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm