def build_index():
    lucene.initVM()
    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def __init__(self, root, store_dir):
    if not os.path.exists(store_dir):
        os.mkdir(store_dir, 0777)

    # NOTE: Hardcoded the analyzer instead of passing it
    lucene.initVM()
    '''
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    '''
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

    # Set the permissions to 777 for the index directory and the write.lock file
    chmod_indexdir_cmd = "chmod 0777 " + store_dir
    writelock_file = store_dir + "/" + "write.lock"
    chmod_writelock_cmd = "chmod 0777 " + writelock_file

    if os.path.exists(store_dir):
        cicmd = os.popen("sudo -S %s" % (chmod_indexdir_cmd), 'w').write('vagrant')

    if os.path.exists(writelock_file):
        cwcmd = os.popen("sudo -S %s" % (chmod_writelock_cmd), 'w').write('vagrant')

    # setting CREATE will rewrite over the existing indexes.
    ###config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    writer = IndexWriter(store, config)
    self.indexDocs(root, writer)
    writer.close()
def __init__(self, root, storeDir, analyzer):
    # Create the index dir if it does not exist
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    # the SimpleFSDirectory which the index will be written in
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # create an index writer
    # attach the index dir and config info to it
    writer = IndexWriter(store, config)
    # call the indexing procedure
    # indexing all the files in the directory specified by root
    # write the index with writer
    self.indexDocs(root, writer)
    # start a ticker
    ticker = Ticker()
    print 'commit index'
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    # stop the ticker when the indexing procedure completes
    ticker.tick = False
    print 'Done'
def index(indexdir):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    f = open('data/docid.documento-xml.txt')
    st = PorterStemmer()
    for i, line in enumerate(f.readlines()):
        id, xmltext = line.split('\t')
        xmltext = xmltext.rstrip('\n')
        xmldoc = minidom.parseString(xmltext)
        title = xmldoc.getElementsByTagName("TITLE")
        title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
        authors = xmldoc.getElementsByTagName("AUTHORS")
        authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
        abstract = xmldoc.getElementsByTagName("ABSTRACT")
        abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        print "indexed %s docs" % (i + 1)
    writer.close()
def reindex(self):
    writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                         self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
    writer.optimize()
    writer.close()
    self.parent.write({'message': "Reindex successful. Corpus analyzer is now set to %s." % (self.corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
def _getLucene(self, path):
    directory = FSDirectory.open(Paths.get(path))
    config = IndexWriterConfig(None)
    config.setRAMBufferSizeMB(256.0)  # faster
    config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)
    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def getLucene(path):
    directory = FSDirectory.open(File(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(Version.LATEST, analyzer)
    mergePolicy = config.getMergePolicy()
    sortingMergePolicy = SortingMergePolicy(mergePolicy, Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    config.setMergePolicy(sortingMergePolicy)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
def make_index(indexed_data, index_destination, source='directory'):
    # index wiki articles based on ck 12 topics
    # analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_destination)), indexWriterConfig)
    if source == 'directory':
        indexDirectory(indexed_data, writer)
    else:
        indexDictionary(indexed_data, writer)
    writer.close()
class Indexer(object):

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param output: The output directory of the underlying index
        :param analyzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(root, writer)
    writer.commit()
    writer.close()
def import_csv_with_content(self, csv_file, content_field):
    try:
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             self.analyzer, False,
                             IndexWriter.MaxFieldLength.LIMITED)
        changed_rows = addmetadata.add_metadata_and_content_from_csv(self.searcher, self.reader, writer, csv_file, content_field, self.args_dir)
        writer.close()
    except UnicodeDecodeError:
        try:
            writer.close()
        except:
            pass
        self.parent.write({'error': 'CSV import failed: file contained non-unicode characters. Please save the file with UTF-8 encoding and try again!'})
        return
    self.parent.write({'message': "CSV import complete: %s rows added." % (changed_rows,)})
def __init__(self, indexPath):
    """Instantiate the handler object."""
    self.indexPath = indexPath
    self.analyzer = StopAnalyzer()

    # Make sure the path exists
    if not os.path.exists(self.indexPath):
        os.mkdir(self.indexPath)

    if not os.path.exists(os.path.join(self.indexPath, 'segments.gen')):
        log('Creating new index.')
        writer = IndexWriter(self.indexPath, self.analyzer, 1)
        writer.close()
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def lucene_index(texts):
    """
    :param corpus_file_path:
    :param f_type:
    :return:
    """
    index = set_lucene_index['ind']  # nonlocal variable index
    config = IndexWriterConfig(version, analyzer)
    writer = IndexWriter(index, config)
    for t in texts:
        addDoc(writer, t)
    writer.close()
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."

    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n

    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    if not all([isinstance(d, Document) for d in documents]):
        raise TypeError("documents should be iterable of type Document! Given: %s" % type(documents[0]))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)), writer_config)
    for doc in documents:
        writer.addDocument(doc)
    writer.close()
def __init__(self, path, settings):
    self._settings = settings
    self._multithreaded = settings.multithreaded
    self._checker = DirectSpellChecker()
    indexDirectory = MMapDirectory(File(join(path, 'index')))
    indexDirectory.setUseUnmap(False)
    taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
    taxoDirectory.setUseUnmap(False)
    conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
    conf.setSimilarity(settings.similarity)
    mergePolicy = TieredMergePolicy()
    mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
    mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
    conf.setMergePolicy(mergePolicy)

    if not settings.readonly:
        self._indexWriter = IndexWriter(indexDirectory, conf)
        self._indexWriter.commit()
        self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
        self._taxoWriter.commit()

    self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
    self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
    self._facetsConfig = settings.fieldRegistry.facetsConfig
    self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1000)  # 1048576
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(root, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def rollback(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)
    writer.rollback()
    writer.close()
def __init__(self, writerConfig, indexDir):
    lucene.initVM()
    self.mIndexDir = SimpleFSDirectory(File(indexDir))
    self.mConfig = writerConfig
    self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)
def open_writer(self):
    """Open IndexWriter."""
    if self.writer is None:
        config = IndexWriterConfig(self.get_version(), self.get_analyzer())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.dir, config)
    else:
        raise Exception("IndexWriter is already open")
def buildIndex(self, inputFile):
    analyzer = self.getAnalyzer()
    iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(SimpleFSDirectory(File(self.luceneDir)), iwconf)

    # read through input file and write out to lucene
    counter = 0
    linesReadCounter = 0

    with open(inputFile, 'r') as lines:
        linesRead = 0

        for line in lines:
            try:
                linesRead += 1
                if linesRead % 1000 == 0:
                    print "%d lines read" % linesRead

                cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t")
                concept = concept.strip()
                cui = cui.strip()

                strNorm = self.normalizeCasePunct(concept)
                strSorted = self.sortWords(strNorm)
                strStemmed = self.stemWords(strNorm)
                strStemmedSorted = self.stemWords(strSorted)

                fdoc = Document()
                counter += 1
                fid = counter

                fdoc.add(Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED))
                fdoc.add(Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED))
                writer.addDocument(fdoc)
                if fid % 1000 == 0:
                    writer.commit()
            except:
                # the original left a bare string expression here, which did nothing;
                # actually report the skipped line
                print "Skipping line: %s" % line

    writer.commit()
    writer.close()
def __init__(self, destination_directory, analyzer):
    if not os.path.exists(destination_directory):
        os.mkdir(destination_directory)
    store = SimpleFSDirectory(File(destination_directory))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.tweetIndexer(writer)
    ticker = Ticker()
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def __init__(self, store_dir):
    self.store_dir = store_dir
    if not os.path.exists(store_dir):
        os.mkdir(store_dir, 0777)
    self.store = SimpleFSDirectory(Paths.get(store_dir))
    self.analyzer = StandardAnalyzer()
    self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
    self.config = IndexWriterConfig(self.analyzer)
    self.writer = IndexWriter(self.store, self.config)
def __init__(self, fileRoot, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    # Available similarity: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
    config.setSimilarity(similarities.BM25Similarity())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(fileRoot, writer)
    print 'commit index',
    writer.commit()
    writer.close()
    print 'done'
def createIndexWriter(self, actual_dir, max_field_length=20000000):
    """Returns an IndexWriter object created for the actual_dir specified"""
    ensureDirExists(actual_dir)
    index = SimpleFSDirectory(File(actual_dir))
    analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

    writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
    similarity = FieldAgnosticSimilarity()

    writerConfig.setSimilarity(similarity)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    ## res = IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
    res = IndexWriter(index, writerConfig)
    res.deleteAll()
    return res
sheet1 = wb.sheet_by_index(1)

print('initializing Lucene VM')
lucene.initVM()
print('lucene version ', lucene.VERSION)

index_path = Paths.get('./lucene.index')
question_field = 'question'
answer_field = 'answer'

index_store = SimpleFSDirectory(index_path)
# analyzer = StandardAnalyzer()
analyzer = PersianAnalyzer()
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(index_store, config)

TokenizeFields = True

# Question field type
qft = FieldType()
# qft.setIndexed(True)  # todo
qft.setStored(True)
qft.setTokenized(TokenizeFields)
qft.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

# Answer field type
aft = FieldType()
# aft.setIndexed(False)  # todo
aft.setStored(True)
doc.add( Field("keywords", ' '.join((command, name, synopsis, description)), TextField.TYPE_NOT_STORED)) doc.add(Field("filename", os.path.abspath(path), StringField.TYPE_STORED)) writer.addDocument(doc) if __name__ == '__main__': if len(sys.argv) != 2: print("Usage: python manindex.py <index dir>") else: lucene.initVM(vmargs=['-Djava.awt.headless=true']) directory = SimpleFSDirectory(Paths.get(sys.argv[1])) analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, 10000) config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, config) manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep) for dir in manpath: print("Crawling", dir) for name in os.listdir(dir): path = os.path.join(dir, name) if os.path.isdir(path): indexDirectory(path) writer.commit() writer.close()
# indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
# writer = IndexWriter(SimpleFSDirectory(File("data/index/ck12_books_paragraphs")), indexWriterConfig)
# indexDictionary(docs, writer)
#
# writer.close()

dir_name = 'data/ck12_book/OEBPS'
docs = {}
html_paths = [os.path.join(dir_name, str(i + 1) + '.html') for i in range(124)]
#
for f_name in html_paths:
    docs.update(book_processing.get_h_all_text(open(f_name).read()))

for fname in os.listdir("data/allwiki"):
    content = open(os.path.join("data", "allwiki", fname)).read()
    i = 0
    for doc in content.split("\n"):
        if len(doc.strip()) > 0:
            docs["%s%d" % (fname, i)] = doc
            i += 1

# analyzer = StandardAnalyzer(Version.LUCENE_30)
analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
writer = IndexWriter(SimpleFSDirectory(File("data/index/combo4")), indexWriterConfig)
indexDictionary(docs, writer)
writer.close()
luceneIndexPath = '/home/tarun/PE/lucene/luceneIndexDirectoryNewCorpus60/'
corpus = '/home/tarun/PE/newCorpus60/'
trainingFilePath = '/home/tarun/PE/Dataset/training_set.tsv'

lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT)

# DIRECTORY
directory = SimpleFSDirectory(File(luceneIndexPath))

# INDEX WRITER
writerConfig = IndexWriterConfig(util.Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, writerConfig)
print writer.numDocs()

# INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
for fileName in os.listdir(corpus):
    # print fileName
    document = Document()
    article = os.path.join(corpus, fileName)
    content = open(article, 'r').read()
    document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)

print writer.numDocs()
writer.close()

# INDEX READER
reader = IndexReader.open(directory)
def __init__(self, indexDir):
    self.directory = SimpleFSDirectory(Paths.get(indexDir))
    self.analyzer = StandardAnalyzer()
    # analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
    self.config = IndexWriterConfig(self.analyzer)
    self.writer = IndexWriter(self.directory, self.config)
class Index(object): def __init__(self, path, settings): self._settings = settings self._multithreaded = settings.multithreaded self._checker = DirectSpellChecker() indexDirectory = MMapDirectory(File(join(path, 'index'))) indexDirectory.setUseUnmap(False) taxoDirectory = MMapDirectory(File(join(path, 'taxo'))) taxoDirectory.setUseUnmap(False) conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer) conf.setSimilarity(settings.similarity) mergePolicy = TieredMergePolicy() mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce) mergePolicy.setSegmentsPerTier(settings.segmentsPerTier) conf.setMergePolicy(mergePolicy) if not settings.readonly: self._indexWriter = IndexWriter(indexDirectory, conf) self._indexWriter.commit() self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize)) self._taxoWriter.commit() self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory) self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper self._facetsConfig = settings.fieldRegistry.facetsConfig self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader()) def addDocument(self, term, document): document = self._facetsConfig.build(self._taxoWriter, document) self._indexWriter.updateDocument(term, document) def deleteDocument(self, term): self._indexWriter.deleteDocuments(term) def search(self, query, filter, collector): self._indexAndTaxonomy.searcher.search(query, filter, collector) def suggest(self, query, count, field): suggestions = {} for token, startOffset, endOffset in self._analyzeToken(query): suggestWords = self._checker.suggestSimilar(Term(field, token), count, self._indexAndTaxonomy.searcher.getIndexReader()) if suggestWords: suggestions[token] = (startOffset, endOffset, [suggestWord.string for suggestWord in suggestWords]) return suggestions def termsForField(self, field, prefix=None, limit=10, **kwargs): convert = lambda term: term.utf8ToString() terms = [] termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field) if termsEnum is None: return terms iterator = termsEnum.iterator(None) if prefix: iterator.seekCeil(BytesRef(prefix)) terms.append((iterator.docFreq(), convert(iterator.term()))) bytesIterator = BytesRefIterator.cast_(iterator) try: while len(terms) < limit: term = convert(bytesIterator.next()) if prefix and not term.startswith(prefix): break terms.append((iterator.docFreq(), term)) except StopIteration: pass return terms def fieldnames(self): indexAndTaxonomy = self._indexAndTaxonomy fieldnames = [] fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader()) if fields is None: return fieldnames iterator = fields.iterator() while iterator.hasNext(): fieldnames.append(iterator.next()) return fieldnames def drilldownFieldnames(self, path=None, limit=50): taxoReader = self._indexAndTaxonomy.taxoReader parentOrdinal = TaxonomyReader.ROOT_ORDINAL if path is None else taxoReader.getOrdinal(path[0], path[1:]) childrenIter = taxoReader.getChildren(parentOrdinal) names = [] while True: ordinal = childrenIter.next() if ordinal == TaxonomyReader.INVALID_ORDINAL: break names.append(taxoReader.getPath(ordinal).components[-1]) if len(names) >= limit: break return names def numDocs(self): return self._indexAndTaxonomy.searcher.getIndexReader().numDocs() def commit(self): if not self._settings.readonly: self._taxoWriter.commit() self._indexWriter.commit() self._indexAndTaxonomy.reopen() def 
getDocument(self, docId): return self._indexAndTaxonomy.searcher.doc(docId) def createFacetCollector(self): if not self._multithreaded: return FacetsCollector() return FacetSuperCollector(self._indexAndTaxonomy.taxoReader, self._facetsConfig, self._ordinalsReader) def facetResult(self, facetCollector): facetResult = TaxonomyFacetCounts(self._ordinalsReader, self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector) return Facets.cast_(facetResult) def close(self): self._indexAndTaxonomy.close() if not self._settings.readonly: self._taxoWriter.close() self._indexWriter.close() def _analyzeToken(self, token): result = [] reader = StringReader(unicode(token)) stda = self._settings.analyzer ts = stda.tokenStream("dummy field name", reader) termAtt = ts.addAttribute(CharTermAttribute.class_) offsetAtt = ts.addAttribute(OffsetAttribute.class_) try: ts.reset() while ts.incrementToken(): result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset())) ts.end() finally: ts.close() return result
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# indexing directory
indexDir = FSDirectory.open(File("lucene_index.Index"))

# input which will be indexed with Lucene
title1 = "text of title1"
title2 = "title2"
abstract1 = "abstract1 has many words, e.g. hellow world can be the text"
abstract2 = "text of abstract2"

# configure indexing
config = IndexWriterConfig(Version.LUCENE_CURRENT, WhitespaceAnalyzer(Version.LUCENE_CURRENT))
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iw = IndexWriter(indexDir, config)

# count number of documents processed
nDocsAdded = 0

# create first document
doc = Document()
doc.add(TextField("Title", title1, Field.Store.YES))
doc.add(TextField("Abstract", abstract1, Field.Store.YES))
iw.addDocument(doc)
nDocsAdded += 1

# create second document
doc = Document()
doc.add(TextField("Title", title2, Field.Store.YES))
doc.add(TextField("Abstract", abstract2, Field.Store.YES))
class LuceneSearch(): def __init__(self): self.env = lucene.initVM(initialheap='28g', maxheap='28g', vmargs=['-Djava.awt.headless=true']) self.vocab = None BooleanQuery.setMaxClauseCount(2048) if not os.path.exists(prm.index_folder): print 'Creating index at', prm.index_folder if prm.docs_path == prm.docs_path_term: add_terms = True else: add_terms = False self.create_index(prm.index_folder, prm.docs_path, add_terms) if prm.local_index_folder: print 'copying index from', prm.index_folder, 'to', prm.local_index_folder if os.path.exists(prm.local_index_folder): print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.' else: shutil.copytree(prm.index_folder, prm.local_index_folder) self.index_folder = prm.local_index_folder else: self.index_folder = prm.index_folder fsDir = MMapDirectory(Paths.get(prm.index_folder)) self.searcher = IndexSearcher(DirectoryReader.open(fsDir)) if prm.docs_path != prm.docs_path_term: if not os.path.exists(prm.index_folder_term): print 'Creating index at', prm.index_folder_term self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True) if prm.local_index_folder_term: print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term if os.path.exists(prm.local_index_folder_term): print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.' else: shutil.copytree(prm.index_folder_term, prm.local_index_folder_term) self.index_folder_term = prm.local_index_folder_term else: self.index_folder_term = prm.index_folder_term fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term)) self.searcher_term = IndexSearcher( DirectoryReader.open(fsDir_term)) self.analyzer = StandardAnalyzer() self.pool = ThreadPool(processes=prm.n_threads) self.cache = {} print 'Loading Title-ID mapping...' self.title_id_map, self.id_title_map = self.get_title_id_map() if prm.idf_path: print 'Loading IDF dictionary...' self.idf = pkl.load(open(prm.idf_path)) def get_title_id_map(self): # get number of docs n_docs = self.searcher.getIndexReader().numDocs() title_id = {} id_title = {} query = MatchAllDocsQuery() hits = self.searcher.search(query, n_docs) for hit in hits.scoreDocs: doc = self.searcher.doc(hit.doc) idd = int(doc['id']) title = doc['title'] title_id[title] = idd id_title[idd] = title return title_id, id_title def add_idf(self, txt): txt = utils.clean(txt) txt = txt.lower() df = set() for word in wordpunct_tokenize(txt): if word not in df: df.add(word) self.idf[word] += 1. def add_doc(self, doc_id, title, txt, add_terms): doc = Document() txt = utils.clean(txt) if add_terms: if prm.top_tfidf > 0: words_idx = [] words, _ = utils.top_tfidf(txt.lower(), self.idf, prm.top_tfidf, prm.min_term_freq) if len(words) == 0: words.append('unk') for w in words: if w in self.vocab: words_idx.append(self.vocab[w]) else: words_idx.append(-1) # unknown words. else: txt_ = txt.lower() words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc) words_idx = words_idx[0] words = words[0] doc.add(Field("id", str(doc_id), self.t1)) doc.add(Field("title", title, self.t1)) doc.add(Field("text", txt, self.t2)) if add_terms: doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3)) doc.add(Field("word", '<&>'.join(words), self.t3)) self.writer.addDocument(doc) def create_index(self, index_folder, docs_path, add_terms=False): print 'Loading Vocab...' 
if not self.vocab: self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words) os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) if add_terms: if prm.top_tfidf > 0 or prm.idf_path: print 'Creating IDF dictionary...' self.idf = defaultdict(int) doc_id = 0 if docs_path.lower().endswith('.hdf5'): import corpus_hdf5 corpus = corpus_hdf5.CorpusHDF5(docs_path) for txt in corpus.get_text_iter(): self.add_idf(txt) if doc_id % 1000 == 0: print 'Creating IDF, doc', doc_id doc_id += 1 else: # ClueWeb09 import warc import gzip from bs4 import BeautifulSoup # list all files in the folder. paths = [] for root, directories, filenames in os.walk(docs_path): for filename in filenames: paths.append(os.path.join(root, filename)) for path in paths: with gzip.open(path, mode='rb') as gzf: for record in warc.WARCFile(fileobj=gzf): # remove html tags txt = BeautifulSoup( record.payload[:1000 * 1000], "lxml").get_text() # remove WARC headers. txt = '\n'.join(txt.split('\n')[10:]) self.add_idf(txt) if doc_id % 1000 == 0: print 'Creating IDF, doc', doc_id doc_id += 1 for key, val in self.idf.items(): self.idf[key] = math.log(float(doc_id) / val) pkl.dump(self.idf, open(prm.idf_path, 'wb')) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print "%d docs in index" % self.writer.numDocs() print "Indexing documents..." doc_id = 0 if docs_path.lower().endswith('.hdf5'): import corpus_hdf5 corpus = corpus_hdf5.CorpusHDF5(docs_path) for txt in corpus.get_text_iter(): title = corpus.get_article_title(doc_id) self.add_doc(doc_id, title, txt, add_terms) if doc_id % 1000 == 0: print 'indexing doc', doc_id doc_id += 1 else: # ClueWeb09 import warc import gzip from bs4 import BeautifulSoup # list all files in the folder. paths = [] for root, directories, filenames in os.walk(docs_path): for filename in filenames: paths.append(os.path.join(root, filename)) for path in paths: with gzip.open(path, mode='rb') as gzf: for record in warc.WARCFile(fileobj=gzf): if 'warc-trec-id' in record: title = record['warc-trec-id'] else: title = record['warc-record-id'] # remove html tags #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text() txt = record.payload[:1000 * 1000] # remove WARC headers. txt = '\n'.join(txt.split('\n')[10:]) self.add_doc(doc_id, title, txt, add_terms) if doc_id % 1000 == 0: print 'indexing doc', doc_id doc_id += 1 print "Index of %d docs..." % self.writer.numDocs() self.writer.close() def search_multithread(self, qs, max_cand, max_full_cand, searcher): self.max_cand = max_cand self.max_full_cand = max_full_cand self.curr_searcher = searcher out = self.pool.map(self.search_multithread_part, qs) return out def search_multithread_part(self, q): if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() if q in self.cache: return self.cache[q] else: try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print 'Unexpected error when processing query:', str(q) print 'Using query "dummy".' 
q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) c = OrderedDict() hits = self.curr_searcher.search(query, self.max_cand) for i, hit in enumerate(hits.scoreDocs): doc = self.curr_searcher.doc(hit.doc) if i < self.max_full_cand: word_idx = map(int, doc['word_idx'].split(' ')) word = doc['word'].split('<&>') else: word_idx = [] word = [] c[int(doc['id'])] = [word_idx, word] return c def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher): out = [] for q in qs: if q in self.cache: out.append(self.cache[q]) else: try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace( 'NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse( QueryParser.escape(q)) except: print 'Unexpected error when processing query:', str(q) print 'Using query "dummy".' query = QueryParser("text", self.analyzer).parse( QueryParser.escape('dummy')) c = OrderedDict() hits = curr_searcher.search(query, max_cand) for i, hit in enumerate(hits.scoreDocs): doc = curr_searcher.doc(hit.doc) if i < max_full_cand: word_idx = map(int, doc['word_idx'].split(' ')) word = doc['word'].split('<&>') else: word_idx = [] word = [] c[int(doc['id'])] = [word_idx, word] out.append(c) return out def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True): if not max_full_cand: max_full_cand = max_cand if prm.docs_path != prm.docs_path_term: max_cand2 = 0 else: max_cand2 = max_full_cand if prm.n_threads > 1: out = self.search_multithread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term) else: out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term) if (prm.docs_path != prm.docs_path_term) and extra_terms: for outt, termss in itertools.izip(out, terms): for cand_id, term in itertools.izip( outt.keys()[:max_full_cand], termss.values()): outt[cand_id] = term if save_cache: for q, c in itertools.izip(qs, out): if q not in self.cache: self.cache[q] = c return out
def testDelete(self, fieldName, searchString):
    config = IndexWriterConfig(Version.LUCENE_CURRENT, self.Analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.store, config)
    writer.deleteDocuments(Term(fieldName, searchString))
    writer.close()
class LuceneHelper:

    def __init__(self, index_dir):
        self.index_dir = index_dir
        self.indexDir = SimpleFSDirectory(File(self.index_dir).toPath())
        self.q_parser = QueryParser("", WhitespaceAnalyzer())
        self.commit_max = 500000
        self.__get_writer_searcher()

    def __get_writer_searcher(self):
        writerConfig = IndexWriterConfig()
        print(f"Codec : {writerConfig.getCodec()}")
        self.writer = IndexWriter(self.indexDir, writerConfig)
        self.reader = DirectoryReader.open(self.writer)
        self.searcher = IndexSearcher(self.reader)

    def __query(self, query_str, _max=10):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        hits = self.searcher.search(query_cmd, _max)
        print(f"Found {hits.totalHits} document(s) that matched query :'{query_cmd}'")
        return hits

    def __count_docs(self, query_str):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        total = self.searcher.count(query_cmd)
        print(f"Found {total} document(s) that matched query :'{query_cmd}'")
        return total

    def refresh_searcher(self):
        self.reader.close()
        self.reader = DirectoryReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)

    def index_stats(self):
        query_str = f"*:*"
        total_docs = self.__count_docs(query_str)
        if total_docs:
            print(f"There is at least total [{total_docs}] docs.")
        else:
            print("There is no index right now.")

    def delete_old_ttl(self):
        now_time = int(time.time())
        # check how many docs expired
        ttl_query = LongPoint.newRangeQuery("ttl", 0, now_time - 1)
        total_docs = self.searcher.count(ttl_query)
        print(f"At least found {total_docs} document(s) are expired.")
        # delete expired docs
        self.writer.deleteDocuments(ttl_query)
        self.writer.commit()

    def add_doc(self, item_data):
        item_id = item_data['item_id']
        ttl = item_data['ttl']
        version = item_data.get('version', 'default')
        view_similar = json.dumps(item_data.get('view_similar', {}))
        view_prospective = json.dumps(item_data.get('view_prospective', {}))
        doc = Document()
        _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
        doc.add(StringField("id", _id, Field.Store.NO))
        doc.add(LongPoint("ttl", ttl))
        doc.add(StringField("version", version, Field.Store.YES))
        doc.add(StringField("item_id", item_id, Field.Store.YES))
        doc.add(StoredField("view_similar", view_similar))
        doc.add(StoredField("view_prospective", view_prospective))
        self.writer.updateDocument(Term("id", _id), doc)

    def commit(self):
        self.writer.commit()

    def close(self):
        self.writer.commit()
        self.reader.close()
        self.writer.close()
output_path = sys.argv[2]

lucene.initVM()
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
analyzer_ws = WhitespaceAnalyzer(Version.LUCENE_4_10_1)

std_path = "%s/lucene_full_standard/" % (output_path)
ws_path = "%s/lucene_full_ws/" % (output_path)

# remove any existing index directories; os.remove (used originally) fails on
# directories, so shutil.rmtree is used here (assumes shutil is imported above)
if os.path.exists(std_path):
    shutil.rmtree(std_path)
if os.path.exists(ws_path):
    shutil.rmtree(ws_path)

indexDir1 = SimpleFSDirectory(File(std_path))
indexDir2 = SimpleFSDirectory(File(ws_path))
writerConfig1 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
writerConfig2 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer_ws)
writer1 = IndexWriter(indexDir1, writerConfig1)
writer2 = IndexWriter(indexDir2, writerConfig2)

print "%d docs in index1" % writer1.numDocs()
print "%d docs in index2" % writer2.numDocs()
print "Reading lines from sys.stdin..."

ftypes = open(LUCENE_TYPES_FILE, "w")

for n, l in enumerate(sys.stdin):
    doc = Document()
    doc_lc = Document()
    fields = l.rstrip().split("\t")
    all_ = []
    if n == 0:
        sys.stdout.write("TYPES_HEADER")
class Index: def __init__(self, folder=None, fields=[], similarity="tfidf"): self.jcc = lucene.initVM() if folder: self.directory = SimpleFSDirectory(File(folder)) else: self.directory = RAMDirectory() self.fields = {} for field in fields: ft = FieldType() for pname, pvalue in field.props.items(): setter = getattr(ft, "set" + pname.capitalize()) setter(pvalue) ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # ft.setOmitNorms(True) self.fields[field.name] = ft self.similarity = similarity.lower() self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.writer = None self.searcher = None def attach_thread(self): self.jcc.attachCurrentThread() def open_writer(self): config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.writer = IndexWriter(self.directory, config) def add(self, **doc): if not self.writer: self.open_writer() d = Document() for field, value in doc.items(): # try : d.add(Field(field, value, self.fields[field])) # except Exception, e : # print # print "Fudeu" # pass self.writer.addDocument(d) def commit(self): self.writer.commit() def close(self): if self.writer: self.writer.close() def open_searcher(self): self.reader = DirectoryReader.open(self.directory) self.searcher = IndexSearcher(self.reader) if (self.similarity == "bm25"): self.searcher.setSimilarity(BM25Similarity()) def preprocess_query(self, query, fields, mode="ANY"): ''' Fix query according to provided mode. If the value is not supported, the query remains unchanged ''' terms = query.lower().strip().split() if mode == "ANY": query = " OR ".join(terms) elif mode == "ALL": query = " AND ".join(terms) else: print "Invalid mode parameter '%s'." % mode query = QueryParser.escape(query) parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer) query = MultiFieldQueryParser.parse(parser, query) return query def search(self, query, search_fields, return_fields, filter=None, ignore=set(), mode="ANY", return_scores=False, limit=1000000): ''' Search documents in the index using a standard analyzer (tokenizes and removes top words). Supports two search modes: ANY and ALL ANY: include documents that contain at least one term of the query. ALL: include only documents that contain all terms of the query. ''' if not self.searcher: self.open_searcher() # Return empty results if query is empty (Lucene can't handle it nicely) if query.strip() == '': if return_scores: return [], [] else: return [] query = self.preprocess_query(query, search_fields, mode) # If limit is not provided, return all matched documents. A little hack is required # to do that. We query for one document and get the count total matched documents. 
# if not limit : # hits = self.searcher.search(query, 1) # limit = hits.totalHits # Fetch more than asked in case we have to remove entries from the ignore set if limit != None: limit += len(ignore) hits = self.searcher.search(query, filter, limit) hits = hits.scoreDocs docs = [] for hit in hits: doc = self.searcher.doc(hit.doc) if doc['id'] not in ignore: docs.append([doc[f] for f in return_fields]) if return_scores: scores = [hit.score for hit in hits] return docs[:limit], scores[:limit] return docs[:limit] def explain(self, query, fields, doc): if not self.searcher: self.open_searcher() query = QueryParser.escape(query) parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer) query = MultiFieldQueryParser.parse(parser, query) return self.searcher.explain(query, doc) def get_documents(self, doc_ids, fields): docs = [] for doc_id in doc_ids: doc = self.reader.document(doc_id) if isinstance(fields, basestring): docs.append(doc.get(fields)) else: docs.append({f: doc.get(f) for f in fields}) return docs def get_query_scores(self, query, fields, doc_ids, mode="ANY"): # Creates pre-filter to ignore all other documents filter = TermsFilter([Term("id", id) for id in doc_ids]) query = self.preprocess_query(query, fields, mode) hits = self.searcher.search(query, filter, len(doc_ids)).scoreDocs # Creates scores' mapping using entity id instead of internal index id scores = { str(self.reader.document(hit.doc).get("id")): hit.score for hit in hits } # Normalize to 0..1 interval # n = 1.0/sum(scores.values()) # scores # Adds to the mapping entries for the non-returned docs (no term found) for doc_id in doc_ids: if doc_id not in scores: scores[doc_id] = 0.0 return scores
def main(): if len(sys.argv) < 2: print('error: too few arguments') print('command: python create_category_corpus.py NUMBER_TOP_CATEGORY') quit() NUMBER_TOP_CATEGORY = int(sys.argv[1]) print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY)) print('loading category profiles') profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz') print('finish loading category profiles') system_flag = platform.system() cwd = os.getcwd() # initialize mongo client if system_flag == 'Windows': client = pymongo.MongoClient("localhost", 27017) else: client = pymongo.MongoClient("localhost", 58903) db = client.wiki2015 wiki_article_categories = db['article_categories'] category_corpus = {} pkl_filename = 'category_dbpedia_corpus_top%d_fsdm3.pkl.gz' % ( NUMBER_TOP_CATEGORY) if system_flag == 'Windows': lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3', 'BM25', True) else: lucene_dbpedia_fsdm = Lucene_Object( '%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True) cnt = 0 if os.path.exists(pkl_filename) == True: #if False==True: print('loading category corpus') category_corpus = load_zipped_pickle(pkl_filename) else: for item in wiki_article_categories.find(): list_category = item['categories'].strip().split('|') uri_article = item['uri'] title = findTitle(uri_article) entity_content_dict = {} doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex( title, 'title', False) if doc_entity is None: continue for f in [ 'names', 'attributes', 'categories', 'similar_entities', 'related_entities', 'catchall' ]: entity_content_dict[f] = doc_entity[f] entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' + f] if len(entity_content_dict['catchall'].strip()) == 0: continue for cat in list_category[:NUMBER_TOP_CATEGORY]: if ('<http://dbpedia.org/resource/Category:' + cat + '>') not in profile: continue if cat not in category_corpus: category_corpus[cat] = [] if len(category_corpus[cat]) < 300: category_corpus[cat].append(entity_content_dict) #cnt+=1 #if cnt>20: #break print('saving corpus to pkl.gz') save_zipped_pickle(category_corpus, pkl_filename) client.close() # begin write the data into index print('begin write into index') if system_flag == 'Windows': LUCENE_INDEX_DIR = 'mmapDirectory\\category_corpus_dbpedia201510_top' + str( NUMBER_TOP_CATEGORY) + '_fsdm3' else: LUCENE_INDEX_DIR = '%s/mmapDirectory/category_corpus_dbpedia201510_top' % ( cwd) + str(NUMBER_TOP_CATEGORY) + '_fsdm3' # backup code files cmd = 'robocopy %s %s\code_files *.py' % ( r'%cd%', LUCENE_INDEX_DIR ) if system_flag == 'Windows' else 'cp *.py %s/code_files' % ( LUCENE_INDEX_DIR) os.system(cmd) # specify index path index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR)) # configure search engine analyzer = SimpleAnalyzer() config = IndexWriterConfig(analyzer) # write data to index w = IndexWriter(index_mm, config) cnt = 0 data = {} max_article_num = 0 stemmer = SnowballStemmer('english') for cat, list_entity_dict in category_corpus.items(): cat_label = cleanSentence(cat, True) data.clear() data['category'] = (cat, 'StringField') data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT') data['stemmed_label'] = (stemSentence(cat_label, stemmer, True), 'CUSTOM_FIELD_TEXT') data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED') if data['num_articles'][0] > max_article_num: max_article_num = data['num_articles'][0] for f in [ 'names', 'attributes', 'categories', 'similar_entities', 'related_entities', 'catchall' ]: contents = cleanSentence( ' '.join([dic[f] for dic in list_entity_dict]), True, ' ') data[f] = (contents, 
'CUSTOM_FIELD_TEXT_NOT_STORED') data['stemmed_' + f] = (stemSentence(contents, stemmer, False), 'CUSTOM_FIELD_TEXT_NOT_STORED') #print ('--------------------') # need to calculate corpus average length addDoc(w, data) #cnt+=1 #if cnt>20: #break w.close() print('max article num=%d' % (max_article_num))
class Lucene(object): # default fieldnames for id and contents FIELDNAME_ID = "id" FIELDNAME_CONTENTS = "contents" # internal fieldtypes # used as Enum, the actual values don't matter FIELDTYPE_ID = "id" FIELDTYPE_ID_TV = "id_tv" FIELDTYPE_TEXT = "text" FIELDTYPE_TEXT_TV = "text_tv" FIELDTYPE_TEXT_TVP = "text_tvp" def __init__(self, index_dir, use_ram=False, jvm_ram=None): global lucene_vm_init if not lucene_vm_init: if jvm_ram: # e.g. jvm_ram = "8g" print "Increased JVM ram" lucene.initVM(vmargs=['-Djava.awt.headless=true'], maxheap=jvm_ram) else: lucene.initVM(vmargs=['-Djava.awt.headless=true']) lucene_vm_init = True self.dir = SimpleFSDirectory(Paths.get(index_dir)) self.use_ram = use_ram if use_ram: print "Using ram directory..." self.ram_dir = RAMDirectory(self.dir, IOContext.DEFAULT) self.analyzer = None self.reader = None self.searcher = None self.writer = None self.ldf = None print "Connected to index " + index_dir def get_version(self): """Get Lucene version.""" #return Version.LUCENE_48 return lucene.VERSION def get_analyzer(self): """Get analyzer.""" if self.analyzer is None: self.analyzer = StandardAnalyzer() #self.analyzer = SmartChineseAnalyzer() return self.analyzer def open_reader(self): """Open IndexReader.""" if self.reader is None: if self.use_ram: print "reading from ram directory(%s) ..."%self.ram_dir self.reader = DirectoryReader.open(self.ram_dir) else: self.reader = DirectoryReader.open(self.dir) def get_reader(self): return self.reader def close_reader(self): """Close IndexReader.""" if self.reader is not None: self.reader.close() self.reader = None else: raise Exception("There is no open IndexReader to close") def open_searcher(self): """ Open IndexSearcher. Automatically opens an IndexReader too, if it is not already open. There is no close method for the searcher. """ if self.searcher is None: self.open_reader() self.searcher = IndexSearcher(self.reader) def get_searcher(self): """Returns index searcher (opens it if needed).""" self.open_searcher() return self.searcher def open_writer(self): """Open IndexWriter.""" if self.writer is None: config = IndexWriterConfig(self.get_analyzer()) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.writer = IndexWriter(self.dir, config) else: raise Exception("IndexWriter is already open") def close_writer(self): """Close IndexWriter.""" if self.writer is not None: self.writer.close() self.writer = None else: raise Exception("There is no open IndexWriter to close") def add_document(self, contents): """ Adds a Lucene document with the specified contents to the index. See LuceneDocument.create_document() for the explanation of contents. 
""" if self.ldf is None: # create a single LuceneDocument object that will be reused self.ldf = LuceneDocument() self.writer.addDocument(self.ldf.create_document(contents)) def get_lucene_document_id(self, doc_id): """Loads a document from a Lucene index based on its id.""" self.open_searcher() query = TermQuery(Term(self.FIELDNAME_ID, doc_id)) tophit = self.searcher.search(query, 1).scoreDocs if len(tophit) == 1: return tophit[0].doc else: return None def get_document_id(self, lucene_doc_id): """Gets lucene document id and returns the document id.""" self.open_reader() return self.reader.document(lucene_doc_id).get(self.FIELDNAME_ID) def get_id_lookup_query(self, id, field=None): """Creates Lucene query for searching by (external) document id """ if field is None: field = self.FIELDNAME_ID return TermQuery(Term(field, id)) def get_and_query(self, queries): """Creates an AND Boolean query from multiple Lucene queries """ # empty boolean query with Similarity.coord() disabled bq_builder = BooleanQuery.Builder() for q in queries: bq_builder.add(q, BooleanClause.Occur.MUST) bq = bq_builder.build() return bq def get_or_query(self, queries): """Creates an OR Boolean query from multiple Lucene queries """ # empty boolean query with Similarity.coord() disabled bq_builder = BooleanQuery.Builder() for q in queries: bq_builder.add(q, BooleanClause.Occur.SHOULD) bq = bq_builder.build() return bq def get_phrase_query(self, query, field): """Creates phrase query for searching exact phrase.""" terms = [] trans_query = '' #process Chinese query for c in query: if ord(c) >=256: trans_query += '%s '%c else: trans_query += c for t in trans_query.split(): #term = Term(field, t) #terms.append(term) terms.append(t) #phq.add(Term(field, t)) phq = PhraseQuery(field, terms) return phq def num_docs(self): """Returns number of documents in the index.""" self.open_reader() return self.reader.numDocs()
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.util import BytesRef, BytesRefIterator, Version
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    directory = RAMDirectory()
    iconfig = IndexWriterConfig(
        Version.LUCENE_CURRENT,
        LimitTokenCountAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT), 100))
    iwriter = IndexWriter(directory, iconfig)

    ft = FieldType()
    ft.setIndexed(True)
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)

    ts = [
        "this bernhard is the text to be index text",
        "this claudia is the text to be indexed"
    ]
    for t in ts:
        doc = Document()
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command: python build_index_wikipedia.py FILENAME')
        quit()

    # create file object
    filename = sys.argv[1]
    print('processing ' + filename)
    cnt = 0
    stemmer = SnowballStemmer('english')

    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    LUCENE_INDEX_DIR = 'mmapDirectory\\index_wikipedia_2015'
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0)  # experimental setting !!

    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        cmd = 'robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR) if system_flag == 'Windows' else 'cp *.py %s\code_files' % (LUCENE_INDEX_DIR)
        os.system(cmd)

    w = IndexWriter(index_mm, config)
    data = {}
    with open(filename, 'r', encoding='utf-8') as src:
        for page_pair in extract_pages(src):
            label, content, page_id = page_pair[0], page_pair[1], page_pair[2]
            pair_tokens = process_article((content, False, label, page_id))
            content = remove_stopwords(' '.join(pair_tokens[0]), ' ')
            if len(content.split()) < 10:
                continue
            stemmed_content = stemSentence(content, stemmer, False)
            if DEBUG_MODE == True:
                try:
                    print('%s\n%s\n%s\n%s' % (label, page_id, content, stemmed_content))
                except:
                    print('encoding error')
            data.clear()
            data['label'] = (label, 'StringField')
            data['label_lower'] = (label.lower(), 'StringField')
            data['label_lower_text'] = (label.lower(), 'TextField')
            data['wiki_id'] = (page_id, 'StringField')
            #data['content'] = (content, 'TextField')
            data['stemmed_content'] = (stemmed_content, 'TextField')
            addDoc(w, data)
            cnt += 1
            #if cnt > 20:
            #    break
            if cnt % 5000 == 0:
                print('finish %d' % (cnt))
    w.close()
def create_document(file_name):
    path = './alldocs/' + file_name
    file = open(path)
    doc = Document()
    # use the file_name parameter; the original referenced the global loop
    # variable input_file, which only worked by accident
    doc.add(StringField("title", file_name, Field.Store.YES))
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()
    return doc


# Initialize lucene and the JVM
lucene.initVM()

directory = RAMDirectory()
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)

print "Number of indexed documents: %d\n" % writer.numDocs()
for input_file in listdir(INPUT_DIR):
    print "Current file:", input_file
    doc = create_document(input_file)
    writer.addDocument(doc)

print "\nNumber of indexed documents: %d" % writer.numDocs()
writer.close()
print "Indexing done!\n"
print "------------------------------------------------------"