def create_index(self, index_folder):
    os.mkdir(index_folder)
    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)
    self.t2 = FieldType()
    self.t2.setStored(True)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)
    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(
        MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
    writerConfig.setSimilarity(MyTFIDFSimilarity())
    writerConfig.setRAMBufferSizeMB(16384.0)  # 16g
    self.writer = IndexWriter(fsDir, writerConfig)
    logger.info(f"{self.writer.numDocs()} docs in index")
    logger.info("Indexing documents...")
    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        tokens = self.doc_db.get_doc_tokens(doc_id)
        self.add_doc(doc_id, text, tokens)
    logger.info(f"Indexed {self.writer.numDocs()} docs.")
    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
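# A minimal sketch of the add_doc helper called above, which is not shown in
# the original. The field names and the mapping of t1/t2/t3 to title, text,
# and tokens are assumptions for illustration only.
def add_doc(self, doc_id, text, tokens):
    doc = Document()
    doc.add(Field("title", doc_id, self.t1))             # id field, DOCS only
    doc.add(Field("text", text, self.t2))                # full text, with positions
    doc.add(Field("tokens", " ".join(tokens), self.t3))  # stored but not indexed
    self.writer.addDocument(doc)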
def _getLucene(self, path):
    directory = FSDirectory.open(Paths.get(path))
    config = IndexWriterConfig(None)
    config.setRAMBufferSizeMB(256.0)  # faster
    config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
    writer = IndexWriter(directory, config)
    # IndexWriter.getReader() was removed in Lucene 4; use DirectoryReader
    # (the single-argument overload assumes Lucene 5.1 or later)
    reader = DirectoryReader.open(writer)
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
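# Note that the reader returned by _getLucene is a point-in-time (NRT)
# snapshot: documents added afterwards only become visible after reopening it.
# A hedged usage sketch; the path is hypothetical and the imports are assumed.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher

writer, reader, searcher = self._getLucene("/tmp/example_index")
# ... writer.addDocument(...) ...
new_reader = DirectoryReader.openIfChanged(reader)  # None if nothing changed
if new_reader is not None:
    reader.close()
    reader = new_reader
    searcher = IndexSearcher(reader)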
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except Exception:
        print('JavaVM already running')
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setRAMBufferSizeMB(1024.0)
    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            cmd = r'robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
        else:
            # POSIX paths use a forward slash
            cmd = 'cp -f *.py %s/code_files' % LUCENE_INDEX_DIR
        os.system(cmd)
        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
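# A minimal sketch of the makeIndex helper called in main(), which is not part
# of the original. The field names and the *.py glob are illustrative
# assumptions; imports are those of PyLucene.
import glob
from org.apache.lucene.document import Document, Field, StringField, TextField

def makeIndex(writer):
    for path in glob.glob('*.py'):
        with open(path, encoding='utf-8') as f:
            doc = Document()
            doc.add(StringField("path", path, Field.Store.YES))       # exact-match id
            doc.add(TextField("content", f.read(), Field.Store.YES))  # analyzed body
            writer.addDocument(doc)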
class Indexer(object):  # builds the index
    def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
        # if not jpype.isJVMStarted():
        #     lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)  # is here?
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setRAMBufferSizeMB(256.0)   # auto-commit once the RAM buffer reaches 256 MB
        self.config.setMaxBufferedDocs(10000)   # auto-commit after 10000 buffered docs
        if not computeLengthNorm:
            sim = CustomSimilarity()
            self.config.setSimilarity(sim)
        self.path = os.path.join(INDEX_PATH, indexDir)
        # print(self.path)
        # path.mkdir(self.path)
        # if doClear:
        #     self.clearExistingIndex()
        self.store = SimpleFSDirectory(File(self.path))
        self.writer = IndexWriter(self.store, self.config)
        self.t1 = FieldType()  # field type t1
        self.t1.setIndexed(True)
        self.t1.setStored(True)
        self.t1.setTokenized(False)
        self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        self.t2 = FieldType()  # field type t2
        self.t2.setIndexed(True)
        self.t2.setStored(False)
        self.t2.setTokenized(True)
        self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    def clearExistingIndex(self):  # remove any existing index files
        indexdir = self.path
        for thefile in os.listdir(indexdir):
            filepath = os.path.join(indexdir, thefile)
            try:
                if os.path.isfile(filepath):
                    os.unlink(filepath)
            except Exception as e:
                logger.error("Delete file %s failed: %s", filepath, str(e))
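# Hedged sketch of an indexing method for the Indexer above. The method and
# field names ("id", "content") are assumptions: t1 holds untokenized
# metadata, t2 the analyzed body text.
def addDoc(self, doc_id, content):
    doc = Document()
    doc.add(Field("id", doc_id, self.t1))        # stored, untokenized
    doc.add(Field("content", content, self.t2))  # tokenized, positions indexed
    self.writer.addDocument(doc)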
def _create_index(self, index_dir: str) -> None:
    """Index documents

    Parameters
    ----------
    index_dir : str
        The dir to store index
    """
    os.mkdir(index_dir)
    TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
    TITLE_FIELD.setStored(True)
    TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)
    TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
    TEXT_FIELD.setStored(True)
    TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    fs_dir = MMapDirectory(Paths.get(index_dir))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setRAMBufferSizeMB(16384.0)  # 16g
    self.writer = IndexWriter(fs_dir, writer_config)
    logger.info("%d docs in index", self.writer.numDocs())
    logger.info("Indexing documents...")
    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        doc = Document()
        doc.add(Field("title", doc_id, TITLE_FIELD))
        doc.add(Field("text", text, TEXT_FIELD))
        self.writer.addDocument(doc)
    logger.info("Indexed %d docs.", self.writer.numDocs())
    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
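# Hedged usage sketch for querying the index built by _create_index. The
# searcher setup and the query text are assumptions, not part of the class
# above; imports are those of PyLucene.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

reader = DirectoryReader.open(MMapDirectory(Paths.get(index_dir)))
searcher = IndexSearcher(reader)
query = QueryParser("text", StandardAnalyzer()).parse("sample query")
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc).get("title"))  # "title" stores the doc id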
def __init__(self, lang, dataset, analyzer, index_path=None,
             data_path=None, ram_size=2048):
    """
    Returns scored documents in multiple languages.

    Parameters:
        dataset (str): ['mlqa_dev', 'mlqa_test', 'wiki']
        lang (str): ['en', 'es', 'de']
        analyzer (str): ['en', 'es', 'de', 'standard']
        ram_size (int): Size of memory used while indexing

    Returns:
    """
    super().__init__()
    idxdir = self.get_index(lang, dataset, index_path)
    self.mlqa = True
    if dataset == 'mlqa_dev':
        self.dataset = MLQADataset('dev', lang, lang, data_path)
    elif dataset == 'mlqa_test':
        self.dataset = MLQADataset('test', lang, lang, data_path)
    elif dataset == 'wiki':
        self.mlqa = False
        self.dataset = Wiki(lang, data_path)
    else:
        raise RuntimeError("No dataloader for {}".format(dataset))
    # stores index files; poor concurrency, try NIOFSDirectory instead
    store = SimpleFSDirectory(Paths.get(idxdir))
    # limit max. number of tokens per document;
    # the analyzer will not consume more tokens than that
    # analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    # configuration for the index writer
    config = IndexWriterConfig(analyzers[analyzer]())
    # creates or overwrites the index
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # set similarity to BM25Similarity(k1=1.2, b=0.75)
    similarity = BM25Similarity(self.k1, self.b)
    config.setSimilarity(similarity)
    config.setRAMBufferSizeMB(float(ram_size))
    # create the index writer
    self.writer = IndexWriter(store, config)

    self.ftdata = FieldType()
    self.ftmeta = FieldType()
    # IndexSearcher will return the value of stored fields
    self.ftdata.setStored(True)
    self.ftmeta.setStored(True)
    # tokenized fields are analyzed by the Analyzer
    self.ftdata.setTokenized(True)
    self.ftmeta.setTokenized(False)
    # which information is indexed (probably DOCS would be sufficient):
    # DOCS: only documents are indexed; term frequencies and positions are
    #     omitted. Phrase and other positional queries on the field will throw
    #     an exception, and scoring behaves as if any term appears only once.
    # DOCS_AND_FREQS: only documents and term frequencies are indexed;
    #     positions are omitted. This enables normal scoring, except Phrase
    #     and other positional queries will throw an exception.
    # DOCS_AND_FREQS_AND_POSITIONS: indexes documents, frequencies and
    #     positions. This is a typical default for full-text search: full
    #     scoring is enabled and positional queries are supported.
    self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    self.ftmeta.setIndexOptions(IndexOptions.DOCS)

    # instantiate some reusable objects
    # TODO: create the document, add fields, then change only the field
    # values and re-add the document
    self.doc = Document()
    # The id field cannot be reused because there may be multiple values;
    # I could store a list of fields and add one if it's not enough
    # self.fieldId = Field("id", "dummy", self.ftmeta)
    self.fieldTitle = Field("title", "dummy", self.ftdata)
    self.doc.add(self.fieldTitle)
    self.fieldContext = Field("context", "dummy", self.ftdata)
    self.doc.add(self.fieldContext)
    self.fieldIds = [Field("id", "dummy", self.ftmeta)]
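# Hedged sketch of the field-reuse pattern prepared above: update the reusable
# Field values in place and re-add self.doc. The method name, the single-id
# case, and the removeFields cleanup are assumptions, not the original code.
def add_entry(self, title, context, qid):
    self.fieldTitle.setStringValue(title)
    self.fieldContext.setStringValue(context)
    self.fieldIds[0].setStringValue(qid)
    self.doc.add(self.fieldIds[0])  # id fields are attached per document
    self.writer.addDocument(self.doc)
    self.doc.removeFields("id")     # detach ids so self.doc can be reused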