def __init__(self):
    self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = None

    BooleanQuery.setMaxClauseCount(2048)

    if not os.path.exists(prm.index_folder):
        print 'Creating index at', prm.index_folder
        if prm.docs_path == prm.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(prm.index_folder, prm.docs_path, add_terms)

    if prm.local_index_folder:
        print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
        if os.path.exists(prm.local_index_folder):
            print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
        else:
            shutil.copytree(prm.index_folder, prm.local_index_folder)
        self.index_folder = prm.local_index_folder
    else:
        self.index_folder = prm.index_folder

    # open the (possibly local) index copy selected above
    fsDir = MMapDirectory(Paths.get(self.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

    if prm.docs_path != prm.docs_path_term:
        if not os.path.exists(prm.index_folder_term):
            print 'Creating index at', prm.index_folder_term
            self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

        if prm.local_index_folder_term:
            print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
            if os.path.exists(prm.local_index_folder_term):
                print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
            self.index_folder_term = prm.local_index_folder_term
        else:
            self.index_folder_term = prm.index_folder_term

        fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=prm.n_threads)
    self.cache = {}

    print 'Loading Title-ID mapping...'
    self.title_id_map, self.id_title_map = self.get_title_id_map()
def __init__(self, path, settings):
    self._settings = settings
    self._multithreaded = settings.multithreaded
    self._checker = DirectSpellChecker()

    indexDirectory = MMapDirectory(File(join(path, 'index')))
    indexDirectory.setUseUnmap(False)
    taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
    taxoDirectory.setUseUnmap(False)

    conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
    conf.setSimilarity(settings.similarity)
    mergePolicy = TieredMergePolicy()
    mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
    mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
    conf.setMergePolicy(mergePolicy)

    if not settings.readonly:
        self._indexWriter = IndexWriter(indexDirectory, conf)
        self._indexWriter.commit()
        self._taxoWriter = DirectoryTaxonomyWriter(
            taxoDirectory,
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
            LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
        self._taxoWriter.commit()

    self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
    self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper

    self._facetsConfig = settings.fieldRegistry.facetsConfig
    self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
def main():
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config = config.setRAMBufferSizeMB(1024.0)  # experimental setting !!

    # write data to index
    if not is_index_Exist:
        #if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            os.system('robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir %s/code_files' % (LUCENE_INDEX_DIR))
            os.system('cp *.py %s/code_files' % (LUCENE_INDEX_DIR))

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def create_index(self, index_folder):
    os.mkdir(index_folder)

    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    self.t2 = FieldType()
    self.t2.setStored(True)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(
        MySimpleAnalyzer(CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
    writerConfig.setSimilarity(MyTFIDFSimilarity())
    writerConfig.setRAMBufferSizeMB(16384.0)  # 16 GB RAM buffer
    self.writer = IndexWriter(fsDir, writerConfig)
    logger.info(f"{self.writer.numDocs()} docs in index")

    logger.info("Indexing documents...")
    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        tokens = self.doc_db.get_doc_tokens(doc_id)
        self.add_doc(doc_id, text, tokens)

    logger.info(f"Indexed {self.writer.numDocs()} docs.")
    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False,
             is_bigram_cache_used=False, mongoObj=None):
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searchers = []
    self.searchers.append(IndexSearcher(self.reader))
    if similarity == 'BM25':
        (self.searchers[0]).setSimilarity(BM25Similarity())

    # load bigram cache
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
        index_name = self.index_dir.split(seperate_char)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
        if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
            self.conn_mapping_prob_cache = mongoObj.db[index_name + '_mapping_prob_cache_with_wikipedia']
        else:
            self.conn_mapping_prob_cache = mongoObj.db[index_name + '_mapping_prob_cache']
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False,
             is_bigram_cache_used=False, mongoObj=None):
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    #self.analyzer = StandardAnalyzer()
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searcher = IndexSearcher(self.reader)
    self.dict_term_freq = {}
    if similarity == 'BM25':
        (self.searcher).setSimilarity(BM25Similarity())

    # load bigram cache
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
        index_name = self.index_dir.split(seperate_char)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)

    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        # use a forward slash on non-Windows systems
        cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR) \
            if system_flag == 'Windows' else 'cp -f *.py %s/code_files' % (LUCENE_INDEX_DIR)
        os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    #index_mm = SimpleFSDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config.setCodec(lucene50)
    #config.setSimilarity(BM25Similarity())

    # load index to search engine
    #reader = DirectoryReader.open(index_mm)
    #searcher1 = IndexSearcher(reader)
    #searcher1.setSimilarity(BM25Similarity())
    #searcher2 = IndexSearcher(reader)
    #w = IndexWriter(index_mm, config)

    #upgrader = IndexUpgrader(index_mm, config, True)
    upgrader = IndexUpgrader(index_mm)
    print 'begin to upgrade'
    upgrader.upgrade()

    # read query
    #read_query()

    # initialize mongodb client
    #mongoObj = Mongo_Object('localhost', 27017)

    print 'finish upgrade'
def create_index(self, index_folder, docs_path, add_terms=False):
    os.mkdir(index_folder)

    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    self.t2 = FieldType()
    self.t2.setStored(False)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    self.writer = IndexWriter(fsDir, writerConfig)
    print "%d docs in index" % self.writer.numDocs()
    print "Indexing documents..."

    doc_id = 0

    import corpus_hdf5
    corpus = corpus_hdf5.CorpusHDF5(docs_path)
    for txt in corpus.get_text_iter():
        title = corpus.get_article_title(doc_id)
        self.add_doc(doc_id, title, txt, add_terms)
        if doc_id % 1000 == 0:
            print 'indexing doc', doc_id
        doc_id += 1

    print "Index of %d docs..." % self.writer.numDocs()
    self.writer.close()
def search(command):
    STORE_DIR = "index"
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = MMapDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer()
    ans = run(searcher, analyzer, command)
    del searcher
    return ans

# vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# for y in search('二三四五'):
#     print(y)
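The commented-out lines above hint at the JVM requirement: lucene.initVM must run once per process before search() touches any Lucene class, and any extra Python thread (for example the ThreadPool workers used in the __init__ snippets below) has to attach itself to the running JVM first. A minimal sketch of that pattern, assuming the search function above is importable; it is not taken from this codebase:

import lucene

# start the JVM once per process (PyLucene rejects a second initVM call)
vm_env = lucene.getVMEnv() or lucene.initVM(vmargs=['-Djava.awt.headless=true'])

def search_in_worker(command):
    # worker threads created after initVM must attach to the JVM
    # before calling into any Lucene class
    lucene.getVMEnv().attachCurrentThread()
    return search(command)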
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False):
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = StandardAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searcher = IndexSearcher(self.reader)
    self.dict_term_freq = {}
    self.dict_doc_field_title = {}
    if similarity == 'BM25':
        (self.searcher).setSimilarity(BM25Similarity())
def __init__(self, args):
    self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                             vmargs=['-Djava.awt.headless=true'])
    self.args = args
    index_folder = os.path.join(DATA_DIR, args.index_folder)
    if not os.path.exists(index_folder):
        self.doc_db = DocDB()
        logger.info(f'Creating index at {index_folder}')
        self.create_index(index_folder)

    fsDir = MMapDirectory(Paths.get(index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
    self.searcher.setSimilarity(MyTFIDFSimilarity())
    self.analyzer = MySimpleAnalyzer(
        CharArraySet(collections.JavaSet(utils.STOPWORDS), True))
    self.pool = ThreadPool(processes=args.num_search_workers)
def create_index(self, index_folder, docs_path, add_terms=False):
    print('Loading Vocab...')
    if not self.vocab:
        self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

    os.mkdir(index_folder)

    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    self.t2 = FieldType()
    self.t2.setStored(False)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    self.writer = IndexWriter(fsDir, writerConfig)
    print("%d docs in index" % self.writer.numDocs())
    print("Indexing documents...")

    # import corpus_hdf5
    # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path)
    import pickle
    with open(docs_path, "rb") as read_file:
        corpus = pickle.load(read_file)

    idx_cnt = 0
    # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()):
    # for doc_id, txt in corpus.items():
    for txt in corpus:
        self.add_doc(idx_cnt, txt, add_terms)  # not lowered
        if idx_cnt % 1000 == 0:
            print('indexing doc', idx_cnt)
        idx_cnt += 1

    print("Index of %d docs..." % self.writer.numDocs())
    self.writer.close()
def _create_index(self, index_dir: str) -> None:
    """Index documents

    Parameters
    ----------
    index_dir : str
        The dir to store index
    """
    os.mkdir(index_dir)

    TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
    TITLE_FIELD.setStored(True)
    TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

    TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
    TEXT_FIELD.setStored(True)
    TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    fs_dir = MMapDirectory(Paths.get(index_dir))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setRAMBufferSizeMB(16384.0)  # 16 GB RAM buffer
    self.writer = IndexWriter(fs_dir, writer_config)
    logger.info("%d docs in index", self.writer.numDocs())

    logger.info("Indexing documents...")
    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        doc = Document()
        doc.add(Field("title", doc_id, TITLE_FIELD))
        doc.add(Field("text", text, TEXT_FIELD))
        self.writer.addDocument(doc)

    logger.info("Indexed %d docs.", self.writer.numDocs())
    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
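A minimal sketch of how an index built by _create_index above could be read back: it assumes the same "title"/"text" field names and the default StandardAnalyzer; the 'index_dir' path and the query string are placeholders, not values from this codebase.

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import MMapDirectory

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# open the memory-mapped index written by _create_index (path is a placeholder)
reader = DirectoryReader.open(MMapDirectory(Paths.get('index_dir')))
searcher = IndexSearcher(reader)
analyzer = StandardAnalyzer()

# parse a free-text query against the indexed "text" field and fetch the top 10 hits
query = QueryParser('text', analyzer).parse('example query')
for hit in searcher.search(query, 10).scoreDocs:
    doc = searcher.doc(hit.doc)
    print(hit.score, doc.get('title'))

reader.close()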
def __init__(self,
             index_dir: str,
             db_path: str = None,
             num_search_workers: int = 8) -> None:
    self.env = lucene.getVMEnv()  # pylint: disable=no-member
    if not self.env:
        self.env = lucene.initVM(
            initialheap='28g',  # pylint: disable=no-member
            maxheap='28g',
            vmargs=['-Djava.awt.headless=true'])
    self.num_search_workers = num_search_workers

    if not os.path.exists(index_dir):
        self.doc_db = DocDB(db_path=db_path)
        logger.info('Creating index at %s', index_dir)
        self._create_index(index_dir)

    fs_dir = MMapDirectory(Paths.get(index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=num_search_workers)
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command: python build_index_wikipedia.py FILENAME')
        quit()

    # create file object
    filename = sys.argv[1]
    print('processing ' + filename)
    cnt = 0
    stemmer = SnowballStemmer('english')

    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    LUCENE_INDEX_DIR = 'mmapDirectory\\index_wikipedia_2015'
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config = config.setRAMBufferSizeMB(1024.0)  # experimental setting !!

    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        # use a forward slash on non-Windows systems
        cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR) \
            if system_flag == 'Windows' else 'cp *.py %s/code_files' % (LUCENE_INDEX_DIR)
        os.system(cmd)

        w = IndexWriter(index_mm, config)
        data = {}
        with open(filename, 'r', encoding='utf-8') as src:
            for page_pair in extract_pages(src):
                label, content, page_id = page_pair[0], page_pair[1], page_pair[2]
                pair_tokens = process_article((content, False, label, page_id))
                content = remove_stopwords(' '.join(pair_tokens[0]), ' ')
                if len(content.split()) < 10:
                    continue
                stemmed_content = stemSentence(content, stemmer, False)
                if DEBUG_MODE == True:
                    try:
                        print('%s\n%s\n%s\n%s' % (label, page_id, content, stemmed_content))
                    except:
                        print('encoding error')

                data.clear()
                data['label'] = (label, 'StringField')
                data['label_lower'] = (label.lower(), 'StringField')
                data['label_lower_text'] = (label.lower(), 'TextField')
                data['wiki_id'] = (page_id, 'StringField')
                #data['content'] = (content, 'TextField')
                data['stemmed_content'] = (stemmed_content, 'TextField')
                addDoc(w, data)
                cnt += 1
                #if cnt > 20:
                #    break
                if cnt % 5000 == 0:
                    print('finish %d' % (cnt))
        w.close()
def create_index(self, index_folder, docs_path, add_terms=False):
    print 'Loading Vocab...'
    if not self.vocab:
        self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

    os.mkdir(index_folder)

    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    self.t2 = FieldType()
    self.t2.setStored(False)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    if add_terms:
        if prm.top_tfidf > 0 or prm.idf_path:
            print 'Creating IDF dictionary...'
            self.idf = defaultdict(int)
            doc_id = 0
            if docs_path.lower().endswith('.hdf5'):
                import corpus_hdf5
                corpus = corpus_hdf5.CorpusHDF5(docs_path)
                for txt in corpus.get_text_iter():
                    self.add_idf(txt)
                    if doc_id % 1000 == 0:
                        print 'Creating IDF, doc', doc_id
                    doc_id += 1
            else:
                # ClueWeb09
                import warc
                import gzip
                from bs4 import BeautifulSoup

                # list all files in the folder.
                paths = []
                for root, directories, filenames in os.walk(docs_path):
                    for filename in filenames:
                        paths.append(os.path.join(root, filename))

                for path in paths:
                    with gzip.open(path, mode='rb') as gzf:
                        for record in warc.WARCFile(fileobj=gzf):
                            # remove html tags
                            txt = BeautifulSoup(record.payload[:1000 * 1000], "lxml").get_text()
                            # remove WARC headers.
                            txt = '\n'.join(txt.split('\n')[10:])
                            self.add_idf(txt)
                            if doc_id % 1000 == 0:
                                print 'Creating IDF, doc', doc_id
                            doc_id += 1

            for key, val in self.idf.items():
                self.idf[key] = math.log(float(doc_id) / val)

            pkl.dump(self.idf, open(prm.idf_path, 'wb'))

    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    self.writer = IndexWriter(fsDir, writerConfig)
    print "%d docs in index" % self.writer.numDocs()
    print "Indexing documents..."

    doc_id = 0
    if docs_path.lower().endswith('.hdf5'):
        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1
    else:
        # ClueWeb09
        import warc
        import gzip
        from bs4 import BeautifulSoup

        # list all files in the folder.
        paths = []
        for root, directories, filenames in os.walk(docs_path):
            for filename in filenames:
                paths.append(os.path.join(root, filename))

        for path in paths:
            with gzip.open(path, mode='rb') as gzf:
                for record in warc.WARCFile(fileobj=gzf):
                    if 'warc-trec-id' in record:
                        title = record['warc-trec-id']
                    else:
                        title = record['warc-record-id']
                    # remove html tags
                    #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                    txt = record.payload[:1000 * 1000]
                    # remove WARC headers.
                    txt = '\n'.join(txt.split('\n')[10:])
                    self.add_doc(doc_id, title, txt, add_terms)
                    if doc_id % 1000 == 0:
                        print 'indexing doc', doc_id
                    doc_id += 1

    print "Index of %d docs..." % self.writer.numDocs()
    self.writer.close()
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    #parser = MultiFieldQueryParser(fields, analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')
    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))

        #q_lucene = MultiFieldQueryParser.parse(parser, querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        #queryObj = Query_Object(query, mongoObj, w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)

            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            #score = computeScore(queryObj, entityObj, mongoObj, w2vmodel)
            score = hits[j].score
            candidates.put((-score, j))

        # output results from priority queue, larger score first
        rank = 0
        while candidates.empty() == False and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(rank) + \
                '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')

    rec_result.close()
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command: python create_category_corpus.py NUMBER_TOP_CATEGORY')
        quit()

    NUMBER_TOP_CATEGORY = int(sys.argv[1])
    print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY))

    print('loading category profiles')
    profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz')
    print('finish loading category profiles')

    system_flag = platform.system()
    cwd = os.getcwd()

    # initialize mongo client
    if system_flag == 'Windows':
        client = pymongo.MongoClient("localhost", 27017)
    else:
        client = pymongo.MongoClient("localhost", 58903)
    db = client.wiki2015
    wiki_article_categories = db['article_categories']

    category_corpus = {}
    pkl_filename = 'category_dbpedia_corpus_top%d_fsdm3.pkl.gz' % (NUMBER_TOP_CATEGORY)

    if system_flag == 'Windows':
        lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3', 'BM25', True)
    else:
        lucene_dbpedia_fsdm = Lucene_Object('%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True)

    cnt = 0
    if os.path.exists(pkl_filename) == True:
        #if False == True:
        print('loading category corpus')
        category_corpus = load_zipped_pickle(pkl_filename)
    else:
        for item in wiki_article_categories.find():
            list_category = item['categories'].strip().split('|')
            uri_article = item['uri']
            title = findTitle(uri_article)

            entity_content_dict = {}
            doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex(title, 'title', False)
            if doc_entity is None:
                continue

            for f in ['names', 'attributes', 'categories', 'similar_entities',
                      'related_entities', 'catchall']:
                entity_content_dict[f] = doc_entity[f]
                entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' + f]

            if len(entity_content_dict['catchall'].strip()) == 0:
                continue

            for cat in list_category[:NUMBER_TOP_CATEGORY]:
                if ('<http://dbpedia.org/resource/Category:' + cat + '>') not in profile:
                    continue
                if cat not in category_corpus:
                    category_corpus[cat] = []
                if len(category_corpus[cat]) < 300:
                    category_corpus[cat].append(entity_content_dict)

            #cnt += 1
            #if cnt > 20:
            #    break

        print('saving corpus to pkl.gz')
        save_zipped_pickle(category_corpus, pkl_filename)

    client.close()

    # begin write the data into index
    print('begin write into index')
    if system_flag == 'Windows':
        LUCENE_INDEX_DIR = 'mmapDirectory\\category_corpus_dbpedia201510_top' + \
            str(NUMBER_TOP_CATEGORY) + '_fsdm3'
    else:
        LUCENE_INDEX_DIR = '%s/mmapDirectory/category_corpus_dbpedia201510_top' % (cwd) + \
            str(NUMBER_TOP_CATEGORY) + '_fsdm3'

    # backup code files
    cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR) \
        if system_flag == 'Windows' else 'cp *.py %s/code_files' % (LUCENE_INDEX_DIR)
    os.system(cmd)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)

    # write data to index
    w = IndexWriter(index_mm, config)

    cnt = 0
    data = {}
    max_article_num = 0
    stemmer = SnowballStemmer('english')
    for cat, list_entity_dict in category_corpus.items():
        cat_label = cleanSentence(cat, True)
        data.clear()
        data['category'] = (cat, 'StringField')
        data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT')
        data['stemmed_label'] = (stemSentence(cat_label, stemmer, True), 'CUSTOM_FIELD_TEXT')
        data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED')
        if data['num_articles'][0] > max_article_num:
            max_article_num = data['num_articles'][0]

        for f in ['names', 'attributes', 'categories', 'similar_entities',
                  'related_entities', 'catchall']:
            contents = cleanSentence(' '.join([dic[f] for dic in list_entity_dict]), True, ' ')
            data[f] = (contents, 'CUSTOM_FIELD_TEXT_NOT_STORED')
            data['stemmed_' + f] = (stemSentence(contents, stemmer, False),
                                    'CUSTOM_FIELD_TEXT_NOT_STORED')

        #print('--------------------')
        # need to calculate corpus average length
        addDoc(w, data)
        #cnt += 1
        #if cnt > 20:
        #    break

    w.close()
    print('max article num=%d' % (max_article_num))
        sys.exit(1)

        try:
            print("creating index:", fname)
            IndexFiles(sys.argv[1], fname, analyzer, not opts.all_line)
            end = datetime.now()
            print(end - start)
        except Exception as e:
            print("Failed: ", e)
            raise e
        print("loading index:", fname)
    else:
        print("creating index...")

    if opts.simple_fs:
        directory = SimpleFSDirectory(File(fname))
    else:
        directory = MMapDirectory.open(File(fname))
    dir_reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(dir_reader)
    index = Index(searcher, analyzer, opts.verbose)

    if opts.server:
        from . import server
        server.run(opts.port, index)
    else:
        index.prompt(opts.max_n)

    del searcher
def openStore(self):
    return MMapDirectory(Paths.get(self.STORE_DIR))
def openStore(self):
    return MMapDirectory(File(self.STORE_DIR))
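The two openStore variants above differ only in which Lucene line they target: from Lucene 5 onward the FSDirectory constructors take a java.nio.file.Path (hence Paths.get), while Lucene 4.x builds take a java.io.File. A minimal sketch of both forms, with 'index_path' as a placeholder directory name:

from org.apache.lucene.store import MMapDirectory

# PyLucene 6+ / Lucene 5+: Path-based constructor
from java.nio.file import Paths
store = MMapDirectory(Paths.get('index_path'))

# Older PyLucene 4.x builds expose the File-based constructor instead:
# from java.io import File
# store = MMapDirectory(File('index_path'))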
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm, config)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # search
    docDup = set()
    finalDup = {}
    for i in xrange(len(queries)):
        print 'process query %d' % (i)
        query = queries[i]
        querystr = stemSentence(query[3])

        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            if d['title'] in docDup:
                finalDup[d['title']] = d
                continue
            docDup.add(d['title'])

        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            title = d['title']
            if d['title'] in docDup:
                continue
            docDup.add(title)

            item = (mongoObj.conn_me).find_one({'title': title})
            if item is None:
                continue
            entitylist = item['entitylist'].split('|')
            for en_title in entitylist:
                if title == en_title:
                    continue
                t = Term('title', en_title)
                q = TermQuery(t)
                docs = searcher2.search(q, 2)
                if docs.totalHits <= 1:
                    continue
                docID2 = (docs.scoreDocs)[0].doc
                doc = searcher2.doc(docID2)
                finalDup[doc['title']] = doc

    print 'begin to clean index, there are %d dup records' % (len(finalDup))
    for title in finalDup:
        doc = finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        name = doc['name']
        value = doc['value']
        category = doc['category']
        skos_category = doc['skos_category']
        all_text = doc['all_text']
        raw_name = doc['raw_name']
        raw_value = doc['raw_value']
        abstract = doc['abstract']

        print 'process ' + title
        t = Term('title', title)
        q = TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w, title, name, value, category, skos_category, all_text,
               raw_name, raw_value, abstract)

    # process remaining records
    #global batch, cnt_batch
    #if cnt_batch > 0:
    #    w.addDocuments(batch)
    #    cnt_batch = 0
    #    del batch[:]
    w.close()
def __init__(self, path, lruTaxonomyWriterCacheSize=100):
    Observable.__init__(self)
    taxoDirectory = MMapDirectory(File(path))
    taxoDirectory.setUseUnmap(False)
    self._taxoWriter = DirectoryTaxonomyWriter(
        taxoDirectory,
        IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
        LruTaxonomyWriterCache(lruTaxonomyWriterCacheSize))
def __init__(self, index_dir_ngram='./ngram', index_dir_vocab='./vocab'):
    self._index_dir_ngram = index_dir_ngram
    self._index_dir_vocab = index_dir_vocab
    fs = MMapDirectory.open(File(index_dir_ngram))
    self._searcher_ngram = IndexSearcher(DirectoryReader.open(fs))
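A hedged sketch of how a searcher opened this way might be used to look up an n-gram's document frequency; the method name and the 'ngram' field name are assumptions for illustration, not taken from this codebase.

from org.apache.lucene.index import Term

def ngram_doc_freq(self, ngram_text):
    # 'ngram' is an assumed field name; the real schema of this index is not shown here
    term = Term('ngram', ngram_text)
    return self._searcher_ngram.getIndexReader().docFreq(term)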
def __init__(self, DATA_DIR, vocab, n_threads, max_terms_per_doc, index_name,
             index_name_term, docs_path, docs_path_term, use_cache):
    self.n_threads = n_threads
    # folder to store lucene's index. It will be created in case it does not exist.
    self.index_folder = DATA_DIR + '/data/' + index_name + '/'
    # folder to store lucene's index. It will be created in case it does not exist.
    self.index_folder_term = DATA_DIR + '/data/' + index_name_term + '/'
    self.local_index_folder = './' + index_name
    self.local_index_folder_term = './' + index_name_term
    self.use_cache = use_cache
    self.docs_path = docs_path
    self.docs_path_term = docs_path_term
    self.max_terms_per_doc = max_terms_per_doc

    self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = vocab

    BooleanQuery.setMaxClauseCount(2048)

    if not os.path.exists(self.index_folder):
        print 'Creating index at', self.index_folder
        if self.docs_path == self.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(self.index_folder, self.docs_path, add_terms)

    if self.local_index_folder:
        print 'copying index from', self.index_folder, 'to', self.local_index_folder
        if os.path.exists(self.local_index_folder):
            print 'Folder', self.local_index_folder, 'already exists! Doing nothing.'
        else:
            shutil.copytree(self.index_folder, self.local_index_folder)
        self.index_folder = self.local_index_folder
    else:
        self.index_folder = self.index_folder

    fsDir = MMapDirectory(Paths.get(self.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

    if self.docs_path != self.docs_path_term:
        if not os.path.exists(self.index_folder_term):
            print 'Creating index at', self.index_folder_term
            self.create_index(self.index_folder_term, self.docs_path_term, add_terms=True)

        if self.local_index_folder_term:
            print 'copying index from', self.index_folder_term, 'to', self.local_index_folder_term
            if os.path.exists(self.local_index_folder_term):
                print 'Folder', self.local_index_folder_term, 'already exists! Doing nothing.'
            else:
                shutil.copytree(self.index_folder_term, self.local_index_folder_term)
            self.index_folder_term = self.local_index_folder_term
        else:
            self.index_folder_term = self.index_folder_term

        fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=self.n_threads)
    self.cache = {}

    print 'Loading Title-ID mapping...'
    self.title_id_map, self.id_title_map = self.get_title_id_map()