def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = FSDirectory.getDirectory(storeDir, True) writer = IndexWriter(store, analyzer, True) writer.setMaxFieldLength(1048576) self.indexDocs(root, writer) print 'optimizing index', writer.optimize() writer.close() print 'done'
def Index(): field_list, conn, _config_dict = _InitIndexer() indexDir = _config_dict['indexDir'] if not os.path.exists(indexDir): os.mkdir(indexDir) store = SimpleFSDirectory(lucene.File(indexDir)) #print store writer = IndexWriter(store, SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED) writer.setMaxFieldLength(1048576) try: ticker = Ticker() ticker.start() _IndexDocs(writer, field_list, conn) ticker.end() ticker.TimeCost() except Exception, e: print "Failed in Indexing...", e traceback.print_exc()
def Index(): field_list, conn, _config_dict = _InitIndexer() indexDir = _config_dict["indexDir"] if not os.path.exists(indexDir): os.mkdir(indexDir) store = SimpleFSDirectory(lucene.File(indexDir)) # print store writer = IndexWriter( store, SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED ) writer.setMaxFieldLength(1048576) try: ticker = Ticker() ticker.start() _IndexDocs(writer, field_list, conn) ticker.end() ticker.TimeCost() except Exception, e: print "Failed in Indexing...", e traceback.print_exc()
class IndexCorpus(object):
    """Feed a tree of numerically named files into a Lucene index.

    The constructor opens an ``IndexWriter`` on ``index_dir``; ``index``
    walks a directory and calls ``self.index_file(path)`` for every file
    it accepts; ``finalize`` optimizes and closes the writer.

    ``index_file`` is not defined in the part of the class shown here and
    is expected to be provided elsewhere.
    """

    # Default sample text used to train the short-string compressor.
    TRAINING_DATA_PATH = '/Users/tal/corpus/analyzed/5/5344'

    def __init__(self, index_dir, analyzer, compress=False):
        """Open the index writer.

        Parameters:
            index_dir: path of the index directory; created when missing.
            analyzer: Lucene analyzer passed to the ``IndexWriter``.
            compress: when true, a compressor is built with
                ``get_compressor()`` and stored as ``self.compressor``.
        """
        # Not read by any method visible here.
        self.metadata = True
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.compress = compress
        store = SimpleFSDirectory(File(index_dir))
        self.writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength.LIMITED)
        self.writer.setMaxFieldLength(1048576)
        if self.compress:
            self.compressor = self.get_compressor()

    def get_compressor(self, path=None):
        """Return a short-string compressor trained on a UTF-8 text file.

        Parameters:
            path: training file to read; defaults to
                ``TRAINING_DATA_PATH`` when omitted.

        The file is decoded as UTF-8 and re-encoded before being passed to
        ``trained_short_string_compressor``.
        """
        if path is None:
            path = self.TRAINING_DATA_PATH
        # ``with`` closes the handle even if read() fails.
        with codecs.open(path, encoding='utf8') as training_file:
            training_data = training_file.read()
        return trained_short_string_compressor(training_data.encode('utf8'))

    def finalize(self):
        """Optimize the index, then close the writer.

        The writer is closed even when ``optimize()`` raises; the
        exception still propagates.
        """
        try:
            self.writer.optimize()
        finally:
            self.writer.close()

    def index(self, directory):
        """Index every all-digit entry of ``directory``, recursing into subdirectories.

        Entries whose names are not all digits are ignored.  The rest are
        visited in ascending numeric order.  Directories are walked
        recursively; files are passed to ``self.index_file``.  An
        exception from ``index_file`` is printed and the walk continues.
        """
        numbered = [name for name in os.listdir(directory) if name.isdigit()]
        for filename in sorted(numbered, key=int):
            path = os.path.join(directory, filename)
            if os.path.isdir(path):
                self.index(path)
                continue

            # Progress line for every file whose number is a multiple of 100.
            # Single-argument print() gives the same output under the
            # Python 2 print statement and the Python 3 function.
            if int(filename) % 100 == 0:
                print("%s %s" % (datetime.now().ctime(), filename))
            try:
                self.index_file(path)
            except Exception as err:
                # Best effort: one bad file must not abort the whole run.
                print("%s %s" % ("Indexing exception:", err))
env=lucene.initVM() print 'Using Directory: ', STORE_DIR notExist = 0 # both the main program and the background indexer will share the same directory and analyzer if not os.path.exists(STORE_DIR): os.mkdir(STORE_DIR) notExist = 1 directory = SimpleFSDirectory(File(STORE_DIR)) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) # we will need a writer writer = IndexWriter(directory,analyzer,True,IndexWriter.MaxFieldLength.LIMITED) writer.setMaxFieldLength(1048576) if notExist == 1: writer.close() # and start the indexer # note the indexer thread is set to daemon causing it to terminate on a SIGINT folder = "tweets" indexer = Indexer(STORE_DIR,writer,folder) indexer.setDaemon(True) indexer.start() print 'Starting Indexer in background...' run(writer, analyzer) quit_gracefully()