コード例 #1
0
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = FSDirectory.getDirectory(storeDir, True)
        writer = IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer)
        print 'optimizing index',
        writer.optimize()
        writer.close()
        print 'done'
コード例 #2
0
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = FSDirectory.getDirectory(storeDir, True)
        writer = IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer)
        print 'optimizing index',
        writer.optimize()
        writer.close()
        print 'done'
コード例 #3
0
ファイル: Indexer.py プロジェクト: BurnedRobot/SearchEngine
def Index():
    field_list, conn, _config_dict = _InitIndexer()

    indexDir = _config_dict['indexDir']
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    store = SimpleFSDirectory(lucene.File(indexDir))
    #print store
    writer = IndexWriter(store,
                         SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setMaxFieldLength(1048576)
    try:
        ticker = Ticker()
        ticker.start()
        _IndexDocs(writer, field_list, conn)
        ticker.end()
        ticker.TimeCost()
    except Exception, e:
        print "Failed in Indexing...", e
        traceback.print_exc()
コード例 #4
0
ファイル: Indexer.py プロジェクト: BurnedRobot/SearchEngine
def Index():
    field_list, conn, _config_dict = _InitIndexer()

    indexDir = _config_dict["indexDir"]
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    store = SimpleFSDirectory(lucene.File(indexDir))
    # print store
    writer = IndexWriter(
        store, SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED
    )
    writer.setMaxFieldLength(1048576)
    try:
        ticker = Ticker()
        ticker.start()
        _IndexDocs(writer, field_list, conn)
        ticker.end()
        ticker.TimeCost()
    except Exception, e:
        print "Failed in Indexing...", e
        traceback.print_exc()
コード例 #5
0
ファイル: build.py プロジェクト: TalLinzen/hebrew-blog-corpus
class IndexCorpus(object):
    """Write a corpus of numerically named files into a new Lucene index.

    As far as this class shows, the intended use is: construct it, call
    ``index`` on one or more directories, then call ``finalize``.

    NOTE(review): ``index`` calls ``self.index_file(path)``, which is not
    defined in the visible part of the class -- presumably it is supplied
    further down or by a subclass.
    """

    # Default training text for the short-string compressor. This is a
    # machine-specific path; pass ``path`` to ``get_compressor`` or override
    # this attribute to use another file.
    COMPRESSOR_TRAINING_PATH = '/Users/tal/corpus/analyzed/5/5344'

    def __init__(self, index_dir, analyzer, compress=False):
        """Open an ``IndexWriter`` on ``index_dir``.

        Args:
            index_dir: Path of the index directory; created (with missing
                parents) when it does not exist.
            analyzer: Lucene analyzer handed to the ``IndexWriter``.
            compress: When true, a compressor is built right away via
                ``get_compressor`` and stored in ``self.compressor``.
        """
        # Not read anywhere in the visible code; kept for other users of
        # the instance.
        self.metadata = True
        if not os.path.exists(index_dir):
            # makedirs instead of mkdir so that a nested index_dir works.
            os.makedirs(index_dir)

        self.compress = compress
        store = SimpleFSDirectory(File(index_dir))
        # The third argument True is Lucene's "create" flag: a new index is
        # started and an existing one in index_dir is overwritten.
        self.writer = IndexWriter(store, analyzer, True,
                                  IndexWriter.MaxFieldLength.LIMITED)
        # Replace the LIMITED default with an explicit per-field limit.
        self.writer.setMaxFieldLength(1048576)

        if self.compress:
            self.compressor = self.get_compressor()

    def get_compressor(self, path=None):
        """Return a short-string compressor trained on the text at ``path``.

        Args:
            path: UTF-8 text file used as training data. Defaults to
                ``COMPRESSOR_TRAINING_PATH``.

        Returns:
            The result of ``trained_short_string_compressor`` applied to the
            UTF-8 encoded file content.
        """
        if path is None:
            path = self.COMPRESSOR_TRAINING_PATH
        # Close the file explicitly instead of leaving it to the garbage
        # collector.
        with codecs.open(path, encoding='utf8') as training_file:
            training_data = training_file.read()
        return trained_short_string_compressor(training_data.encode('utf8'))

    def finalize(self):
        """Optimize the index and close the writer.

        The writer is closed even when ``optimize`` raises.
        """
        try:
            self.writer.optimize()
        finally:
            self.writer.close()

    def index(self, directory):
        """Index every numerically named file below ``directory``.

        Entries whose name is not all digits are ignored, whether they are
        files or directories. The remaining entries are visited in numeric
        order; directories are descended into recursively and files are
        passed to ``self.index_file``. A failure on one file is printed and
        does not stop the run.

        Args:
            directory: Path of the directory to scan.
        """
        numbered = [name for name in os.listdir(directory) if name.isdigit()]
        for filename in sorted(numbered, key=int):
            path = os.path.join(directory, filename)
            if os.path.isdir(path):
                self.index(path)
                continue

            if int(filename) % 100 == 0:
                # Progress line for every file number divisible by 100.
                print("%s %s" % (datetime.now().ctime(), filename))
            try:
                self.index_file(path)
            except Exception as e:
                # Best effort: report the failure and carry on with the
                # next file.
                print("%s %s" % ("Indexing exception:", e))
コード例 #6
0
	# Body of a start-up routine (its def line is outside this excerpt):
	# initialise PyLucene, open the index in STORE_DIR, start the background
	# Indexer and then hand control to run().
	# The value returned by initVM() is not used in the visible code.
	env=lucene.initVM()
	print 'Using Directory: ', STORE_DIR
	
	# Set to 1 below when STORE_DIR had to be created on this run.
	notExist = 0
        
        # both the main program and the background indexer will share the same directory and analyzer
	if not os.path.exists(STORE_DIR):
		os.mkdir(STORE_DIR)
		notExist = 1
		
	directory = SimpleFSDirectory(File(STORE_DIR))
	analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
	
	# we will need a writer
	# NOTE(review): the third argument True is presumably Lucene's "create"
	# flag, which would start a new index (replacing an existing one) on
	# every start -- confirm this is intended.
	writer = IndexWriter(directory,analyzer,True,IndexWriter.MaxFieldLength.LIMITED)
	writer.setMaxFieldLength(1048576)
	
	# NOTE(review): on a first run the writer is closed here, yet the same
	# writer object is still passed to Indexer and run() below -- this looks
	# unintended; confirm.
	if notExist == 1:
		writer.close()
	
	# and start the indexer
	# note the indexer thread is set to daemon causing it to terminate on a SIGINT
	# "tweets" is passed to Indexer as its folder argument.
	folder = "tweets"
	indexer = Indexer(STORE_DIR,writer,folder)
	indexer.setDaemon(True)
	indexer.start()
	print 'Starting Indexer in background...'
	
	# run() is called in the foreground; quit_gracefully() follows once it returns.
	run(writer, analyzer)
	quit_gracefully()