def init(cls, folder):
    """Initialise PySeg from ``folder`` and load the optional word lists.

    Steps, in order:

    1. Call ``PySeg.init(folder)``.
    2. If ``<folder>/custom_word`` exists, pass every line of every file in
       it to ``PySeg.addUserWord`` after re-encoding from UTF-8 to GBK.
    3. If ``<folder>/tong_yi_word`` exists, read every file in it line by
       line. Lines that split on whitespace into exactly two tokens are
       treated as a pair: the first mapping seen for a given first token is
       stored in ``cls.tongyiPair`` (later duplicates do not overwrite it),
       and both tokens are passed to ``PySeg.addUserWord`` as GBK.

    The two directories are independent: a missing ``custom_word``
    directory does not prevent ``tong_yi_word`` from being loaded.

    Args:
        folder: Directory handed to ``PySeg.init`` and searched for the
            ``custom_word`` and ``tong_yi_word`` sub-directories.
    """
    PySeg.init(folder)

    custom_dir = os.path.join(folder, "custom_word")
    if os.path.exists(custom_dir):
        for file_name in os.listdir(custom_dir):
            # ``with`` closes the handle even if decoding or PySeg raises.
            with open(os.path.join(custom_dir, file_name)) as word_file:
                for line in word_file:
                    # NOTE(review): the line is forwarded unchanged, i.e.
                    # including its trailing newline - confirm PySeg
                    # tolerates or strips it.
                    PySeg.addUserWord(line.decode("utf8").encode("gbk"))

    synonym_dir = os.path.join(folder, "tong_yi_word")
    if not os.path.exists(synonym_dir):
        return
    for file_name in os.listdir(synonym_dir):
        with open(os.path.join(synonym_dir, file_name)) as word_file:
            for line in word_file:
                seps = line.split()
                if len(seps) != 2:
                    # Not a "<word> <word>" pair: ignore the line.
                    continue
                if seps[0] not in cls.tongyiPair:
                    # First mapping wins; duplicates never overwrite it.
                    cls.tongyiPair[seps[0]] = seps[1]
                # NOTE(review): assumes both words are registered for every
                # valid pair line, not only when the mapping is new -
                # confirm against the original nesting.
                PySeg.addUserWord(seps[0].decode("utf8").encode("gbk"))
                PySeg.addUserWord(seps[1].decode("utf8").encode("gbk"))
def init(cls, folder):
    """Initialise PySeg from ``folder`` and load the optional word lists.

    Steps, in order:

    1. Call ``PySeg.init(folder)``.
    2. If ``<folder>/custom_word`` exists, pass every line of every file in
       it to ``PySeg.addUserWord`` after re-encoding from UTF-8 to GBK.
    3. If ``<folder>/tong_yi_word`` exists, read every file in it line by
       line. Lines that split on whitespace into exactly two tokens are
       treated as a pair: the first mapping seen for a given first token is
       stored in ``cls.tongyiPair`` (later duplicates do not overwrite it),
       and both tokens are passed to ``PySeg.addUserWord`` as GBK.

    The two directories are independent: a missing ``custom_word``
    directory does not prevent ``tong_yi_word`` from being loaded.

    Args:
        folder: Directory handed to ``PySeg.init`` and searched for the
            ``custom_word`` and ``tong_yi_word`` sub-directories.
    """
    PySeg.init(folder)

    custom_dir = os.path.join(folder, "custom_word")
    if os.path.exists(custom_dir):
        for file_name in os.listdir(custom_dir):
            # ``with`` closes the handle even if decoding or PySeg raises.
            with open(os.path.join(custom_dir, file_name)) as word_file:
                for line in word_file:
                    # NOTE(review): the line is forwarded unchanged, i.e.
                    # including its trailing newline - confirm PySeg
                    # tolerates or strips it.
                    PySeg.addUserWord(line.decode("utf8").encode("gbk"))

    synonym_dir = os.path.join(folder, "tong_yi_word")
    if not os.path.exists(synonym_dir):
        return
    for file_name in os.listdir(synonym_dir):
        with open(os.path.join(synonym_dir, file_name)) as word_file:
            for line in word_file:
                seps = line.split()
                if len(seps) != 2:
                    # Not a "<word> <word>" pair: ignore the line.
                    continue
                if seps[0] not in cls.tongyiPair:
                    # First mapping wins; duplicates never overwrite it.
                    cls.tongyiPair[seps[0]] = seps[1]
                # NOTE(review): assumes both words are registered for every
                # valid pair line, not only when the mapping is new -
                # confirm against the original nesting.
                PySeg.addUserWord(seps[0].decode("utf8").encode("gbk"))
                PySeg.addUserWord(seps[1].decode("utf8").encode("gbk"))
def CreateDB(self):
    """Open the LevelDB store at ``self.path`` and prepare helpers.

    Sets ``self.db`` to a ``leveldb.LevelDB`` opened with the tuning
    options below, sets ``self.logger`` to the ``"build_index"`` logger,
    and finally calls ``PySeg.init`` with ``self.path + "/../"``.
    """
    mib = 1 << 20  # 1048576; sizes below are multiples of this

    db_settings = {
        'create_if_missing': True,
        'error_if_exists': False,
        'paranoid_checks': False,
        'block_cache_size': 100 * mib,
        'write_buffer_size': 2 * mib,
        'block_size': 4096,
        'max_open_files': 1000,
        'block_restart_interval': 16,
    }

    self.db = leveldb.LevelDB(self.path, **db_settings)
    self.logger = logging.getLogger("build_index")

    # The path string is passed exactly as built here (trailing slash kept).
    segmenter_root = self.path + "/../"
    PySeg.init(segmenter_root)