Example #1
0
 def init(cls, folder):
     """Initialise PySeg from ``folder`` and load the optional word lists.

     After calling ``PySeg.init(folder)``, two sub-directories of ``folder``
     are scanned, in this order:

     * ``custom_word``: every line of every file is transcoded from UTF-8
       to GBK and registered through ``PySeg.addUserWord``. The line is
       passed on exactly as read (it is not stripped).
     * ``tong_yi_word``: every line that splits into exactly two
       whitespace-separated fields is treated as a pair (presumably
       synonyms -- confirm against the data files). The first time a given
       first field is seen, ``cls.tongyiPair[first] = second`` is stored
       and both fields are registered as user words (UTF-8 -> GBK).
       Lines with any other field count, and repeated first fields, are
       ignored.

     The ``str.decode``/``encode`` calls assume byte-string lines, i.e.
     Python 2 style file reading.

     Args:
         folder: Directory handed to ``PySeg.init`` and searched for the
             ``custom_word`` and ``tong_yi_word`` sub-directories.
     """
     PySeg.init(folder)

     customWordPath = os.path.join(folder, "custom_word")
     if not os.path.exists(customWordPath):
         # NOTE(review): this return also skips the tong_yi_word directory
         # below. Kept as-is to preserve existing behaviour -- confirm
         # whether the two word lists are meant to be independent.
         return
     for fileName in os.listdir(customWordPath):
         path = os.path.join(customWordPath, fileName)
         # ``with`` guarantees the handle is closed even when decoding or
         # addUserWord raises part-way through the file.
         with open(path) as wordFile:
             for line in wordFile:
                 PySeg.addUserWord(line.decode("utf8").encode("gbk"))

     tongyiWordPath = os.path.join(folder, "tong_yi_word")
     if not os.path.exists(tongyiWordPath):
         return
     for fileName in os.listdir(tongyiWordPath):
         path = os.path.join(tongyiWordPath, fileName)
         # Previously this handle was never closed, leaking one descriptor
         # per file in the directory.
         with open(path) as wordFile:
             for line in wordFile:
                 seps = line.split()
                 if len(seps) != 2:
                     # Blank or malformed line: not a two-field pair.
                     continue
                 word, target = seps
                 if word not in cls.tongyiPair:
                     # First mapping for a word wins; later ones are ignored.
                     cls.tongyiPair[word] = target
                     PySeg.addUserWord(word.decode('utf8').encode("gbk"))
                     PySeg.addUserWord(target.decode('utf8').encode("gbk"))
Example #2
0
 def init(cls, folder):
     """Initialise PySeg from ``folder`` and load the optional word lists.

     After calling ``PySeg.init(folder)``, two sub-directories of ``folder``
     are scanned, in this order:

     * ``custom_word``: every line of every file is transcoded from UTF-8
       to GBK and registered through ``PySeg.addUserWord``. The line is
       passed on exactly as read (it is not stripped).
     * ``tong_yi_word``: every line that splits into exactly two
       whitespace-separated fields is treated as a pair (presumably
       synonyms -- confirm against the data files). The first time a given
       first field is seen, ``cls.tongyiPair[first] = second`` is stored
       and both fields are registered as user words (UTF-8 -> GBK).
       Lines with any other field count, and repeated first fields, are
       ignored.

     The ``str.decode``/``encode`` calls assume byte-string lines, i.e.
     Python 2 style file reading.

     Args:
         folder: Directory handed to ``PySeg.init`` and searched for the
             ``custom_word`` and ``tong_yi_word`` sub-directories.
     """
     PySeg.init(folder)

     customWordPath = os.path.join(folder, "custom_word")
     if not os.path.exists(customWordPath):
         # NOTE(review): this return also skips the tong_yi_word directory
         # below. Kept as-is to preserve existing behaviour -- confirm
         # whether the two word lists are meant to be independent.
         return
     for fileName in os.listdir(customWordPath):
         path = os.path.join(customWordPath, fileName)
         # ``with`` guarantees the handle is closed even when decoding or
         # addUserWord raises part-way through the file.
         with open(path) as wordFile:
             for line in wordFile:
                 PySeg.addUserWord(line.decode("utf8").encode("gbk"))

     tongyiWordPath = os.path.join(folder, "tong_yi_word")
     if not os.path.exists(tongyiWordPath):
         return
     for fileName in os.listdir(tongyiWordPath):
         path = os.path.join(tongyiWordPath, fileName)
         # Previously this handle was never closed, leaking one descriptor
         # per file in the directory.
         with open(path) as wordFile:
             for line in wordFile:
                 seps = line.split()
                 if len(seps) != 2:
                     # Blank or malformed line: not a two-field pair.
                     continue
                 word, target = seps
                 if word not in cls.tongyiPair:
                     # First mapping for a word wins; later ones are ignored.
                     cls.tongyiPair[word] = target
                     PySeg.addUserWord(word.decode('utf8').encode("gbk"))
                     PySeg.addUserWord(target.decode('utf8').encode("gbk"))
Example #3
0
 def CreateDB(self):
     """Open the LevelDB store at ``self.path`` and prepare helper state.

     Assigns the opened database to ``self.db`` and the ``"build_index"``
     logger to ``self.logger``, then calls ``PySeg.init`` with
     ``self.path + "/../"``.
     """
     mebibyte = 1 << 20  # size unit for the cache/buffer settings below

     # Tuning options are handed to LevelDB directly as keyword arguments.
     self.db = leveldb.LevelDB(
         self.path,
         create_if_missing=True,
         error_if_exists=False,
         paranoid_checks=False,
         block_cache_size=100 * mebibyte,
         write_buffer_size=2 * mebibyte,
         block_size=4096,
         max_open_files=1000,
         block_restart_interval=16,
     )
     self.logger = logging.getLogger("build_index")

     segmenter_root = self.path + "/../"
     PySeg.init(segmenter_root)