def init(cls, folder):
    """Initialise PySeg from ``folder`` and register the user dictionaries.

    After ``PySeg.init(folder)``, two optional sub-directories are read:

    * ``<folder>/custom_word``: every line of every file is re-encoded from
      UTF-8 to GBK and passed to ``PySeg.addUserWord``.
    * ``<folder>/tong_yi_word``: every line that splits into exactly two
      whitespace-separated fields is treated as a pair.  The first field is
      mapped to the second in ``cls.tongyiPair`` (the first mapping seen for
      a given first field wins) and both fields are registered as user
      words, re-encoded from UTF-8 to GBK.

    Args:
        folder: Directory handed to ``PySeg.init`` and searched for the two
            dictionary sub-directories.
    """
    PySeg.init(folder)

    customWordPath = os.path.join(folder, "custom_word")
    if not os.path.exists(customWordPath):
        # NOTE(review): this return also skips the "tong_yi_word" directory
        # below when "custom_word" is absent; kept as in the original,
        # confirm that this coupling is intended.
        return
    for fileName in os.listdir(customWordPath):
        path = os.path.join(customWordPath, fileName)
        # The context manager closes the file even if decoding or PySeg raises.
        with open(path) as wordFile:
            for line in wordFile:
                # The whole line, including its line terminator, is passed on.
                PySeg.addUserWord(line.decode("utf8").encode("gbk"))

    tongyiWordPath = os.path.join(folder, "tong_yi_word")
    if not os.path.exists(tongyiWordPath):
        return
    for fileName in os.listdir(tongyiWordPath):
        path = os.path.join(tongyiWordPath, fileName)
        # Previously this handle was never closed; `with` releases it.
        with open(path) as wordFile:
            for line in wordFile:
                seps = line.split()
                if len(seps) != 2:
                    # Ignore blank or malformed lines.
                    continue
                if seps[0] not in cls.tongyiPair:
                    cls.tongyiPair[seps[0]] = seps[1]
                # NOTE(review): both words are registered for every
                # well-formed line, including repeated first fields;
                # confirm against the intended nesting.
                PySeg.addUserWord(seps[0].decode('utf8').encode("gbk"))
                PySeg.addUserWord(seps[1].decode('utf8').encode("gbk"))
def init(cls, folder):
    """Initialise PySeg from ``folder`` and register the user dictionaries.

    After ``PySeg.init(folder)``, two optional sub-directories are read:

    * ``<folder>/custom_word``: every line of every file is re-encoded from
      UTF-8 to GBK and passed to ``PySeg.addUserWord``.
    * ``<folder>/tong_yi_word``: every line that splits into exactly two
      whitespace-separated fields is treated as a pair.  The first field is
      mapped to the second in ``cls.tongyiPair`` (the first mapping seen for
      a given first field wins) and both fields are registered as user
      words, re-encoded from UTF-8 to GBK.

    Args:
        folder: Directory handed to ``PySeg.init`` and searched for the two
            dictionary sub-directories.
    """
    PySeg.init(folder)

    customWordPath = os.path.join(folder, "custom_word")
    if not os.path.exists(customWordPath):
        # NOTE(review): this return also skips the "tong_yi_word" directory
        # below when "custom_word" is absent; kept as in the original,
        # confirm that this coupling is intended.
        return
    for fileName in os.listdir(customWordPath):
        path = os.path.join(customWordPath, fileName)
        # The context manager closes the file even if decoding or PySeg raises.
        with open(path) as wordFile:
            for line in wordFile:
                # The whole line, including its line terminator, is passed on.
                PySeg.addUserWord(line.decode("utf8").encode("gbk"))

    tongyiWordPath = os.path.join(folder, "tong_yi_word")
    if not os.path.exists(tongyiWordPath):
        return
    for fileName in os.listdir(tongyiWordPath):
        path = os.path.join(tongyiWordPath, fileName)
        # Previously this handle was never closed; `with` releases it.
        with open(path) as wordFile:
            for line in wordFile:
                seps = line.split()
                if len(seps) != 2:
                    # Ignore blank or malformed lines.
                    continue
                if seps[0] not in cls.tongyiPair:
                    cls.tongyiPair[seps[0]] = seps[1]
                # NOTE(review): both words are registered for every
                # well-formed line, including repeated first fields;
                # confirm against the intended nesting.
                PySeg.addUserWord(seps[0].decode('utf8').encode("gbk"))
                PySeg.addUserWord(seps[1].decode('utf8').encode("gbk"))
def CreateDB(self):
    """Open the LevelDB store at ``self.path`` and set up logging and PySeg.

    Sets ``self.db`` to a ``leveldb.LevelDB`` opened with the tuning options
    below, sets ``self.logger`` to the "build_index" logger, and initialises
    PySeg with the parent directory of ``self.path``.
    """
    mebibyte = 1 << 20
    self.db = leveldb.LevelDB(
        self.path,
        create_if_missing=True,
        error_if_exists=False,
        paranoid_checks=False,
        block_cache_size=100 * mebibyte,
        write_buffer_size=2 * mebibyte,
        block_size=4096,
        max_open_files=1000,
        block_restart_interval=16
    )
    self.logger = logging.getLogger("build_index")

    # PySeg is initialised from the directory one level above the database.
    segmenterRoot = self.path + "/../"
    PySeg.init(segmenterRoot)
def SegForLtp(cls, content):
    """Segment ``content`` with PySeg and convert the result for LTP.

    Each segment's first field is re-encoded from UTF-8 to GB2312 and its
    second field is mapped through ``cls.ConvertPosICT2Ltp``.

    Args:
        content: Text handed unchanged to ``PySeg.seg``.

    Returns:
        A tuple of ``(gb2312_word, converted_tag)`` pairs, in segment order.
    """
    return tuple(
        (token[0].decode('utf8').encode('gb2312'),
         cls.ConvertPosICT2Ltp(token[1]))
        for token in PySeg.seg(content)
    )
def seg(cls, content):
    """Segment ``content`` with PySeg, applying the ``cls.tongyiPair`` mapping.

    A segment whose first field is a key of ``cls.tongyiPair`` is replaced by
    ``(mapped_value, second_field)``; every other segment is passed through
    unchanged.

    Args:
        content: Text handed unchanged to ``PySeg.seg``.

    Returns:
        A list of segments in the order PySeg produced them.
    """
    return [
        (cls.tongyiPair[token[0]], token[1])
        if token[0] in cls.tongyiPair else token
        for token in PySeg.seg(content)
    ]
def seg(cls, content):
    """Return PySeg's segmentation of ``content`` with mapped words replaced.

    Args:
        content: Text handed unchanged to ``PySeg.seg``.

    Returns:
        A list with one entry per PySeg segment.  When a segment's first
        field is a key of ``cls.tongyiPair`` the entry is
        ``(cls.tongyiPair[first_field], second_field)``; otherwise it is the
        original segment object.
    """
    def substitute(segment):
        # Swap in the mapped word when one is registered for this segment.
        if segment[0] in cls.tongyiPair:
            return (cls.tongyiPair[segment[0]], segment[1])
        return segment

    return [substitute(segment) for segment in PySeg.seg(content)]
def BuildIndexForOne(self, gameId, name, description, categorys, tags):
    """Write the index entries for one game into ``self.db``.

    Terms are collected per field address:

    * ``NameAddr`` / ``DescAddr``: PySeg segments of the UTF-8 encoded
      ``name`` / ``description`` whose tag is non-empty and starts with
      ``'n'``, ``'v'`` or ``'a'``.
    * ``CategoryAddr`` / ``TagAddr``: each category / tag, UTF-8 encoded.

    For every distinct term, a ``DBItem(term, gameId, addrs)`` is encoded to
    a key/value pair and stored with ``self.db.Put``, where ``addrs`` lists
    each field address the term occurred in once.

    Args:
        gameId: Identifier stored with every term (formatted with ``%d``).
        name: Game name; must support ``.encode('utf8')``.
        description: Game description; must support ``.encode('utf8')``.
        categorys: Iterable of category strings.
        tags: Iterable of tag strings.
    """
    self.logger.debug(
        "build index for one %d %s %s %s %s"
        % (gameId, name, description, str(categorys), str(tags)))

    def keepTerms(segments):
        # Keep the word of each segment whose tag begins with n, v or a.
        return [segment[0] for segment in segments
                if len(segment[1]) > 0 and segment[1][0] in ('n', 'v', 'a')]

    fieldTerms = {}
    fieldTerms[NameAddr] = keepTerms(PySeg.seg(name.encode('utf8')))
    fieldTerms[DescAddr] = keepTerms(PySeg.seg(description.encode('utf8')))
    fieldTerms[CategoryAddr] = [category.encode("utf8") for category in categorys]
    fieldTerms[TagAddr] = [tag.encode('utf8') for tag in tags]

    # Invert field -> terms into term -> list of distinct field addresses.
    addrsByTerm = {}
    for addr, termList in fieldTerms.items():
        for term in termList:
            self.logger.debug("%d %s" % (addr, term.decode('utf8')))
            seenAddrs = addrsByTerm.setdefault(term, [])
            if addr not in seenAddrs:
                seenAddrs.append(addr)

    for term, addrs in addrsByTerm.items():
        self.logger.debug("term %s addrs %s" % (term.decode('utf8'), str(addrs)))
        key, value = DBItem(term, gameId, addrs).Encode()
        self.db.Put(key, value)
def SegForLtp(cls, content):
    """Produce LTP-ready ``(word, tag)`` pairs for ``content``.

    ``content`` is segmented by ``PySeg.seg``.  For every segment the first
    field is transcoded from UTF-8 to GB2312 and the second field is passed
    through ``cls.ConvertPosICT2Ltp``.

    Args:
        content: Text handed unchanged to ``PySeg.seg``.

    Returns:
        A tuple of ``(gb2312_word, converted_tag)`` pairs, in segment order.
    """
    converted = [
        (segment[0].decode('utf8').encode('gb2312'),
         cls.ConvertPosICT2Ltp(segment[1]))
        for segment in PySeg.seg(content)
    ]
    return tuple(converted)