def SegForLtp(cls, content):
    """Segment ``content`` and return the tokens as a tuple of pairs.

    Each item produced by ``PySeg.seg(content)`` is turned into a pair:

    * element 0 is decoded from UTF-8 and re-encoded as GB2312;
    * element 1 is passed through ``cls.ConvertPosICT2Ltp``
      (presumably an ICT -> LTP part-of-speech tag mapping; confirm
      against that method).

    :param content: text handed unchanged to ``PySeg.seg``.
    :return: tuple of ``(gb2312_bytes, converted_tag)`` pairs, in the
        order ``PySeg.seg`` yielded them.
    """
    return tuple(
        (
            token[0].decode('utf8').encode('gb2312'),
            cls.ConvertPosICT2Ltp(token[1]),
        )
        for token in PySeg.seg(content)
    )
def seg(cls, content):
    """Segment ``content``, substituting words listed in ``cls.tongyiPair``.

    For every item returned by ``PySeg.seg(content)``: if its first
    element is a key of ``cls.tongyiPair``, the item is replaced by a new
    ``(cls.tongyiPair[word], item[1])`` pair; otherwise the item is kept
    as-is. ``tongyiPair`` looks like a synonym table ("tongyi"); confirm
    where it is populated.

    :param content: text handed unchanged to ``PySeg.seg``.
    :return: list of items in segmentation order.
    """
    return [
        (cls.tongyiPair[token[0]], token[1])
        if token[0] in cls.tongyiPair
        else token
        for token in PySeg.seg(content)
    ]
def BuildIndexForOne(self, gameId, name, description, categorys, tags):
    """Index one game: collect its terms per field and write them to the DB.

    Steps, in order:

    1. ``name`` and ``description`` are UTF-8 encoded and segmented with
       ``PySeg.seg``; only tokens whose second element is non-empty and
       begins with ``'n'``, ``'v'`` or ``'a'`` are kept (presumably
       noun / verb / adjective tags; confirm against PySeg's tag set).
    2. Every entry of ``categorys`` and ``tags`` is UTF-8 encoded and
       kept as a term.
    3. The terms are inverted into ``term -> [field address, ...]`` with
       each address listed once per term.
    4. For each term a ``DBItem(term, gameId, addrs)`` is encoded and the
       resulting key/value pair is stored with ``self.db.Put``.

    :param gameId: identifier passed to ``DBItem``; formatted with ``%d``
        in the debug log.
    :param name: text exposing ``.encode('utf8')``.
    :param description: text exposing ``.encode('utf8')``.
    :param categorys: iterable of texts exposing ``.encode``.
    :param tags: iterable of texts exposing ``.encode``.
    :return: None.
    """
    self.logger.debug(
        "build index for one %d %s %s %s %s"
        % (gameId, name, description, str(categorys), str(tags)))

    def content_words(text):
        # Segment the UTF-8 bytes of ``text`` and keep the tokens whose
        # tag is non-empty and starts with 'n', 'v' or 'a'.
        return [
            token[0]
            for token in PySeg.seg(text.encode('utf8'))
            if len(token[1]) > 0 and token[1][0] in ('n', 'v', 'a')
        ]

    # Field address -> list of terms found in that field. Keys are
    # inserted in the same order as before (name, description,
    # category, tag).
    field_terms = {}
    field_terms[NameAddr] = content_words(name)
    field_terms[DescAddr] = content_words(description)
    field_terms[CategoryAddr] = [
        category.encode("utf8") for category in categorys]
    field_terms[TagAddr] = [tag.encode('utf8') for tag in tags]

    # Invert to term -> field addresses, each address at most once.
    addrs_by_term = {}
    for addr, words in field_terms.items():
        for word in words:
            self.logger.debug("%d %s" % (addr, word.decode('utf8')))
            seen = addrs_by_term.setdefault(word, [])
            if addr not in seen:
                seen.append(addr)

    # Persist one encoded record per term.
    for word, addrs in addrs_by_term.items():
        self.logger.debug(
            "term %s addrs %s" % (word.decode('utf8'), str(addrs)))
        key, value = DBItem(word, gameId, addrs).Encode()
        self.db.Put(key, value)