def _gen_suffix(self, phrase):
    """Yield ``(key, suffix, score)`` triples describing how *phrase* grows.

    The first ``SUFFIX_MIN_LENGTH`` characters of *phrase* form the initial
    prefix.  For every following character, one triple is produced:

    * ``chinese_key(prefix)`` -- the prefix accumulated so far,
    * the next character,
    * ``ord()`` of that character, used as the score.

    After the last character a closing triple
    ``(chinese_key(phrase), self.terminator, 0)`` is yielded.  When *phrase*
    has no characters beyond the initial prefix, only that closing triple is
    produced.
    """
    min_length = SUFFIX_MIN_LENGTH
    prefix = phrase[:min_length]
    for char in phrase[min_length:]:
        yield chinese_key(prefix), char, ord(char)
        # Extend the prefix only after it has been emitted, so each triple
        # pairs a prefix with the character that follows it.
        prefix += char
    yield chinese_key(phrase), self.terminator, 0
def put(self, title, item_id):
    """Index *title* under *item_id*.

    Flow (the helpers are defined elsewhere; per the original author's note
    ``_add_phrase`` is a set add and ``_add_suffix`` a sorted-set add --
    presumably Redis SADD/ZADD, confirm against those helpers):

        title -> segment -> _add_phrase(phrase, item_id)
                         -> _add_suffix(prefix, suffix, score)
                         -> pinyin -> _add_phrase(sub_phrase, item_id)
                                   -> _add_suffix(prefix, suffix, score)

    Does nothing when either *title* or *item_id* is falsy.  The pinyin
    stage is skipped when ``self.pinyin`` is falsy or when the translation
    of a segment is empty.
    """
    if not (title and item_id):
        return

    # The segmenter is fed UTF-8 bytes; each segment is decoded again below.
    for segment in mmseg.seg_txt(title.encode('utf8')):
        if not segment:
            continue
        word = segment.decode('utf8')

        # Stage 1: index the segmented word itself.
        self._add_phrase(chinese_key(word), item_id)
        for prefix, tail, weight in self._gen_suffix(word):
            # The suffix character is passed through chinese_key here,
            # unlike in the pinyin stage below.
            self._add_suffix(prefix, chinese_key(tail), weight)

        # Stage 2: index the pinyin form, if a translator is available.
        if not self.pinyin:
            continue
        romanized = self.pinyin.translate(word)
        if not romanized:
            continue
        for variant in self._gen_pinyin_phrase(romanized):
            self._add_phrase(variant, item_id)
            # Suffixes are generated from the variant with all whitespace
            # removed; the phrase itself is stored unmodified above.
            compact = re.sub('\\s+', '', variant)
            for prefix, tail, weight in self._gen_suffix(compact):
                self._add_suffix(prefix, tail, weight)
def remove(self, title, item_id):
    """Drop the phrase entries that link *title* to *item_id*.

    The title is segmented the same way ``put`` does it.  For every
    non-empty segment, ``_rem_phrase`` is called with the segment's
    ``chinese_key`` and, when ``self.pinyin`` is available and yields a
    non-empty translation, with every phrase produced by
    ``_gen_pinyin_phrase``.

    Only ``_rem_phrase`` is invoked; suffix entries are not touched by this
    method.  Does nothing when either *title* or *item_id* is falsy.
    """
    if not (title and item_id):
        return

    # The segmenter is fed UTF-8 bytes; each segment is decoded again below.
    for segment in mmseg.seg_txt(title.encode('utf8')):
        if not segment:
            continue
        word = segment.decode('utf8')
        self._rem_phrase(chinese_key(word), item_id)

        # Remove the pinyin phrases too, when a translator is configured
        # and produces something for this word.
        if self.pinyin:
            romanized = self.pinyin.translate(word)
            if romanized:
                for variant in self._gen_pinyin_phrase(romanized):
                    self._rem_phrase(variant, item_id)