def learn(self, path):
    """Scan a UTF-8 text file and count candidate new words in ``self.cache``.

    Every line is segmented with ``seg_text``. Consecutive single-character
    tokens are collected into a run, and the run joined into one string is
    treated as a candidate word whose count in ``self.cache`` is incremented:

    * a run terminated by a multi-character token is counted only when it
      contains at least two tokens;
    * a run still open at the end of the line is counted whenever it is
      non-empty.

    A progress message is printed every 1000 lines.

    Args:
        path: Path of the file to learn from; its content is decoded as UTF-8.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3,
    # which replaces the Python-2-only ``unicode(line, 'utf-8')`` call.
    import io

    def bump(candidate):
        # Uses only ``in`` and item access, so any mapping-like cache works.
        if candidate in self.cache:
            self.cache[candidate] += 1
        else:
            self.cache[candidate] = 1

    count = 0
    # ``with`` guarantees the file handle is closed, even if decoding or
    # segmentation raises part-way through the file.
    with io.open(path, encoding='utf-8') as source:
        for line in source:
            count += 1
            run = []  # single-character tokens seen since the last long token
            for word in seg_text(line):
                if len(word) == 1:
                    run.append(word)
                    continue
                # A multi-character token ends the current run.
                if len(run) > 1:
                    bump(''.join(run))
                run = []
            # NOTE(review): unlike the mid-line case above, a trailing run of
            # a single token is also counted here. Behaviour kept as-is;
            # confirm whether ``len(run) > 1`` was intended.
            if run:
                bump(''.join(run))
            if count % 1000 == 0:
                print("count:%d" % count)
def test_seg():
    """Check that ``scseg.seg_text`` yields the three expected leading words."""
    expected_words = (u'研究', u'生命', u'起源')
    seg_words = scseg.seg_text(u'研究生命起源')
    # Compare position by position, in order, so the first mismatch fails.
    for position, expected in enumerate(expected_words):
        assert seg_words[position] == expected