def load_docs(self, docs_pt):
    """Load documents from a text file and accumulate biterm statistics.

    Each line of ``docs_pt`` is wrapped in a ``Doc``. For every document the
    per-word counter ``self.pw_b`` is incremented once per word position and
    the biterms produced by ``Doc.gen_biterms`` are appended to ``self.bs``.
    After all lines are consumed ``self.pw_b.normalize()`` is called.

    Args:
        docs_pt: Path of the document file, one document per line.

    Raises:
        FileNotFoundError: If ``docs_pt`` does not exist. The message
            "file not found: <path>" is printed before re-raising.
    """
    print("load docs: " + docs_pt)

    # open() never returns a falsy value; a missing file raises instead, so
    # the "file not found" report has to live in an exception handler.
    try:
        rf = open(docs_pt)
    except FileNotFoundError:
        print("file not found: " + docs_pt)
        raise

    # The context manager guarantees the handle is closed even if parsing a
    # line fails part-way through.
    with rf:
        # Iterate the handle directly rather than readlines() so the whole
        # file is not held in memory at once.
        for line in rf:
            d = Doc(line)
            biterms = []
            # gen_biterms is handed an empty list and the list is iterated
            # afterwards, so it presumably fills it in place (verify in Doc).
            d.gen_biterms(biterms)

            # Accumulate the empirical word distribution.
            for i in range(d.size()):
                w = d.get_w(i)
                self.pw_b[w] += 1

            for b in biterms:
                self.bs.append(b)

    self.pw_b.normalize()
def load_docs(self, docs_pt):
    """Read a document file and collect word counts and biterms from it.

    For every line in ``docs_pt`` a ``Doc`` is built, its biterms are
    gathered via ``Doc.gen_biterms`` and appended to ``self.bs``, and
    ``self.pw_b[w]`` is incremented for each word ``w`` returned by
    ``Doc.get_w``. Once the file is exhausted ``self.pw_b.normalize()`` is
    invoked.

    Args:
        docs_pt: Path to the input file; each line is treated as one document.

    Returns:
        None. Results are stored on ``self.bs`` and ``self.pw_b``.

    Raises:
        FileNotFoundError: If ``docs_pt`` does not exist. "file not found:
            <path>" is printed first, then the exception propagates.
    """
    print("load docs: " + docs_pt)

    # A missing file makes open() raise rather than return something falsy,
    # so the diagnostic is emitted from the handler and the error re-raised.
    try:
        rf = open(docs_pt)
    except FileNotFoundError:
        print("file not found: " + docs_pt)
        raise

    # Close the file deterministically, including on errors inside the loop.
    with rf:
        # Stream line by line instead of loading every line with readlines().
        for line in rf:
            d = Doc(line)
            # Word pairs that can be formed from the words of this one line
            # (per the original author's note; populated by gen_biterms).
            biterms = []
            d.gen_biterms(biterms)

            # Accumulate the empirical word distribution: count each word
            # occurrence.
            for i in range(d.size()):
                w = d.get_w(i)
                self.pw_b[w] += 1

            # self.bs collects the biterm objects; per the original note these
            # are the possible word combinations of this text.
            for b in biterms:
                self.bs.append(b)

    # Normalize the counts; per the original note pw_b then maps
    # word -> word frequency (exact semantics depend on pw_b's type).
    self.pw_b.normalize()