Esempio n. 1
0
    def load_docs(self, docs_pt):
        """Load documents from a text file, one document per line.

        Each line is wrapped in a ``Doc``. The biterms generated from that
        document are appended to ``self.bs``, and ``self.pw_b[w]`` is
        incremented once for every word ``w`` returned by ``d.get_w(i)``.
        After the whole file has been read, ``self.pw_b.normalize()`` is
        called.

        Args:
            docs_pt: Path of the documents file.

        Raises:
            OSError: If the file cannot be opened (for example, it does
                not exist).
        """
        print("load docs: " + docs_pt)
        # open() raises on failure instead of returning a falsy value, so a
        # separate "not found" check can never fire; the with-block makes
        # sure the handle is closed even if parsing a line fails.
        with open(docs_pt) as rf:
            # Iterate lazily rather than reading every line into memory.
            for line in rf:
                d = Doc(line)
                biterms = []
                # gen_biterms is expected to fill `biterms` in place.
                d.gen_biterms(biterms)
                # Accumulate the empirical word distribution.
                for i in range(d.size()):
                    w = d.get_w(i)
                    self.pw_b[w] += 1
                for b in biterms:
                    self.bs.append(b)

        self.pw_b.normalize()
Esempio n. 2
0
    def load_docs(self, docs_pt):
        """Read the documents file and collect biterms and word counts.

        The file is read line by line; each line becomes one ``Doc``.
        For every document, its biterms are appended to ``self.bs`` and the
        entry of ``self.pw_b`` for each of its words is incremented.
        ``self.pw_b.normalize()`` is called once at the end.

        Args:
            docs_pt: Path of the documents file.

        Raises:
            OSError: If the file cannot be opened (for example, it does
                not exist).
        """
        print("load docs: " + docs_pt)
        # open() never returns a falsy object (it raises instead), so the
        # former `if not rf` check was unreachable. The context manager
        # closes the file, which the original code never did.
        with open(docs_pt) as rf:
            for line in rf:
                d = Doc(line)
                # Word pairs that can be formed from the words of this line.
                biterms = []
                d.gen_biterms(biterms)
                # Accumulate the empirical word distribution (word counts).
                for i in range(d.size()):
                    w = d.get_w(i)
                    self.pw_b[w] += 1
                # self.bs collects the biterm objects of every document.
                for b in biterms:
                    self.bs.append(b)
        # Normalize the counts; presumably pw_b then maps word -> relative
        # frequency (depends on the normalize() implementation).
        self.pw_b.normalize()