Exemple #1
0
def load_docs(docs_pt):
    """Read a document file and collect the biterms of every line.

    Each line of the file is wrapped in a ``Doc``; the biterms that
    ``Doc.gen_biterms`` places into the list handed to it are gathered
    into one flat list, in file order.

    Args:
        docs_pt: Path of the document file; every line is turned into
            one ``Doc``.

    Returns:
        A list holding the biterms of all lines (empty for an empty file).

    Raises:
        OSError: If the file cannot be opened (``FileNotFoundError`` when
            it does not exist).
    """
    print("load docs: " + docs_pt)
    bs = []
    # open() raises on failure rather than returning a falsy handle, so no
    # explicit "not found" check is needed. The with-block guarantees the
    # file is closed even if building a Doc raises.
    with open(docs_pt) as rf:
        for line in rf:
            d = Doc(line)
            biterms = []
            # gen_biterms is expected to fill the list passed to it.
            d.gen_biterms(biterms)
            bs.extend(biterms)
    return bs
Exemple #2
0
    def load_docs(self, docs_pt):
        """Load documents from a file into ``self.bs`` and ``self.pw_b``.

        Every line of the file becomes one ``Doc``. For each word index
        returned by the document the matching entry of ``self.pw_b`` is
        incremented, and every biterm produced for the document is appended
        to ``self.bs``. After all lines are read ``self.pw_b.normalize()``
        is called once.

        Args:
            docs_pt: Path of the document file; every line is turned into
                one ``Doc``.

        Raises:
            OSError: If the file cannot be opened (``FileNotFoundError``
                when it does not exist).
        """
        print("load docs: " + docs_pt)
        # open() raises on failure rather than returning a falsy handle, so
        # no explicit "not found" check is needed. The with-block guarantees
        # the file is closed even if processing a line raises.
        with open(docs_pt) as rf:
            for line in rf:
                d = Doc(line)
                biterms = []
                # gen_biterms is expected to fill the list passed to it.
                d.gen_biterms(biterms)
                # Accumulate the empirical word distribution (raw counts).
                for i in range(d.size()):
                    w = d.get_w(i)
                    self.pw_b[w] += 1
                for b in biterms:
                    self.bs.append(b)

        # Presumably turns the raw counts into relative frequencies.
        self.pw_b.normalize()
Exemple #3
0
    def load_docs(self, docs_pt):
        '''
        Load documents from a file and update the model's corpus statistics.

        Every line of the file becomes one ``Doc``. The occurrences of each
        word are counted in ``self.pw_b``, the biterms generated for the
        document are appended to ``self.bs``, and ``self.pw_b.normalize()``
        is called once after the whole file has been read.

        @param docs_pt: path of the document file; every line is turned
            into one ``Doc``.
        @return: None; results are stored on ``self.bs`` and ``self.pw_b``.
        @raise OSError: if the file cannot be opened
            (``FileNotFoundError`` when it does not exist).
        '''
        print("load docs: " + docs_pt)
        # open() raises on failure rather than returning a falsy handle, so
        # no explicit "not found" check is needed. The with-block guarantees
        # the file is closed even if processing a line raises.
        with open(docs_pt) as rf:
            for line in rf:
                d = Doc(line)
                # Word pairs (biterms) that the words of this line can form;
                # gen_biterms is expected to fill the list passed to it.
                biterms = []
                d.gen_biterms(biterms)
                # Accumulate the empirical word distribution: count how
                # often each word occurs.
                for i in range(d.size()):
                    w = d.get_w(i)
                    self.pw_b[w] += 1
                # self.bs collects biterm objects, i.e. the word
                # combinations generated for this text.
                for b in biterms:
                    self.bs.append(b)
        # Normalize the counts; per the original author's note, pw_b then
        # maps each word to its relative frequency.
        self.pw_b.normalize()