Example #1
    def _get_all_tf(self):
        """ Build the all-word cache.

            Caches corpus-wide term frequencies so they can be reused
            without recomputation.
        """
        if os.path.isfile(self._all_word_tf_cache):
            default_logger.debug("[CLS] Full word cache has already been built.")
        else:
            try:
                # Get the paths of all training texts
                _train_paths = pathWalker.pathwalker(self.parent_dir)
                _trainset = _train_paths['train']
                _train_set_files = []
                for cls in _trainset:
                    for _tmp_class_file in os.listdir(cls['classpath']):
                        _train_set_files.append(os.path.join(cls['classpath'], _tmp_class_file))
                # Paths of every document in the training set
                default_logger.debug("[CLS] Training files: %s" % _train_set_files)
                # Dict accumulating corpus-wide term frequencies
                _all_tf_cache = {}
                # Build the term-frequency cache
                for train_file in _train_set_files:
                    _temp_tf_dic = tfidf.getTF(self._get_file_content(train_file), self.chcut)
                    # Add each document's term frequencies to the running totals
                    for _tmp_word, _tmp_document_tf in _temp_tf_dic.items():
                        _all_tf_cache[_tmp_word] = _all_tf_cache.get(_tmp_word, 0) + _tmp_document_tf

                # Serialize the term-frequency cache
                with open(self._all_word_tf_cache, 'wb') as _cache_file:
                    pickle.dump(_all_tf_cache, _cache_file)
                # Serialize the total number of word occurrences
                _word_count_dic = {"all": sum(_all_tf_cache.values())}
                with open(self._all_word_num_cache, 'wb') as _cache_word_number_file:
                    pickle.dump(_word_count_dic, _cache_word_number_file)
                default_logger.debug("[CLS] Cache has been built successfully at %s." % self._all_word_tf_cache)
            except Exception:
                # Building the all-word cache failed
                default_logger.debug("[CLS] Failed to build the cache!")
                return
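
For reference, the two pickle files written above can be read back with pickle.load. This is a minimal sketch, not part of the original class; load_tf_cache is a hypothetical helper, and its two path arguments stand in for self._all_word_tf_cache and self._all_word_num_cache:

import pickle

def load_tf_cache(tf_cache_path, num_cache_path):
    """Hypothetical helper: load the cached TF dict and the total word count."""
    with open(tf_cache_path, 'rb') as tf_file:
        all_tf = pickle.load(tf_file)        # {word: corpus-wide frequency}
    with open(num_cache_path, 'rb') as num_file:
        word_count = pickle.load(num_file)   # {"all": total word occurrences}
    return all_tf, word_count.get("all", 0)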
Example #2
if __name__ == '__main__':
    '''
        Main test script: measures the precision and recall of the Bayes classifier.
        Precision(c) = pages correctly classified as c / all pages classified as c (including misclassified ones)
        Recall(c)    = pages correctly classified as c / all pages that actually belong to class c
    '''
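    # Worked example (illustrative numbers only, not measured here): if the
    # classifier labels 100 pages as c, 80 of them correctly, and the test set
    # contains 120 true-c pages, then Precision(c) = 80/100 = 0.80 and
    # Recall(c) = 80/120 ≈ 0.67.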
    # Number of test documents per class
    MAXITEM = 200

    # Initialize the Bayes classifier
    classifitor = classifier.Classifier()
    # Start timing
    starttime = time.time()
    # Get the test-set paths
    ALLSet = pathWalker.pathwalker("Parent")
    TestSet = ALLSet['test']

    classficiation_list = []
    classficiation_count = {}
    for cls in TestSet:
        clsName = cls.get("classname")
        clsSet = os.listdir(cls.get("classpath"))
        # Record the number of documents in this class
        classficiation_count[clsName] = len(clsSet)
        print(clsSet)
        # Run the classification test
        test_list1 = ['it', 'education']
        # Dict holding the results for a single test document
        for index, clsfilename in enumerate(clsSet):
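
The example is cut off at this point. To show how the docstring's Precision/Recall formulas translate into code, here is a minimal sketch of a final tally; predicted_count and correct_count are hypothetical counters that the truncated loop would have to fill (pages labelled c, and pages labelled c whose true class is also c), while classficiation_count comes from the code above:

    # Hypothetical final tally: predicted_count and correct_count are assumed
    # to be filled inside the loop above; they are not part of the original code.
    for name, total in classficiation_count.items():
        predicted = predicted_count.get(name, 0)   # pages the classifier labelled `name`
        correct = correct_count.get(name, 0)       # of those, correctly labelled
        precision = correct / predicted if predicted else 0.0
        recall = correct / total if total else 0.0
        print("%s: Precision=%.3f Recall=%.3f" % (name, precision, recall))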