Example #1
    def _get_class_tf(self, class_dir_path, class_name, reload_cache=False):
        """Build The Classes Word Dict Cache

            用于取得某分类所有词的词频缓存

            Key Arguments
            class_dir_path -- the train set class dir path
            class_name     -- the classname
            reload_cache   -- if reload the class cache or not (default False)

        """
        class_tf = {}
        if not os.path.exists(class_dir_path):
            print("输入的分类路径不合法!")
            return None
        else:
            _cache_file_path = os.path.join(self._cache_dir_path, class_name + "-tf.cache")
            _cache_word_number_file_path = os.path.join(self._cache_dir_path, class_name + "-word-num.cache")
            if os.path.exists(_cache_file_path) and os.path.exists(_cache_word_number_file_path) and not reload_cache:
                default_logger.debug("[CLS] The class cache already exists, skipping the cache build...")
                return None
            _base_dir_path = os.path.abspath(class_dir_path)
            # walk the class directory
            files = os.listdir(class_dir_path)
            for file in files:
                full_file_path = os.path.join(_base_dir_path,file)

                # accumulate this document's term frequencies into the class totals
                _temp_tf_dic = tfidf.getTF(self._get_file_content(full_file_path), self.chcut)
                for _tmp_word in _temp_tf_dic:
                    class_tf[_tmp_word] = class_tf.get(_tmp_word, 0) + _temp_tf_dic[_tmp_word]

            # serialize the caches
            default_logger.debug("[CLS] Class Cache Path: %s" % _cache_file_path)
            # pickle the per-word term frequencies
            with open(_cache_file_path, 'wb') as _cache_file:
                pickle.dump(class_tf, _cache_file)
            # pickle the total word count of the class
            _word_count_dic = {class_name: sum(class_tf.values())}
            with open(_cache_word_number_file_path, 'wb') as _cache_word_number_file:
                pickle.dump(_word_count_dic, _cache_word_number_file)
            print(_word_count_dic)
            print("Cache built successfully...")
Example #2
    def _get_all_tf(self):
        """ Build the All Word Cache

            建立全词缓存,便于再次计算
        """
        if os.path.exists(self._all_word_tf_cache) and os.path.isfile(self._all_word_tf_cache):
            default_logger.debug("[CLS] Full Word Cache has been Built.")
        else:
            try:
                # create/truncate the cache files up front (closed right away;
                # they are reopened for writing below)
                open(self._all_word_tf_cache, 'wb').close()
                open(self._all_word_num_cache, 'wb').close()
                # collect the paths of every training-set document
                _train_paths = pathWalker.pathwalker(self.parent_dir)
                _trainset = _train_paths['train']
                _train_set_files = []
                for cls in _trainset:
                    for _tmp_class_file in os.listdir(cls['classpath']):
                        _train_set_files.append(os.path.join(cls['classpath'], _tmp_class_file))
                # paths of every training-set document
                default_logger.debug("[CLS] Training set files: %s" % _train_set_files)
                # the all-word term-frequency cache
                _all_tf_cache = {}
                # accumulate term frequencies from each training file
                for train_file in _train_set_files:
                    _temp_tf_dic = tfidf.getTF(self._get_file_content(train_file), self.chcut)
                    for _tmp_word in _temp_tf_dic:
                        _all_tf_cache[_tmp_word] = _all_tf_cache.get(_tmp_word, 0) + _temp_tf_dic[_tmp_word]

                # serialize the caches
                default_logger.debug("[CLS] All-Word Cache Path: %s" % self._all_word_tf_cache)
                # pickle the per-word term frequencies
                with open(self._all_word_tf_cache, 'wb') as _cache_file:
                    pickle.dump(_all_tf_cache, _cache_file)
                # pickle the total word count of the training set
                _word_count_dic = {"all": sum(_all_tf_cache.values())}
                with open(self._all_word_num_cache, 'wb') as _cache_word_number_file:
                    pickle.dump(_word_count_dic, _cache_word_number_file)
                print(_word_count_dic)
                default_logger.debug("[CLS] Cache has been built successfully.")


            except Exception as e:
                # building the all-word cache failed
                default_logger.debug("[CLS] Failed to build the cache: %s" % e)
                return
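
Both methods rely on tfidf.getTF(content, chcut), which is not shown on this page. As a rough sketch of the contract it appears to satisfy (a word-to-frequency mapping for one document), here is a minimal stand-in; the jieba segmenter and the raw, unnormalized counts are assumptions, since the real helper may tokenize and weight terms differently:

    import re
    from collections import Counter

    def get_tf(content, chinese_cut=False):
        """Stand-in for a getTF-style helper: returns {word: count} for one document."""
        if chinese_cut:
            import jieba  # assumed segmenter for the Chinese path
            words = [w for w in jieba.cut(content) if w.strip()]
        else:
            words = re.findall(r"\w+", content.lower())
        return dict(Counter(words))

    print(get_tf("the cat sat on the mat"))  # {'the': 2, 'cat': 1, 'sat': 1, 'on': 1, 'mat': 1}

Summing such per-document dicts, as both methods above do, yields the per-class and corpus-wide totals that the caches store.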