Code Example #1
    def cal_weight_improve(self, key_words, class_label):
        """
        Compute the weight matrix after the feature words have been extracted.
        :param key_words: [{'sentence': {}}, ...] or [{}, ...]; may be test-set or training-set data
        :param class_label: class labels aligned with key_words
        :return: the weight matrix
        """
        print "Cal Improve Weight: ", time.strftime('%Y-%m-%d %H:%M:%S')
        if not self.istrain:
            # When weighting test data, load the cached training-set key
            # words so idf is always fitted on the training set.
            dir_ = os.path.join(TEXT_OUT, "key_words")
            filename = (self.__class__.__name__ + ".txt" if self.subjective
                        else self.__class__.__name__ + "_objective.txt")
            url = os.path.join(dir_, filename)
            train_key_words = FileUtil.read(url)
            train_class_label = [d.get("emotion-1-type") for d in train_key_words]
        else:
            train_key_words = key_words
            train_class_label = class_label
        train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words]
        key_words = [d.get("sentence") if "sentence" in d else d for d in key_words]
        # Compute tf: divide each word's frequency by the sentence total.
        # float() guards against Python 2 integer division, which would
        # otherwise truncate every tf to 0.
        key_words = [{k: v / float(sum(d.values())) for k, v in d.items()}
                     for d in key_words]
        fit_train_key_words = Feature_Hasher.transform(train_key_words)
        fit_key_words = Feature_Hasher.transform(key_words)
        tfidf = TfidfImprove()
        # Fit idf on the training data, then weight the target data.
        tfidf.fit(fit_train_key_words, train_class_label)
        weight_matrix = tfidf.transform(fit_key_words, class_label)
        print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        print
        return weight_matrix
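
For orientation, the following is a minimal self-contained sketch of the same tf -> hashing -> idf pipeline. It assumes Feature_Hasher is an instance of sklearn's FeatureHasher and substitutes the standard TfidfTransformer for the project's TfidfImprove (whose class-aware fit/transform is not reproduced here), so no class labels are passed:

# Sketch only. Assumptions: Feature_Hasher behaves like sklearn's
# FeatureHasher over {word: tf} dicts, and TfidfTransformer stands in
# for the custom TfidfImprove.
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfTransformer

hasher = FeatureHasher(n_features=2 ** 16)

train_counts = [{"good": 2, "movie": 1}, {"bad": 1, "plot": 1}]
test_counts = [{"good": 1, "plot": 2}]

def to_tf(dicts):
    # term frequency; float() avoids Python 2 integer division
    return [{k: v / float(sum(d.values())) for k, v in d.items()}
            for d in dicts]

tfidf = TfidfTransformer()
tfidf.fit(hasher.transform(to_tf(train_counts)))  # idf from training data only
weight_matrix = tfidf.transform(hasher.transform(to_tf(test_counts)))
print(weight_matrix.shape)  # (1, 65536)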
Code Example #2
def _check_feature_size(url):
    """Count the distinct feature words stored at url."""
    rows = []
    for line in FileUtil.read(url):
        # line.get("sentence") is a {word: score} dict; keep its word keys
        words = ",".join(line.get("sentence")).split(",")
        rows.append(words)

    feature_size = set(flatten(rows))
    return len(feature_size)
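
As a self-contained illustration of the count above, here is a minimal flatten standing in for the project helper (assumed to flatten one level of nesting), with hard-coded rows in place of FileUtil.read:

# Hypothetical stand-in for the project's flatten(); assumed to
# flatten one level of nesting.
def flatten(list_of_lists):
    return [item for sub in list_of_lists for item in sub]

# Rows shaped like those written by _collect: {"sentence": {word: score}}
rows = [
    {"sentence": {"good": 0.4, "movie": 0.6}},
    {"sentence": {"good": 0.7, "plot": 0.3}},
]

words_per_row = [list(r["sentence"]) for r in rows]
print(len(set(flatten(words_per_row))))  # 3 distinct words: good, movie, plot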
Code Example #3
    def _get_splited_train(self):
        """
        优先从文件中读取训练集分词后的结果
        :return:
        """
        dir_ = os.path.join(TEXT_OUT, "split")
        if self.subjective:
            split_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
            training_datas = Load.load_training_balance()
        else:
            split_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
            training_datas = Load.load_training_objective_balance()

        # self.f forces a re-split even when a cached file exists
        if self.f or not FileUtil.isexist(split_txt) or FileUtil.isempty(split_txt):
            # Tokenize the training set; each sentence also carries
            # its class label.
            splited_words_list = Feature.__split(flatten(training_datas))
#            splited_words_list = Feature.__del_low_frequency_word(splited_words_list)

            FileUtil.write(split_txt, splited_words_list)
        else:
            splited_words_list = FileUtil.read(split_txt)

        return splited_words_list
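
The method follows a compute-or-cache pattern: recompute when forced (self.f) or when the cache file is missing or empty, otherwise read the cache. A generic sketch of that pattern, with json standing in for FileUtil's serialization and a hypothetical tokenize() as the worker:

import json
import os

def load_or_compute(cache_path, compute, force=False):
    # Reuse the cache unless forced or the file is missing/empty.
    if not force and os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0:
        with open(cache_path) as f:
            return json.load(f)
    result = compute()
    with open(cache_path, "w") as f:
        json.dump(result, f)
    return result

def tokenize():
    # Hypothetical stand-in for Feature.__split(flatten(training_datas))
    return [{"sentence": {"good": 2}, "emotion-1-type": "happiness"}]

splited_words_list = load_or_compute("split_cache.json", tokenize)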
Code Example #4
    def _collect(self, splited_words_list, sentence_size):
        dir_ = os.path.join(TEXT_OUT, "key_words")
        if self.subjective:
            key_words_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
        else:
            key_words_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
#        def norm(word_scores):
#            """
#            Normalize per sample.
#            Normalization computes the p-norm of each sample and then
#            divides every element of the sample by that norm, so that each
#            normalized sample has p-norm (l1-norm, l2-norm) equal to 1.
#
#            p-norm formula: ||X||p = (|x1|^p + |x2|^p + ... + |xn|^p)^(1/p)
#
#            Mainly used in text classification and clustering.
#
#            :param word_scores: a dict {word: score}
#            """
#            p = 0.0
#            for v in word_scores.values():
#                p += math.pow(math.fabs(v), 2)
#            p = math.pow(p, 1.0 / 2)
#
#            for k, v in word_scores.items():
#                word_scores[k] = v / p

#        def reduce_dim(word_scores):
#            """
#            Dimension reduction: keep the highest-scoring feature words
#            until their cumulative weight reaches the cutoff.
#            """
#            _size = len(word_scores)
#            _max = math.pow(_size, 1.0 / 2) * 0.85
#            res = {}
#            # sort by score, descending
#            sort = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
#            _sum = 0.0
#            for k, v in sort:
#                if _sum > _max:
#                    break
#                res[k] = v
#                _sum += v
#            return res

        if not self.istrain or self.f or not FileUtil.isexist(key_words_txt) or FileUtil.isempty(key_words_txt):
            print "Cal Scores: ", time.strftime('%Y-%m-%d %H:%M:%S')
            # Rows [0:sentence_size] are the ones being weighted; the training
            # rows either coincide with them (training mode) or follow them.
            if len(splited_words_list) == sentence_size:
                train_range = slice(sentence_size)
            else:
                train_range = slice(sentence_size, len(splited_words_list))

            # Gather the texts grouped by class (training rows only)
            all_class_datas = Feature.all_class_text(splited_words_list[train_range], self.getclasses())

            # Class labels of the rows being weighted
            class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]]

            # return term/frequency or term/score
            res = []
            for splited_words_dict in splited_words_list[0: sentence_size]:
                splited_words = splited_words_dict.get("sentence")
                label = splited_words_dict.get("emotion-1-type")
                # Score every word: scores = {word: [score, frequency], ...}
                scores = {splited_word: [self.cal_score(splited_word, splited_words, label, all_class_datas,
                                                        [d.get("sentence") for d in splited_words_list[train_range]]),
                                         frequency]
                          for splited_word, frequency in splited_words.items()}
                # Per-sample normalization (disabled; see norm above)
                # norm(scores)
                # Dimension reduction (disabled; see reduce_dim above)
                sorted_words = scores
#                if not self.istrain:
#                    sorted_words = reduce_dim(scores)

                # Collection
                # if False return term/score
                # if True  return term/frequency
#                if False:
#                    for k in sorted_words.keys():
#                        sorted_words[k] = splited_words.count(k)

                res.append({"sentence": sorted_words,
                            "emotion-1-type": splited_words_dict.get("emotion-1-type")})
            print "Cal Scores Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
            # FileUtil.write(TEST_BASE_URL + "scores.txt", res)
            print "Begin Normalization: ", time.strftime('%Y-%m-%d %H:%M:%S')
            # Normalize scores per sample
            self.norm(res)
            # FileUtil.write(TEST_BASE_URL + "norm.txt", res)
            print "Normalization Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

            print "Begin Reduce: ", time.strftime('%Y-%m-%d %H:%M:%S')
            # Reduce dimensionality
            self.reduce_dim(res)
            print "Reduce Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

            # Each value is [score, frequency]; keep exactly one of the two.
            # Set use_frequency = False to return term/score instead of
            # term/frequency.
            use_frequency = True
            for d in res:
                ws = d.get("sentence")
                for k, v in ws.items():
                    ws[k] = v[1] if use_frequency else v[0]

            # Tokenization or dimension reduction can leave a sample empty
            # when it has too few informative key words; record the indexes
            # of such samples and drop them.
            danger_index = []
            kept = []
            for i, d in enumerate(res):
                if not d.get("sentence"):
                    danger_index.append(i)
                else:
                    kept.append(d)
            res = kept

            class_label = [c for i, c in enumerate(class_label)
                           if i not in danger_index]

            # Persist the training key words for later runs
            if self.istrain:
                FileUtil.write(key_words_txt, res)
        else:
            res = FileUtil.read(key_words_txt)
            class_label = [r["emotion-1-type"] for r in res]
            danger_index = []

        # Optional statistics dump (debug toggle, disabled)
        if False:
            self.__print_top_key_word(res)
        return res, class_label, danger_index
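
The disabled norm and reduce_dim helpers above describe per-sample L2 normalization, ||X||2 = (|x1|^2 + ... + |xn|^2)^(1/2), and cumulative-weight feature selection. A runnable sketch of both, keeping the 0.85 * sqrt(n) cutoff from the commented-out code (not a tuned value):

import math

def norm(word_scores):
    # Divide every score by the sample's L2 norm, in place.
    p = math.sqrt(sum(v * v for v in word_scores.values()))
    for k, v in word_scores.items():
        word_scores[k] = v / p

def reduce_dim(word_scores):
    # Keep the highest-scoring words until the cumulative weight
    # passes 0.85 * sqrt(n).
    _max = math.sqrt(len(word_scores)) * 0.85
    res, _sum = {}, 0.0
    for k, v in sorted(word_scores.items(), key=lambda x: x[1], reverse=True):
        if _sum > _max:
            break
        res[k] = v
        _sum += v
    return res

scores = {"good": 3.0, "movie": 4.0, "plot": 1.0}
norm(scores)               # scores now has unit L2 norm
print(reduce_dim(scores))  # highest-weight words, cumulative cutoff applied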