Example #1
    def count_word(self, text, use_type="text"):
        """
            Word-frequency statistics (sentence/paragraph/document).
        :param text: str, raw text or a file path, like "大漠帝国。" or "/home/data/doc.txt"
        :param use_type: str, "text" or "file"; a file must be a utf-8 encoded txt
        :return: Counter, word frequencies
        """
        self.words_count = Counter()
        if use_type == "text":  # input is raw text
            sentences = cut_sentence(use_type=self.algorithm,
                                     text=text)  # split into sentences at Chinese/English commas, periods, exclamation marks
            for sentence in sentences:
                n_grams = get_ngrams(use_type=self.algorithm,
                                     len_max=self.len_max,
                                     text=sentence)  # all n-grams of one sentence
                self.words_count.update(n_grams)
        elif use_type == "file":  # input is a file path
            if not os.path.exists(text):
                raise RuntimeError("path of text must exist!")
            with open(text, "r", encoding="utf-8") as fr8:
                for line in fr8:
                    if line.strip():
                        sentences = cut_sentence(use_type=self.algorithm,
                                                 text=line)  # split into sentences
                        for sentence in sentences:
                            n_grams = get_ngrams(use_type=self.algorithm,
                                                 len_max=self.len_max,
                                                 text=sentence)  # all n-grams of one sentence
                            self.words_count.update(n_grams)
        else:
            raise RuntimeError("use_type must be 'text' or 'file'")
        self.total_words = sum(self.words_count.values())
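
A minimal usage sketch for the method above. The owning class name (WordDiscovery) and its no-argument constructor are assumptions; only the count_word method is shown in the snippet:

# hypothetical usage; class name and constructor are assumptions
wd = WordDiscovery()
wd.count_word("大漠帝国。大漠帝国!", use_type="text")  # count n-grams of a raw string
print(wd.words_count.most_common(5))  # words_count is a collections.Counter
print(wd.total_words)                 # total number of counted n-grams
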
Example #2
    def deal_corpus(self):
        import json

        token2idx = self.ot_dict.copy()
        if 'term' in self.corpus_path:  # vocabulary file, one term per line
            with open(file=self.corpus_path, mode='r', encoding='utf-8') as fd:
                while True:
                    term_one = fd.readline()
                    if not term_one:
                        break
                    term_one = term_one.strip()  # drop the trailing newline so it is not stored in the vocab
                    if term_one and term_one not in token2idx:
                        token2idx[term_one] = len(token2idx)
        elif os.path.exists(self.corpus_path):  # corpus file, one json sample per line
            with open(file=self.corpus_path, mode='r', encoding='utf-8') as fd:
                for line in fd:
                    ques_label = json.loads(line.strip())
                    term_one = ques_label["question"]
                    term_one = "".join(term_one)  # concatenate in case the question is stored as a token list
                    if self.level_type == 'char':
                        text = list(term_one.replace(' ', '').strip())
                    elif self.level_type == 'word':
                        text = macropodus_cut(term_one)
                    elif self.level_type == 'ngram':
                        text = get_ngrams(term_one, ns=self.ngram_ns)
                    else:
                        raise RuntimeError(
                            "level_type must be 'char', 'word' or 'ngram'"
                        )
                    for text_one in text:
                        if text_one not in token2idx:
                            token2idx[text_one] = len(token2idx)
        else:
            raise RuntimeError(
                "corpus_path is wrong; it must be a term dict or a corpus file"
            )
        self.token2idx = token2idx
        self.idx2token = {value: key for key, value in self.token2idx.items()}
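
The index-assignment idiom above (token2idx[t] = len(token2idx)) gives every unseen token the next free id. A self-contained sketch of the same pattern, independent of the class; the special-token contents of ot_dict are an assumption:

# assumed special tokens, analogous to self.ot_dict
ot_dict = {"[PAD]": 0, "[UNK]": 1}
token2idx = ot_dict.copy()
for token in ["我", "爱", "我", "们"]:
    if token not in token2idx:
        token2idx[token] = len(token2idx)  # next free id; duplicates are skipped
idx2token = {v: k for k, v in token2idx.items()}  # reverse lookup
print(token2idx)      # {'[PAD]': 0, '[UNK]': 1, '我': 2, '爱': 3, '们': 4}
print(idx2token[2])   # '我'
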
Example #3
    def summarize(self, text, num=320, title=None):
        """
            Rank the sentences of a text (extractive summarization).
        :param text: str or list, the document or its pre-split sentences
        :param num: int, max number of sentences to return
        :param title: str, optional title used as an extra scoring signal
        :return: list, (score, sentence) tuples
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        self.title = title
        if self.title:
            self.title = macropodus_cut(title)
        # tokenize, with POS tags
        self.sentences_tag_cut = [jieba_tag_cut(extract_chinese(sentence)) for sentence in self.sentences]
        # tokens only, POS tags dropped
        sentences_cut = [list(jtc.keys()) for jtc in self.sentences_tag_cut]
        # remove stop words
        self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
        # word-frequency statistics
        self.words = []
        for sen in self.sentences_cut:
            self.words = self.words + sen
        self.word_count = dict(Counter(self.words))
        # score each word by its relative frequency: self.word_freqs = {word: freq * 0.5 / total_words}
        self.word_freqs = {}
        self.len_words = len(self.words)
        for k, v in self.word_count.items():
            self.word_freqs[k] = v * 0.5 / self.len_words
        # uni-/bi-/tri-gram features over the whole text
        [gram_uni, gram_bi, gram_tri] = get_ngrams("".join(self.sentences), ns=[1, 2, 3])
        ngrams = gram_uni + gram_bi + gram_tri
        self.ngrams_count = dict(Counter(ngrams))
        # score by sentence position
        scores_posi = self.score_position()
        # score by sentence length
        scores_length = self.score_length()
        # score by POS: noun (1.2), pronoun (0.8), verb (1.0)
        scores_tag = self.score_tag()

        res_rank = {}
        self.res_score = []
        for i in range(len(sentences_cut)):
            sen_cut = self.sentences_cut[i]  # tokens of this sentence
            # n-gram score: how much of the document's n-gram mass this sentence covers
            [gram_uni_, gram_bi_, gram_tri_] = get_ngrams(self.sentences[i], ns=[1, 2, 3])
            n_gram_s = gram_uni_ + gram_bi_ + gram_tri_
            score_ngram = sum([self.ngrams_count.get(ngs, 0) for ngs in n_gram_s]) / (len(n_gram_s) + 1)
            # average token length within the sentence
            score_word_length_avg = sum([len(sc) for sc in sen_cut]) / (len(sen_cut) + 1)
            score_posi = scores_posi[i]
            score_length = scores_length[i]
            score_tag = scores_tag[i]
            if self.title:  # combine scores when a title is available
                score_title = self.score_title(sen_cut)
                score_total = (score_title * 0.5 + score_ngram * 2.0 + score_word_length_avg * 0.5 +
                               score_length * 0.5 + score_posi * 1.0 + score_tag * 0.6) / 6.0
                # keep the per-component scores for inspection
                self.res_score.append(["score_title", "score_ngram", "score_word_length_avg",
                                       "score_length", "score_posi", "score_tag"])
                self.res_score.append([score_title, score_ngram, score_word_length_avg,
                                       score_length, score_posi, score_tag, self.sentences[i]])
            else:  # combine scores when there is no title
                score_total = (score_ngram * 2.0 + score_word_length_avg * 0.5 + score_length * 0.5 +
                               score_posi * 1.0 + score_tag * 0.6) / 5.0
                # keep the per-component scores for inspection
                self.res_score.append(["score_ngram", "score_word_length_avg",
                                       "score_length", "score_posi", "score_tag"])
                self.res_score.append([score_ngram, score_word_length_avg,
                                       score_length, score_posi, score_tag, self.sentences[i]])
            res_rank[self.sentences[i].strip()] = score_total
        # cap the number of returned sentences
        num_min = min(num, int(len(self.word_count) * 0.6))
        res_rank_sort = sorted(res_rank.items(), key=lambda rr: rr[1], reverse=True)
        res_rank_sort_reverse = [(rrs[1], rrs[0]) for rrs in res_rank_sort][0:num_min]
        return res_rank_sort_reverse
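
A hedged usage sketch for summarize; the owning class name (TextSummarizer) and its constructor are assumptions, and the scores shown depend on the library's internals:

# hypothetical usage; class name and constructor are assumptions
summarizer = TextSummarizer()
doc = "大漠帝国是一个传说。它位于西北。人们向往它。"
for score, sentence in summarizer.summarize(doc, num=2):
    print(round(score, 4), sentence)  # highest-scoring sentences first
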