Example #1
def get_document_chapiter(self, sims, dictname):
    # Load the law document whose text matches the given dictionary name
    textfname = "../dictionary/text/" + dictname + ".txt"
    law_document = LawDocument()
    law_document.analyze(filename=textfname)

    # Collect the chapters of the two best matches (similarity above 0.4);
    # sims is assumed to be sorted by descending similarity, since the loop
    # stops at the first score at or below the threshold
    text = ""
    n_line = 1
    for sim in sims:
        doc_no, simil = sim[0], sim[1]
        if simil > 0.4:
            text += "******** " + str(n_line) + "  ********\n"
            text += law_document.get_document_chapiter(doc_no) + "\n"
            n_line += 1
            if n_line > 2:
                break
        else:
            break
    return text
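
A minimal usage sketch (hypothetical: the (doc_no, similarity) pairs, the
"civil_code" dictionary name, and the searcher object holding this method are
illustrative, not from the source):

# sims is assumed to be sorted by descending similarity, e.g. the result of
# a similarity query against the dictionary's document index
sims = [(12, 0.83), (4, 0.57), (9, 0.21)]
text = searcher.get_document_chapiter(sims, "civil_code")
print(text)  # chapters of the two matches scoring above 0.4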
Example #2
class Text4Sentences(object):
    def __init__(self, stopwords_file=None):
        """
        Keyword arguments:
        stopwords_file :    stopwords file name
        """

        self.pagerank_config = {
            'alpha': 0.85,
        }

        self.seg = Segmentation(stopwords_file=stopwords_file)
        self.law_document = LawDocument()
        self.sentences = None
        self.words_no_filter = None  # 2-dimensional list: words per sentence
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None

    def create_segment_sentences(self,
                                 sentences,
                                 sim_func=util.get_similarity):
        """
        Keyword arguments:
        
        sentences : sentences of document
        
        sim_func 指定计算句子相似度的函数。
        
        """

        self.words_no_filter, self.words_no_stop_words, self.words_all_filters = self.seg.segment(
            sentences)
        self.sentences = sentences

        self.key_sentences = util.sort_sentences(
            sentences=self.sentences,
            words=self.words_no_filter,
            sim_func=sim_func,
            pagerank_config=self.pagerank_config)

    def analyze_file(self, filename, encoding='utf-8'):
        """
        Keyword arguments:

        filename : input file name
        """

        self.law_document.create_document(filename=filename)

        self.create_segment_sentences(
            self.law_document.get_segmented_document())

    def get_key_sentences(self, num=6):
        """
        num : number of sentences used to build the summary.

        Return: the most important sentences.
        """

        return self.key_sentences[:num]

    def show_key_sentences(self):

        for item in self.get_key_sentences(2):
            [sentence, idx, stype] = item['sentence']
            print(sentence)
            print("=" * 20)
            print(self.law_document.get_document_chapiter(idx, chapiter=True))
            print("--" * 20)
Example #3
import codecs

import numpy

# LawDocument is provided by the surrounding project, as in the examples
# above.


class Summarize(object):

    CLUSTER_THRESHOLD = 5  # Max distance between significant words in a cluster

    def __init__(self, filename=None):
        self.law_document = LawDocument()
        self.important_word = []
        self.top_n_scored = []
        self.mean_scored = []

    def load_keywords(self):
        """Read the keyword file: one keyword per line, optionally followed
        by one or two integer flags separated by spaces."""

        filename = self.law_document.get_keywords_file_name()

        self.important_word = []
        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                if line.strip():
                    tokens = line.strip().split(" ")
                    if tokens[0].strip():
                        word = [tokens[0].strip()]

                        # second token: integer flag (defaults to 0)
                        if len(tokens) > 1 and tokens[1].strip():
                            word.append(int(tokens[1].strip()))
                        else:
                            word.append(0)

                        # optional third token: another integer flag
                        if len(tokens) > 2 and tokens[2].strip():
                            word.append(int(tokens[2].strip()))

                        self.important_word.append(word)
        return self.important_word

    def _cluster_sentences(self, s, important_word):

        word_idx = []
        clusters = []
        # For each entry in the keyword list; entries may carry an extra
        # third field, so unpack only the first two
        for entry in important_word:
            word, n = entry[0].strip(), entry[1]
            if word:
                try:
                    index = s.index(word)
                    word_idx.append(index)
                    # a flag of 1 means the position following the keyword
                    # counts as significant as well
                    if n == 1:
                        word_idx.append(index + 1)
                except ValueError:  # word not in this particular sentence
                    pass

        # Using the word index, compute clusters by using a max distance threshold,
        # for any two consecutive words
        if len(word_idx) > 0:
            word_idx.sort()
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < self.CLUSTER_THRESHOLD:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)

        return clusters
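
    # A worked example with hypothetical indexes: given CLUSTER_THRESHOLD = 5
    # and word_idx = [3, 5, 14, 16], the result is [[3, 5], [14, 16]],
    # since the gap 14 - 5 = 9 >= 5 starts a new cluster while the gaps
    # 5 - 3 and 16 - 14 stay below the threshold.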

    def _score_sentences(self, sentences, important_word):
        scores = []
        sentence_idx = -1

        for [s, idx, stype] in sentences:
            sentence_idx += 1
            clusters = self._cluster_sentences(s, important_word)

            if len(clusters) == 0:
                continue

            # Score each cluster; the max score over all clusters is the
            # score for the sentence
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster \
                    * significant_words_in_cluster / total_words_in_cluster
                if score > max_cluster_score:
                    max_cluster_score = score

            scores.append((sentence_idx, max_cluster_score))

        return scores
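
    # A worked example with a hypothetical cluster: [2, 3, 7] holds 3
    # significant words over a span of 7 - 2 + 1 = 6 positions, so it
    # scores 3 * 3 / 6 = 1.5.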

    def analyze(self, filename, withWeight=True, encoding="utf-8"):

        self.law_document.analyze_file(filename)

        self.load_keywords()

        scored_sentences = self._score_sentences(self.law_document.sentences,
                                                 self.important_word)

        # Summarization Approach 1:
        # Filter out non-significant sentences by using the average score plus a
        # fraction of the std dev as a filter

        avg = numpy.mean([s[1] for s in scored_sentences])
        std = numpy.std([s[1] for s in scored_sentences])

        ff = avg + 0.5 * std
        self.mean_scored = []
        for (sent_idx, score) in scored_sentences:
            if score > ff:
                self.mean_scored.append((sent_idx, score))
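
        # For hypothetical scores [0.5, 1.5, 2.0]: avg = 1.33 and std = 0.62,
        # so the cutoff is about 1.65 and only the sentence scoring 2.0
        # survives the filter.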

        # Summarization Approach 2:
        # Return only the top-N ranked sentences; sort by score in
        # descending order so the first entries are the highest ranked
        self.top_n_scored = sorted(
            scored_sentences, key=lambda s: s[1], reverse=True)

    def write_top_summarize(self, show_nb=5, outputfile=None, mode="a+"):
        if outputfile is not None:
            f = codecs.open(outputfile, mode, 'utf-8')
            f.write(' '.join(self.law_document.document_title) + "\n")
            f.write('\n'.join(self.law_document.table_contents))
            f.write("\n\n摘要 : \n")
        else:
            f = None
            print('摘要 : ' + ' '.join(self.law_document.document_title) + "\n")

        n_sentence = 0
        for (idx, score) in self.top_n_scored:
            if n_sentence < show_nb:
                sentence = self.law_document.get_document_chapiter(idx)
                if sentence:
                    if f is not None:
                        f.write(sentence + "\n\n")
                    else:
                        print(sentence)
                        print("=" * 20)
                    n_sentence += 1
            else:
                if f is not None:
                    f.write("\n" + "*" * 30 + "\n\n")
                break

        if f is not None:
            f.close()

    def write_summarize(self, show_nb=5, outputfile=None, mode="a+"):

        if outputfile is not None:
            f = codecs.open(outputfile, mode, 'utf-8')
            f.write('摘要 : \n' + ' '.join(self.law_document.document_title) +
                    "\n")
        else:
            f = None
            print('摘要 : ' + ' '.join(self.law_document.document_title) + "\n")

        self.law_document.init_sentence_index()
        n_sentence = 0
        for (idx, score) in self.mean_scored:
            if n_sentence < show_nb:
                sentence = self.law_document.get_document_chapiter(idx)
                if sentence:
                    if f is not None:
                        f.write(sentence)
                    else:
                        print(sentence)
                        print(" " * 20)
                    n_sentence += 1
            else:
                if f is not None:
                    f.write("*" * 30)
                break

        if f is not None:
            f.close()
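
A minimal usage sketch covering both approaches (the input file name is
hypothetical):

summ = Summarize()
summ.analyze("sample_law.txt")       # score sentences and build both filters
summ.write_summarize(show_nb=5)      # approach 1: mean + 0.5 * std threshold
summ.write_top_summarize(show_nb=5)  # approach 2: top-N ranked sentences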