Code example #1
def test_rouge_n():
    candidate_text = "pulses may ease schizophrenic voices"
    candidate = PlaintextParser(candidate_text,
                                Tokenizer("english")).document.sentences

    reference1_text = "magnetic pulse series sent through brain may ease schizophrenic voices"
    reference1 = PlaintextParser(reference1_text,
                                 Tokenizer("english")).document.sentences

    reference2_text = "yale finds magnetic stimulation some relief to schizophrenics imaginary voices"
    reference2 = PlaintextParser(reference2_text,
                                 Tokenizer("english")).document.sentences

    assert rouge_n(candidate, reference1, 1) == approx(4 / 10)
    assert rouge_n(candidate, reference2, 1) == approx(1 / 10)

    assert rouge_n(candidate, reference1, 2) == approx(3 / 9)
    assert rouge_n(candidate, reference2, 2) == approx(0 / 9)

    assert rouge_n(candidate, reference1, 3) == approx(2 / 8)
    assert rouge_n(candidate, reference2, 3) == approx(0 / 8)

    assert rouge_n(candidate, reference1, 4) == approx(1 / 7)
    assert rouge_n(candidate, reference2, 4) == approx(0 / 7)
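The 4/10 in the first assertion is plain unigram recall against the reference; a minimal sketch of the arithmetic (illustrative only, not sumy's implementation):

# Hedged sketch of ROUGE-1 recall for the first assertion above.
candidate_words = "pulses may ease schizophrenic voices".split()
reference_words = "magnetic pulse series sent through brain may ease schizophrenic voices".split()
# Overlapping unigrams: "may", "ease", "schizophrenic", "voices" -> 4 matches
overlap = sum(1 for word in set(candidate_words) if word in reference_words)
print(overlap / len(reference_words))  # 0.4 == 4 / 10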
Code example #2
    def test_rouge_l_summary_level(self):
        reference_text = "one two three four five. one two three four five."
        reference = PlaintextParser(reference_text,
                                    Tokenizer("english")).document.sentences

        candidate_text = "one two six seven eight. one three eight nine five."
        candidates = PlaintextParser(candidate_text,
                                     Tokenizer("english")).document.sentences
        # Smoke test: only verifies the call completes without raising
        rouge_l_summary_level(candidates, reference)
Code example #3
def test_union_lcs():
    reference_text = "one two three four five"
    reference = PlaintextParser(reference_text,
                                Tokenizer("english")).document.sentences

    candidate_text = "one two six seven eight. one three eight nine five."
    candidates = PlaintextParser(candidate_text,
                                 Tokenizer("english")).document.sentences

    assert _union_lcs(candidates, reference[0]) == approx(4 / 5)
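The 4/5 comes from unioning each candidate sentence's LCS with the reference sentence; a short sketch of that count (illustrative only):

# reference: "one two three four five"
# LCS with "one two six seven eight"   -> {"one", "two"}
# LCS with "one three eight nine five" -> {"one", "three", "five"}
union = {"one", "two"} | {"one", "three", "five"}
print(len(union) / 5)  # 0.8 == 4 / 5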
Code example #4
    def test_union_lcs(self):
        reference_text = "one two three four five"
        reference = PlaintextParser(reference_text,
                                    Tokenizer("english")).document.sentences

        candidate_text = "one two six seven eight. one three eight nine five."
        candidates = PlaintextParser(candidate_text,
                                     Tokenizer("english")).document.sentences

        self.assertAlmostEqual(_union_lcs(candidates, reference[0]), 4 / 5)
Code example #5
 def get_summarized(self, text, num_sentences):
     parser = PlaintextParser(text, Tokenizer("english"))
     summarizer = LexRankSummarizer()
     # Summarize the document to the requested number of sentences
     summary = summarizer(parser.document, num_sentences)
     list_summary = [str(item) for item in summary]
     return list_summary
Code example #6
def get_parser(url, tokenizer):
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36"
    ])

    # Scrape Web Page With HTMLParser and Goose and select the best scrape
    html_parser = HtmlParser.from_url(url, tokenizer)
    article = Goose({'browser_user_agent': useragent})

    # Goose raises IndexError when requesting unfamiliar sites.
    try:
        extract = article.extract(url=url)
    except IndexError:
        extract = article.extract(raw_html=requests.get(url).text)

    goose_parser = PlaintextParser(extract.cleaned_text, tokenizer)

    # Aggregate Site Metadata
    meta = {
        k: v
        for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }
    # Select Best Parser
    parser = (
        html_parser
        if len(goose_parser.document.words) < len(html_parser.document.words)
        else  # noqa
        goose_parser)

    return parser, meta
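A hypothetical call site for the helper above; the URL is illustrative, and the available meta keys depend on what Goose extracts:

from sumy.nlp.tokenizers import Tokenizer

parser, meta = get_parser("https://example.com/article", Tokenizer("english"))
print(meta.get("title"), len(parser.document.sentences))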
Code example #7
def get_best_sentences(text, num=1):
    sentence_count = num
    parser = PlaintextParser(text, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')
    return [unicode(s) for s in summarizer(parser.document, sentence_count)]  # Python 2; use str() on Python 3
Code example #8
def summarize_terms_text(txt):
    text_data = unidecode.unidecode(txt)
    clean_list, pure_list = prepare_for_regex(text_data)

    data_to_summarize = []
    for clean, pure in zip(clean_list, pure_list):
        if re.findall(clause, clean):
            data_to_summarize.append(pure)

    text_data = " ".join(data_to_summarize)
    parser = PlaintextParser(text_data, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = KLSummarizer(stemmer)

    summary = summarizer(parser.document, SENTENCES_COUNT)

    if len(summary) == 0:
        summary = ["No Terms"]

    sentences = [str(x) for x in summary]
    message = HTML_OPEN + "<ul class='rolldown-list' id='myList'>"

    you_agree = []
    they_agree = []
    other_clause = []
    for sentence in sentences:
        # TODO: logging in the future
        lower = sentence.lower()
        you_idx = lower.find("you")
        they_idx = lower.find("we")  # "we" in the terms text refers to the provider ("they")
        if ((you_idx == -1 or you_idx > 15)
                and (they_idx == -1 or they_idx > 15)):
            other_clause.append(sentence)
        elif you_idx == -1:
            they_agree.append(sentence)
        elif they_idx == -1:
            you_agree.append(sentence)
        elif you_idx < they_idx:
            you_agree.append(sentence)
        else:
            they_agree.append(sentence)

    if len(you_agree) > 0:
        message += YOU_AGREE_HEADER + "<li>"
        message += "</li><li>".join(you_agree)
        message += "</li>"

    if len(they_agree) > 0:
        message += THEY_AGREE_HEADER + "<li>"
        message += "</li><li>".join(they_agree)
        message += "</li>"

    if len(other_clause) > 0:
        message += OTHER_HEADER + "<li>"
        message += "</li><li>".join(other_clause)
        message += "</li>"

    message += "</ul></div>"

    return json.dumps(message)
Code example #9
def summarize(text, sentence_count, summarizer_type="lsa"):
    summarizer_class = AVAILABLE_METHODS[summarizer_type]
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)  # LANGUAGE defaults to English
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return summarizer(parser.document, sentence_count)
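The AVAILABLE_METHODS registry is not part of the snippet; a minimal sketch of what it presumably maps, so the lookup above resolves (key names other than the "lsa" default are guesses):

from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

AVAILABLE_METHODS = {
    "lsa": LsaSummarizer,
    "lex-rank": LexRankSummarizer,
}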
Code example #10
def get_summary_per_section_edmund(cur_sents, each_summ_num):
    summarizer = EdmundsonSummarizer()
    # Note: sumy's EdmundsonSummarizer also expects bonus_words, stigma_words
    # and null_words to be set before summarizing (compare code example #28).
    parser = PlaintextParser(cur_sents, Tokenizer(LANGUAGE))
    summ = summarizer(parser.document, each_summ_num)
    decoded = []
    for line in summ:
        decoded.append(line._text)
    return decoded
Code example #11
 def summary(self, int1, int2):
     # int1, int2 are the places between which to look for
     # the summary to be taken (slicing the corpus as a string)
     parser = PlaintextParser(self.corpus[int1:int2], Tokenizer("english"))
     summarizer = LsaSummarizer(stem_word)
     summarizer.stop_words = get_stop_words("english")
     self.summary_text = " ".join(
         map(lambda x: x._text, summarizer(parser.document, 20)))
     return self.summary_text
Code example #12
def summarize(url, language, sentences):
    """Return a generated summary of url content."""
    text = get_text(url)
    parser = PlaintextParser(text, Tokenizer(language))
    summarizer = LsaSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    for sentence in summarizer(parser.document, sentences):
        print(sentence)
Code example #13
File: bot.py Project: StanfordVL/arxivbot
def summarize(string, num_sentence=3):
    """
    Summarize a sentence with sumy
    """
    parser = PlaintextParser(string, tknz)
    parser.stop_word = get_stop_words(lang)
    summ_string = ''
    for sentence in summarizer(parser.document, num_sentence):
        summ_string += str(sentence) + ' '
    return summ_string
Code example #14
def get_summary_per_section_lsa(cur_sents, each_summ_num):
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    parser = PlaintextParser(cur_sents, Tokenizer(LANGUAGE))
    summ = summarizer(parser.document, each_summ_num)
    decoded = []
    for line in summ:
        decoded.append(line._text)
    return decoded
Code example #15
def get_summary(text):
    parser = PlaintextParser(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document,
                         3)  # Summarize the document with 3 sentences
    paragraph = ""
    for sentence in summary:
        paragraph += sentence._text
        paragraph += " "
    return paragraph
Code example #16
def summary(text):

    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    short = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # Python 2 idiom: str(sentence) yields bytes, then non-ASCII is stripped
        short = short + ">" + "* " + str(sentence).decode('ascii', 'ignore') + "\n\n"
        #print(sentence)
    return short
Code example #17
def test_rouge_l_sentence_level():
    reference_text = "police killed the gunman"
    reference = PlaintextParser(reference_text,
                                Tokenizer("english")).document.sentences

    candidate1_text = "police kill the gunman"
    candidate1 = PlaintextParser(candidate1_text,
                                 Tokenizer("english")).document.sentences

    candidate2_text = "the gunman kill police"
    candidate2 = PlaintextParser(candidate2_text,
                                 Tokenizer("english")).document.sentences

    candidate3_text = "the gunman police killed"
    candidate3 = PlaintextParser(candidate3_text,
                                 Tokenizer("english")).document.sentences

    assert rouge_l_sentence_level(candidate1, reference) == approx(3 / 4)
    assert rouge_l_sentence_level(candidate2, reference) == approx(2 / 4)
    assert rouge_l_sentence_level(candidate3, reference) == approx(2 / 4)
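The 3/4 follows from the longest common subsequence over words divided by the reference length; a self-contained sketch (illustrative, not sumy's implementation):

def lcs_length(a, b):
    # Classic dynamic-programming LCS over word sequences.
    table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            if x == y:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    return table[-1][-1]

reference = "police killed the gunman".split()
candidate1 = "police kill the gunman".split()
print(lcs_length(candidate1, reference) / len(reference))  # 0.75 == 3 / 4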
Code example #18
def summarize(text, text_language, num_sentences):
    parser = PlaintextParser(text, Tokenizer(text_language))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(text_language)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(text_language)
    summary = ''
    for sentence in summarizer(parser.document, num_sentences):
        summary += str(sentence)
    return summary
Code example #19
File: summarizers.py Project: surgoku/Summarizer
def get_summaries(article, topic):

    parser = PlaintextParser(article, Tokenizer(LANGUAGE))
    summarizer_list = [
        head_summarizer,
        LsaSummarizer,
        LexRankSummarizer,
        LuhnSummarizer,
        EdmundsonSummarizer,
    ]  #gensim_summarizer]  # , lex_rank]
    summarizers_name = ["Head Summarizer", "LSA", "LexRank", "Luhn",
                        "Edmund"]  #, "Gensim"]

    summary_dict = {}

    for Summarizer, name in zip(summarizer_list, summarizers_name):
        summary = ''
        if Summarizer == head_summarizer:
            summary = head_summarizer(article)
        elif Summarizer == gensim_summarizer:
            ratio = 0.01
            while len(summary.split()) < 15:
                summary = Summarizer(" ".join(article.split('\n')),
                                     ratio=ratio)
                ratio = ratio * 5
            if len(summary.split()) > 40:
                summary = tokenize_article_to_sentences(summary)[0]

        elif Summarizer == EdmundsonSummarizer:
            summarizer = Summarizer()
            summarizer.bonus_words = topic.lower().split()
            summarizer.stigma_words = stop_words
            summarizer.null_words = stop_words

            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary += str(sentence)

        else:
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = stop_words
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary += str(sentence)

        #summary = re.sub("[\(\[].*?[\)\]]", "", summary)
        #summary = " ".join(summary.split())

        if '( listen)' in summary or '(listen)' in summary:
            summary = re.sub(r'\( ?listen\)', ' ', summary)

        summary_dict[name] = summary

    return summary_dict
Code example #20
def summarize(text, sentence_count, bonus_words, language='english'):
    '''
    Edmundson summary biased toward the given bonus words.
    '''
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = bonus_words
    # Edmundson requires non-empty stigma words; a nonsense token effectively disables them
    summarizer.stigma_words = ['zdfgthdvndadv']
    summarizer.null_words = stopwords.words('english')
    summary = summarizer(
        PlaintextParser(text, Tokenizer(language)).document, sentence_count)
    return summary
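A hypothetical call to the function above; the input text and bonus words are illustrative:

text = "Magnetic stimulation may ease symptoms. This filler sentence is off topic."
for sentence in summarize(text, 1, bonus_words=['magnetic', 'stimulation']):
    print(sentence)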
Code example #21
def lsa_summaries(filepath, word_count):
    with open(filepath, 'rb') as filehandle:
        texts_str = pickle.load(filehandle)
    lsa_summary = []
    for t in texts_str:
        parser = PlaintextParser(t, Tokenizer('english'))
        # Grow the summary sentence by sentence until it reaches word_count words
        for i in range(len(t.split('.'))):
            summary, len_summary = lsa_summarizer(parser, i)
            if len_summary >= word_count:
                break
        lsa_summary.append(summary)
    return lsa_summary
Code example #22
    def test_rouge_l_sentence_level(self):
        reference_text = "police killed the gunman"
        reference = PlaintextParser(reference_text,
                                    Tokenizer("english")).document.sentences

        candidate1_text = "police kill the gunman"
        candidate1 = PlaintextParser(candidate1_text,
                                     Tokenizer("english")).document.sentences

        candidate2_text = "the gunman kill police"
        candidate2 = PlaintextParser(candidate2_text,
                                     Tokenizer("english")).document.sentences

        candidate3_text = "the gunman police killed"
        candidate3 = PlaintextParser(candidate3_text,
                                     Tokenizer("english")).document.sentences

        self.assertAlmostEqual(rouge_l_sentence_level(candidate1, reference),
                               3 / 4)
        self.assertAlmostEqual(rouge_l_sentence_level(candidate2, reference),
                               2 / 4)
        self.assertAlmostEqual(rouge_l_sentence_level(candidate3, reference),
                               2 / 4)
Code example #23
def summarize(text,
              summarizer,
              sentence_count,
              bonus_words=['MLK', 'rights'],
              language='english'):
    summarizer = summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    if isinstance(summarizer, EdmundsonSummarizer):
        summarizer.bonus_words = bonus_words
        summarizer.stigma_words = ['zdfgthdvndadv']
        summarizer.null_words = summarizer.stop_words
    summary = summarizer(
        PlaintextParser(text, Tokenizer(language)).document, sentence_count)
    return summary
Code example #24
def summarizer(sentences):
    '''
    input: sentences (text to summarize)
    output: list of the sentences in the summary
    '''
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(sentences, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # creates a list of the sentences in the summary
    summary_list = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary_list.append(str(sentence))
    return summary_list
Code example #25
def blog_to_sum(tech, blog):
    docparser = PlaintextParser(blog, Tokenizer("english"))
    if tech == 'lsa':
        results = lsaa(docparser)
    elif tech == 'luhn':
        results = luhnn(docparser)
    elif tech == 'klsum':
        results = klsumm(docparser)
    elif tech == 'textrank':
        results = textrankk(docparser)
    elif tech == 'lexrank':
        results = lexrankk(docparser)
    else:
        # Guard against returning an unbound name for unrecognized techniques
        raise ValueError("unknown summarization technique: " + tech)

    return results
Code example #26
def textrank_test(doc_dir,
                  ref_dir,
                  summary_length=75,
                  use_stop_words=True,
                  use_lemmatizer=True):
    """
    Args:
        doc_dir (str): Input chapters directory
        ref_dir (str): Reference summaries directory
        summary_length (int) : Length of summary
    """
    docs = sorted(os.listdir(doc_dir))
    refs = sorted(os.listdir(ref_dir))

    documents = []
    references = []
    for d, r in zip(docs, refs):
        doc, ref = Utils.load_document(doc_dir + "/" + d, ref_dir + "/" + r)
        p_doc_headings = Utils.process_document(doc, use_stop_words,
                                                use_lemmatizer)
        p_doc = Utils.remove_headings(p_doc_headings)
        p_ref = Utils.process_document(ref, use_stop_words, use_lemmatizer)
        documents.append(p_doc)
        references.append(p_ref)  #Processed reference

    references = Utils.join_docs(references)
    rouge_scores = [0.0] * len(documents)
    rouge_index = 0
    for d, r in zip(docs, refs):
        # Perform Textrank; "।" (the danda) is the sentence delimiter in Indic scripts
        text = '। '.join(' '.join(item) for item in documents[rouge_index])
        parser = PlaintextParser(text, Utils.Tokenizer())  # Processed input
        summarizer = TextRankSummarizer()
        '''if stopwords:
            summarizer.stop_words = Utils.load_stop_words()'''
        summary = summarizer(parser.document, summary_length)
        p_sum = ""
        with open("textrank_summary_" + d, 'w',
                  encoding="utf-8") as output_file:
            for sentence in summary:
                output_file.write(str(sentence) + "।\n")
                p_sum += str(sentence) + " "
        rouge_scores[rouge_index] = Utils.calculate_rouge(
            p_sum, [references[rouge_index]], 1)
        rouge_index += 1

    print(rouge_scores)
Code example #27
File: url.py Project: yashsingh96/ShortTerm
def summarize(selected_text, n=3):
    from sumy.parsers.plaintext import PlaintextParser  #We're choosing a plaintext parser here, other parsers available for HTML etc.
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer  #We're choosing Lexrank, other algorithms are also built in

    output = ''

    parser = PlaintextParser(selected_text, Tokenizer("english"))
    summarizer = LexRankSummarizer()

    summary = summarizer(parser.document,
                         n)  # Summarize the document with n sentences

    for sentence in summary:
        output += str(sentence)

    return output
Code example #28
File: ml.py Project: divyamsingh13/ResearchPaper
    def summarize4(self, df):
        #http://ai.intelligentonlinetools.com/ml/text-summarization/
        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        stopwords = nltk.corpus.stopwords.words('english')
        for row in df['conclusion']:
            if row == '0' or row == '':
                continue
            parser = PlaintextParser(row, Tokenizer(LANGUAGE))
            print("--LsaSummarizer--")
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                print(sentence)

            print("--LuhnSummarizer--")
            summarizer = LuhnSummarizer()

            summarizer.stop_words = stopwords
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                print(sentence)

            print("--EdmundsonSummarizer--")
            summarizer = EdmundsonSummarizer()
            words = ("deep", "learning", "neural")
            summarizer.bonus_words = words

            words = (
                "another",
                "and",
                "some",
                "next",
            )
            summarizer.stigma_words = words

            words = (
                "another",
                "and",
                "some",
                "next",
            )
            summarizer.null_words = words
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                print(sentence)
Code example #29
def apply(text, interest, top_k=5):
    """
    Return and tuple of list of tuple.
        The first list contains all the sentence about the interest and there corresponding weigth in the document
        The second list contains the top_k sentences of the document
    """
    LANGUAGE = "english"
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sent_importance = summarizer.rate_sentences(parser.document)
    interesting_sent = []
    for sent in sent_importance:
        if interest.lower() in sent._text.lower():
            interesting_sent.append((sent._text, sent_importance[sent]))
    top_sent = summarizer(parser.document, top_k)
    top_sent = [(s._text, sent_importance[s]) for s in top_sent]
    return (interesting_sent, top_sent)
Code example #30
def summarizer(algo):
    text = excel_preprocessor()
    if algo == 'gensimTextRank':  # compare strings with ==, not "is"
        text_summarized = summarize(text, word_count=5000)
        return text_summarized
    elif algo == 'sumyTextRank':
        parser = PlaintextParser(text, Tokenizer('english'))
        stemmer = Stemmer('english')
        text_summarizer = TextRankSummarizer(stemmer)
        text_summarizer.stop_words = get_stop_words('english')
        summarized = text_summarizer(parser.document, 25)
        sent = []
        for sentence in summarized:
            sent.append(str(sentence))
        text_summarized = "\n\n".join(sent)
        return text_summarized
    else:
        msg = "Accepted Inputs are: \n1. gensimTextRank\n2. sumyTextRank"
        print(msg)