Example 1
    def _summ(self, text, n_sentences, model='textrank-g'):
        """ This function creates a summary for the specified model. If no model is given, textrank with gensim
            (textrank-g) is computed by default.

            Args:
            -----
                text (string)      >>> Text that we want to summarize.

                n_sentences (int)  >>> Optional. Used by sumy algorithms. Number of sentences chosen for the summary.

                models (string)    >>> Optional. Model used to summarize.
                                       Default is set to textrank-g.

            Returns:
            --------
                String with the summary for the choosen model.
        """

        lang = Summarization.get_lang(text)

        parser = PlaintextParser.from_string(text, Tokenizer(lang))

        # The four sumy summarizers differ only in the class used; the rest of
        # the pipeline (summarize the parsed document, join the sentences) is identical.
        sumy_summarizers = {
            'lexrank-s': LexRankSummarizer,
            'textrank-s': TextRankSummarizer,
            'lsa-s': LsaSummarizer,
            'luhn-s': LuhnSummarizer,
        }

        if model in sumy_summarizers:
            summarizer = sumy_summarizers[model]()
            summary = summarizer(parser.document, n_sentences)
            summary_p = ' '.join(str(sentence) for sentence in summary)

        else:
            # Default: gensim TextRank, which expects the fraction of sentences
            # to keep rather than an absolute count.
            total_number_of_sentences = Summarization.count_sentences(text)
            ratio = n_sentences / total_number_of_sentences

            summary_p = gensim_summarize(text, ratio=ratio).replace('\n', ' ')

        return summary_p
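
A minimal usage sketch for the method above; the zero-argument Summarization() constructor and the sample file name are assumptions, not part of the original snippet:

# Hypothetical usage; Summarization() with no arguments and 'article.txt' are assumptions.
summ = Summarization()
article = open('article.txt').read()                           # any multi-sentence text
print(summ._summ(article, n_sentences=3))                      # default: gensim TextRank
print(summ._summ(article, n_sentences=3, model='lexrank-s'))   # sumy LexRank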
Example 2
def text_summary(select_func, title_holder):
    title_holder.markdown('# ' + select_func.split()[-1].upper())
    st.info('Raw content')
    raw_content = st.empty()
    demo_checkbox = st.sidebar.checkbox('Show news demo')

    st.sidebar.subheader('Input your texts here')
    content = st.sidebar.text_area('Click outside textarea after input',
                                   'Texts need to be summarized.')

    st.sidebar.subheader('Method to compute similarity')
    metric_radio = st.sidebar.radio('', ('Co-occur', 'Cosine'))
    if metric_radio == 'Cosine':
        metric = cosine
        # The embedding-method choice is displayed in the sidebar, but its
        # value is not used further in this function.
        st.sidebar.selectbox('Method to compute sentence embedding',
                             ('SIF', 'Average Embedding'))
    else:
        metric = similarity_with_coocurr

    if demo_checkbox:
        content = news_demo

    if content != 'Texts need to be summarized.':
        raw_content.markdown(content)
        summary = process_pipe(content, metric, tokens_counter, fasttext)
        st.success(f'Summarized content with {metric.__name__}')
        st.write(summary)

        st.warning('Summarized content with gensim API - Unstable')
        try:
            st.write(gensim_summarize(content))
        except ValueError as e:
            st.error(e)

    st.sidebar.subheader('Keywords')
    show_kw = st.sidebar.checkbox('Show Keywords')
    if show_kw:
        kw_length = st.sidebar.slider(label='Keywords Number',
                                      min_value=3,
                                      max_value=10)
        kw_method = st.sidebar.selectbox(
            'Method to extract keywords',
            ('TextRank', 'TextRank API', 'Tf-idf API'))
        if kw_method == 'TextRank':
            keywords = extract_keyword(co_occurrence(content, window_size=2),
                                       topk=kw_length)
        elif kw_method == 'TextRank API':
            keywords = textrank(content, topK=kw_length)
        else:
            keywords = extract_tags(content, topK=kw_length)
        st.sidebar.info('\t'.join(keywords))

        content = pretty_output(content, keywords)
    raw_content.markdown(content, unsafe_allow_html=True)
    bokeh_figure()
Example 3
    def gensim_textrank(self):
        """Summarize each quote with gensim TextRank (short quotes are kept
        verbatim) and score the hypotheses against the references with ROUGE."""
        hypes = []
        refs = []
        refs_dirty = []
        for item in self.items:
            if item.ref:

                if item.quote.ntokens <= self.mean_ntokens or \
                        item.quote.nsents == 1:
                    hypes.append(item.quote.cleaned_text)
                    refs.append(item.ref.cleaned_text)
                    refs_dirty.append(item.ref.original)
                    # print(item.quote.cleaned_text)
                else:
                    # for x in range(2, 11):
                    #     ratio = x * .1
                    hyp = gensim_summarize("\n ".join(item.quote.clean_sents),
                                           word_count=self.mean_ntokens)
                    if len(hyp) > 0:
                        hypes.append(hyp)
                        refs.append(item.ref.cleaned_text)
                        score = rouge.get_scores(hyp, item.ref.cleaned_text)
                        # print(f"{item.uid}\n \t {score}\n \t Hyp: {hyp} \n \t Ref: {item.ref}")

                    elif item.quote.ntokens / item.quote.nsents > self.mean_ntokens:
                        hyp = gensim_summarize("\n ".join(
                            item.quote.clean_sents),
                                               word_count=self.max_ntokens)
                        if len(hyp) > 0:
                            hypes.append(hyp)
                            refs.append(item.ref.cleaned_text)
                    else:
                        print(
                            f"No summary for {item.uid} | {int(item.quote.ntokens/item.quote.nsents)}\t| {item.quote.cleaned_text}"
                        )

                        # break

        hypes = [x.lower() for x in hypes]
        refs = [x.lower() for x in refs]

        return rouge.get_scores(hypes, refs, avg=True)
Example 4
def summarize(text_to_summarize):
    """Wrap gensim's summarizer and report failures in the result dict instead of raising."""
    result = {'success': True, 'summary': '', 'error': ''}

    try:
        summary = gensim_summarize(text_to_summarize)

        if len(summary) == 0:
            raise Exception('Gensim needs more text in order to summarize.')

        result['summary'] = summary

    except Exception as exception:
        result['success'] = False
        result['error'] = str(exception)

    return result
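
A short usage sketch of this wrapper; the input file name is illustrative only:

# Illustrative call; gensim's TextRank needs a reasonably long, multi-sentence input.
article = open('article.txt').read()
result = summarize(article)
if result['success']:
    print(result['summary'])
else:
    print('Summarization failed:', result['error'])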
Example 5
    def _createSummaryWithGensim(self, doc):
        # We use gensim's TextRank implementation for extractive text summarization.
        # TODO: explore other algorithms and options, filter candidate sentences,
        # use the spaCy sentencizer for better results, etc.
        text = str(doc.text)
        # Alternative: "".join(str(s.text) for s in self.filter_for_summarize(doc.sents))

        summarySentences = gensim_summarize(
            text=text,
            # ratio=0.1,
            word_count=200,
            split=True,
        )

        return summarySentences
Example 6
def generate_summary_gensim(text, size_summary=200):
    """
    Performs text summary from gensim summarizer

    Parameters
    ----------
    text: list
        A list of sentences

    Returns
    -------
    summary: list
        List with each item being a single sentence

    """
    text = preprocess_corpus(text)
    text = ". ".join(text)
    return gensim_summarize(text, word_count=size_summary, split=True)
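
A hedged usage sketch, assuming preprocess_corpus accepts a list of sentence strings (its definition is not shown above); the sample sentences are illustrative only:

# Hypothetical input: a list of sentences, as the docstring describes.
sentences = [
    "Gensim implements TextRank for extractive summarization",
    "The algorithm ranks sentences by building a similarity graph",
    "Only the highest-ranked sentences are kept in the summary",
    # ... more sentences; gensim needs enough text to build the graph
]
summary_sentences = generate_summary_gensim(sentences, size_summary=50)
print(summary_sentences)   # a list, because split=True is passed to gensim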
Example 7
    def summarize(self, text: str):
        return gensim_summarize(text, word_count=self.word_count)
Example 8
    def summarize(self, text, **kwargs):
        try:
            summarized = gensim_summarize(text, **kwargs)
            return summarized
        except Exception:
            # gensim raises (e.g. ValueError) when the input is too short or
            # cannot be summarized; fall back to returning the original text.
            return text
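
A brief sketch of the fallback behaviour, assuming the method above lives on an object called wrapper (the enclosing class is not shown) and that 'article.txt' exists:

# Hypothetical object exposing the summarize() method above.
short_text = "Too short to summarize."
long_text = open('article.txt').read()
print(wrapper.summarize(short_text))                  # gensim raises, original text is returned
print(wrapper.summarize(long_text, word_count=100))   # extra kwargs are forwarded to gensim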