Example #1
    def _summarize(self, string, mode, postprocess, **kwargs):
        summary = upperfirst(self._predict(f'{mode}: {cleaning(string)}'))
        if postprocess:
            summary = filter_rouge(string, summary, **kwargs)
            summary = postprocessing_summarization(summary)
            summary = find_lapor_and_remove(string, summary)
        return summary
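
The helper upperfirst is presumably there just to capitalise the first character of the generated summary while leaving the rest of the string untouched. A minimal sketch of that assumed behaviour (not the library's own implementation):

def upperfirst(string):
    # Capitalise only the first character; str.capitalize() is avoided
    # because it would also lowercase the rest of the summary.
    return string[:1].upper() + string[1:]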
Example #2
    def _summarize(
        self,
        strings,
        mode,
        top_p=0.7,
        temperature=1.0,
        postprocess=True,
        **kwargs,
    ):
        mode = mode.lower()
        if mode not in ['ringkasan', 'tajuk']:
            raise ValueError('mode only supports [`ringkasan`, `tajuk`]')

        strings_ = [f'{mode}: {cleaning(string)}' for string in strings]
        batch_x = [self._tokenizer.encode(string) + [1] for string in strings_]
        batch_x = pad_sequences(batch_x, padding='post', maxlen=self._maxlen)

        r = self._execute(
            inputs=[batch_x, top_p, temperature],
            input_labels=['Placeholder', 'top_p', 'temperature'],
            output_labels=['logits'],
        )
        p = r['logits'].tolist()

        results = []
        for no, r in enumerate(p):
            summary = self._tokenizer.decode(r)
            if postprocess and mode != 'tajuk':
                summary = filter_rouge(strings[no], summary, **kwargs)
                summary = postprocessing_summarization(summary)
                summary = find_lapor_and_remove(strings[no], summary)

            results.append(summary)

        return results
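
The pad_sequences(batch_x, padding='post', maxlen=self._maxlen) call turns the ragged list of token ids into one fixed-size batch: every sequence is right-padded with zeros up to self._maxlen, and anything longer is truncated. A minimal NumPy sketch of that post-padding behaviour (illustrative only, assuming the Keras defaults for every argument not passed explicitly):

import numpy as np

def pad_post(sequences, maxlen, value=0):
    # Right-pad each sequence with `value` up to `maxlen` (padding='post');
    # longer sequences are cut from the front, matching the default
    # truncating='pre' of Keras pad_sequences.
    batch = np.full((len(sequences), maxlen), value, dtype=np.int32)
    for i, seq in enumerate(sequences):
        trimmed = seq[-maxlen:]
        batch[i, :len(trimmed)] = trimmed
    return batch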
Example #3
    def _summarize(
        self,
        strings,
        mode,
        decoder='greedy',
        top_p=0.7,
        postprocess=True,
        **kwargs,
    ):
        mode = mode.lower()
        if mode not in ['ringkasan', 'tajuk']:
            raise ValueError('mode only supports [`ringkasan`, `tajuk`]')

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        if decoder not in ['greedy', 'beam', 'nucleus']:
            raise ValueError(
                'decoder only supports [`greedy`, `beam`, `nucleus`]')

        strings_ = [f'{mode}: {cleaning(string)}' for string in strings]

        batch_x = [self._tokenizer.encode(string) + [1] for string in strings_]
        batch_x = padding_sequence(batch_x)

        r = self._execute(
            inputs=[batch_x, top_p],
            input_labels=['Placeholder', 'Placeholder_2'],
            output_labels=[decoder],
        )
        p = r[decoder].tolist()

        results = []
        for no, r in enumerate(p):
            summary = self._tokenizer.decode(r)
            if postprocess and mode != 'tajuk':
                summary = filter_rouge(strings[no], summary, **kwargs)
                summary = postprocessing_summarization(summary)
                summary = find_lapor_and_remove(strings[no], summary)

            results.append(summary)

        return results
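
The top_p argument only comes into play for the nucleus decoder: at each decoding step, sampling is restricted to the smallest set of tokens whose cumulative probability reaches top_p. A standalone sketch of that filtering step (illustrative, not the graph code behind the nucleus output):

import numpy as np

def nucleus_sample(probs, top_p=0.7):
    # Keep the smallest set of highest-probability tokens whose cumulative
    # mass reaches top_p, renormalise, then sample one token id.
    probs = np.asarray(probs, dtype=np.float64)
    order = np.argsort(probs)[::-1]
    cumulative = np.cumsum(probs[order])
    cutoff = np.searchsorted(cumulative, top_p) + 1
    keep = order[:cutoff]
    renormalised = probs[keep] / probs[keep].sum()
    return np.random.choice(keep, p=renormalised)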
Example #4
    def _summarize(
        self,
        strings,
        mode,
        decoder='greedy',
        top_p=0.7,
        postprocess=True,
        **kwargs,
    ):
        mode = mode.lower()
        if mode not in ['ringkasan', 'tajuk']:
            raise ValueError('mode only supports [`ringkasan`, `tajuk`]')

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        output = self._mapping.get(decoder)
        if output is None:
            raise ValueError(
                'decoder only supports [`greedy`, `beam`, `nucleus`]')

        strings_ = [f'{mode}: {cleaning(string)}' for string in strings]

        batch_x = [self._tokenizer.encode(string) + [1] for string in strings_]
        batch_x = padding_sequence(batch_x)

        p = self._sess.run(output,
                           feed_dict={
                               self._X: batch_x,
                               self._top_p: top_p
                           }).tolist()

        results = []
        for no, r in enumerate(p):
            summary = self._tokenizer.decode(r)
            if postprocess:
                summary = filter_rouge(strings[no], summary, **kwargs)
                summary = postprocessing_summarization(summary)
                summary = find_lapor_and_remove(strings[no], summary)

            results.append(summary)

        return results
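
Example #4 resolves the decoder name through self._mapping, presumably a dict from decoder name to the matching output tensor of the frozen graph; validating the result of the lookup, rather than the name string itself, is what keeps an unknown decoder from reaching sess.run as None. A minimal standalone sketch of that lookup-and-validate pattern (resolve_decoder_output is a hypothetical helper, not part of the library):

def resolve_decoder_output(mapping, decoder):
    # mapping: dict of decoder name -> graph output tensor.
    # Fail loudly on an unknown name instead of returning None.
    output = mapping.get(decoder.lower())
    if output is None:
        raise ValueError('decoder only supports [`greedy`, `beam`, `nucleus`]')
    return output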