Example #1
    def test_dictionary_without_stop_words(self):
        summarizer = LsaSummarizer()
        # Stop words are matched case-insensitively by the summarizer.
        summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

        document = build_document(
            ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
            ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
            ("Some relevant sentence", "Some moRe releVant sentEnce",),
        )

        # Only the non-stop words should end up in the dictionary.
        expected = frozenset(["some", "more", "relevant", "sentence"])
        dictionary = summarizer._create_dictionary(document)
        self.assertEqual(expected, frozenset(dictionary.keys()))
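For orientation, here is a minimal standalone sketch of the same LsaSummarizer API outside the test harness; the input string and the sentence count are illustrative assumptions, not part of the original example.

# Minimal usage sketch (assumed inputs): summarize a plain string with sumy's LSA summarizer.
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

text = "Some relevant sentence. Some more relevant sentences follow it."  # hypothetical input
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = LsaSummarizer()
summarizer.stop_words = get_stop_words("english")  # same stop-word mechanism the test exercises
for sentence in summarizer(parser.document, 1):  # keep the single best sentence
    print(sentence)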
Example #3
import math

import numpy as np
from numpy.linalg import svd as singular_value_decomposition
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.utils import get_stop_words


def LSAPlus_SumPlus(doc):

    # SumPlus: rank each text's sentences with SumBasic (best-rated first).
    sumbasic_sents = []
    for text in doc:
        tsummarizer_w_stops = SumBasicSummarizer()
        tsummarizer_w_stops.stop_words = get_stop_words('english')
        parser = PlaintextParser.from_string(text, Tokenizer('english'))
        # _compute_ratings returns a dict keyed by sentence, inserted in
        # best-to-worst order, so iterating it yields sentences by rank.
        dictionary = tsummarizer_w_stops._compute_ratings(
            parser.document.sentences)
        sumbasic_sents_entries = []
        for sent in dictionary:
            sumbasic_sents_entries.append(sent)
        sumbasic_sents.append(sumbasic_sents_entries)

    # LSAPlus: rank each text's sentences by their weight in the first
    # singular vector of the LSA term-sentence matrix.
    lsa_sents = []
    for text in doc:
        l2summarizer = LsaSummarizer()
        parser = PlaintextParser.from_string(text, Tokenizer('english'))
        dictionary = l2summarizer._create_dictionary(parser.document)
        matrix = l2summarizer._create_matrix(parser.document, dictionary)
        matrix2 = l2summarizer._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix2,
                                                   full_matrices=False)
        # v[0, :] holds one weight per sentence for the first topic; argsort
        # on its magnitudes gives sentence indices from highest to lowest.
        # (The original looked sorted values up in the sorted list itself,
        # which returns wrong indices whenever scores repeat.)
        v_indices = np.argsort(-np.abs(v[0, :]))

        sents = np.array(list(parser.document.sentences))
        lsa_sents_entries = list(sents[v_indices])
        lsa_sents.append(lsa_sents_entries)

    # Combining SumPlus and LSAPlus: the bottom half of each ranking is the
    # poorly-rated tail of every entry.
    num_sentences = len(sumbasic_sents)
    all_sents_removed_parent2 = []
    for entry in range(num_sentences):
        num_sents_to_remove = math.ceil(len(sumbasic_sents[entry]) / 2)
        sent_len = len(sumbasic_sents[entry])
        sb = sumbasic_sents[entry][sent_len - num_sents_to_remove:sent_len]
        lsa = lsa_sents[entry][sent_len - num_sents_to_remove:sent_len]

        # Keep only sentences ranked poorly by BOTH LSAPlus and SumPlus.
        sents_removed3 = []
        for sent in lsa:
            if sent in sb:
                sents_removed3.append(sent)

        # Sentences to be trimmed off.
        all_sents_removed_parent2.append(sents_removed3)

    sents_to_keep_parent2 = []
    for i in range(len(doc)):
        parser = PlaintextParser.from_string(doc[i], Tokenizer('english'))
        sents = parser.document.sentences

        # Sentences not Trimmed Off
        sents_to_keep2 = [
            sentence for sentence in sents
            if sentence not in all_sents_removed_parent2[i]
        ]

        # Appending Trimmed Text for Each Entry
        sents_to_keep_parent2.append(sents_to_keep2)

    # Trimmed text: re-join the kept sentences of each entry into one string.
    sentence_parent2 = []
    for text in sents_to_keep_parent2:
        sentence_parent2.append(' '.join(str(sent) for sent in text))

    return sentence_parent2
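A hypothetical invocation of the function above; the sample texts are made up for illustration:

# Hypothetical usage: drop the sentences that both rankers place in the bottom half.
docs = [
    "First sentence here. A second sentence follows. A third one too. And a fourth.",
    "Another short text. It has several sentences. This is the final one.",
]
for summary in LSAPlus_SumPlus(docs):
    print(summary)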
Example #4
# Standard-library and third-party imports needed by this example. Processor,
# CleaningProcessor, UNIVERSAL_TO_DATAMUSE, WHITESPACE_PATTERN, word_count and
# list_sums are assumed to come from elsewhere in the same repository.
from collections import OrderedDict
from os import path
from typing import Dict, List, Optional, Set, Tuple, cast

import nltk
import pycorpora
import spacy
import yaml
from datamuse import Datamuse  # python-datamuse; import path may vary by version
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from spacy.tokens import Token
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words


class SpreadShredProcessor(Processor):
    separator = "<:>"

    def __init__(self, input_texts: List[str]):
        nltk.download('punkt')  # sent_tokenize below needs the punkt models

        self.nlp = spacy.load('en_core_web_lg')

        self.summarizer = LsaSummarizer(Stemmer('english'))
        self.summarizer.stop_words = get_stop_words('english')

        self.cleaner = CleaningProcessor()

        self.synonyms: Dict[str, Optional[List[str]]] = {}
        if path.isfile('src/syns.yaml'):
            with open('src/syns.yaml', 'r') as f:
                self.synonyms = yaml.safe_load(f)

        if self.synonyms is None:
            self.synonyms = {}

        self.patterns: Dict[str, str] = OrderedDict()
        self.rev_patterns: Dict[str, str] = OrderedDict()

        with open('src/spreadr_shreddr/data.yaml', 'r') as f:
            data = yaml.safe_load(f)

        self.patterns.update(data['shorten'])
        self.patterns.update(data['expand'])

        data['filler'].extend(
            pycorpora.get_file('humans', 'prefixes')['prefixes'])

        self.patterns.update({k: '' for k in data['filler']})

        for obj in pycorpora.get_file('words', 'compounds')['compounds']:
            key = '{} {}'.format(obj['firstWord'], obj['secondWord'])
            if key not in self.patterns:
                self.patterns[key] = obj['compoundWord']

        self.patterns.update(
            {k.capitalize(): v.capitalize()
             for k, v in self.patterns.items()})

        self.brits = data['brit_am']
        self.murcans = {v: k for k, v in self.brits.items()}

        changed = False
        api = Datamuse()
        for text in input_texts:
            text >>= self.cleaner

            for sent in sent_tokenize(text):
                for index, word in enumerate(self.nlp(sent)):
                    orth = word.orth_.lower()
                    key = self.separator.join((orth, word.tag_))

                    if key not in self.synonyms:
                        changed = True
                        syns: List[str] = []

                        if (word.pos_ in UNIVERSAL_TO_DATAMUSE
                                and len(wn.synsets(orth)) <= 1):
                            res = api.words(ml=orth)

                            if len(res) > 0:
                                # sent is already a sentence string; joining it
                                # with ' ' would space out every character.
                                syns = self._get_synonyms(
                                    sent, (index, word), res)

                        if len(syns) > 1:
                            self.synonyms[key] = syns
                        else:
                            self.synonyms[key] = None

                    if changed:
                        changed = False
                        with open('src/syns.yaml', 'a') as f:
                            f.write(yaml.dump({key: self.synonyms[key]}))

    def _get_synonyms(self, sentence: str, word: Tuple[int, Token],
                      candidates: list) -> List[str]:
        def tagged(x):
            return ('tags' in x and 'syn' in x['tags']
                    and 'prop' not in x['tags'] and word_count(x['word']) == 1)

        def match_pos(x):
            return (word[1].tag_ == self.nlp(
                sentence.replace(word[1].orth_, x['word']))[word[0]].tag_
                    and UNIVERSAL_TO_DATAMUSE[word[1].pos_] in x['tags'])

        return [word[1].orth_] + [
            obj['word'] for obj in candidates if tagged(obj) and match_pos(obj)
        ]

    def replace_synonym(self, word: Token,
                        desired_char_change: int) -> Tuple[str, int]:
        key = self.separator.join((word.orth_.lower(), word.tag_))

        if key in self.synonyms and self.synonyms[key] is not None:
            length = len(word.orth_)
            ideal = min(
                cast(List[str], self.synonyms[key]),
                key=lambda x: abs(desired_char_change + length - len(x)))

            return (ideal, desired_char_change + length - len(ideal))

        return (word.orth_, desired_char_change)

    def brit_am(self, word: Token,
                desired_char_change: int) -> Tuple[str, int]:
        length = len(word.orth_)
        ideal = word.orth_

        if desired_char_change < 0 and word.orth_ in self.brits:
            ideal = self.brits[word.orth_]

        if desired_char_change > 0 and word.orth_ in self.murcans:
            ideal = self.murcans[word.orth_]

        return (ideal, desired_char_change + length - len(ideal))

    def summarize(self, excerpt: str, len_s: int) -> str:
        parser = PlaintextParser.from_string(excerpt, Tokenizer('english'))

        document = parser.document
        dictionary = self.summarizer._create_dictionary(document)

        if dictionary is None:
            return excerpt

        words_count = len(dictionary)
        sentences_count = len(document.sentences)
        if words_count < sentences_count:
            return excerpt

        sents = self.summarizer(parser.document, len_s)

        return ' '.join(str(s) for s in sents)

    def process_text(self, input_text: str, **kwargs) -> str:
        cleaned_text = input_text >> self.cleaner

        # Setup
        char_length = kwargs.get('char_length', None)

        dchar_change = 0
        if char_length is not None:
            dchar_change = char_length - len(cleaned_text)

        word_length = kwargs.get('word_length', None)

        dword_change = 0
        if word_length is not None:
            dword_change = word_length - word_count(cleaned_text)
            print('a', dword_change)

        # Summarize paragraphs
        if dword_change < 0:
            # Helper: number of sentences in a paragraph, as sumy parses it.
            def sentence_count(p: str) -> int:
                parser = PlaintextParser.from_string(p, Tokenizer('english'))
                return len(parser.document.sentences)

            paragraphs = {
                x: word_count(x)
                for x in (p >> self.cleaner for p in input_text.split('\n\n'))
            }
            pgraph_keys = sorted(paragraphs, key=sentence_count, reverse=True)

            ideals: List[Tuple[int, str, str]] = []
            d = {}
            for len_s in range(1, sentence_count(pgraph_keys[0])):
                for p in pgraph_keys:
                    if sentence_count(p) <= len_s:
                        break

                    repl = self.summarize(p, len_s)

                    if repl.count('"') % 2 != 0:
                        continue

                    diff = word_count(repl) - paragraphs[p]

                    if diff == 0:
                        continue

                    d[repl] = p

                    ideals.append((diff, p, repl))

            # Check paragraph combinations
            possible = list_sums((x[0] for x in ideals), dword_change)
            if len(possible) > 0:
                for diff, p, repl in min(possible, key=len):
                    cleaned_text = cleaned_text.replace(p, '{}'.format(repl))

                    dword_change -= diff

                    if char_length is not None:
                        dchar_change -= len(repl) - len(p)
            else:
                excluded: Set[str] = set()
                for diff, p, repl in sorted(
                        ideals,
                        key=lambda x: abs(dword_change - x[0]),
                        reverse=True):
                    if (abs(dword_change - diff) < abs(dword_change)
                            and p not in excluded):
                        excluded.add(p)

                        cleaned_text = cleaned_text.replace(
                            p, '{}'.format(repl))

                        dword_change -= diff

                        if char_length is not None:
                            dchar_change -= len(repl) - len(p)

        def space(x):
            return WHITESPACE_PATTERN.sub(' ', ' {} '.format(x))

        # Patterns for word count
        done: Set[str] = set()
        while dword_change < 0 and len(done) != len(self.patterns):
            done.clear()
            for k, v in self.patterns.items():
                if dword_change == 0 and dchar_change == 0:
                    print('b', dword_change)
                    return cleaned_text

                if v == '':
                    diff = -word_count(k)
                else:
                    diff = word_count(v) - word_count(k)

                if (abs(dword_change - diff) < abs(dword_change)
                        and cleaned_text.find(space(k)) != -1):
                    cleaned_text = cleaned_text.replace(space(k), space(v), 1)

                    dword_change -= diff
                    if char_length is not None:
                        dchar_change -= len(v) - len(k)

                else:
                    done.add(k)

        # Synonyms and spellings for char count
        for word in self.nlp(cleaned_text):
            if dword_change == 0 and dchar_change == 0:
                print('c', dword_change)
                return cleaned_text

            while dchar_change != 0 and cleaned_text.find(space(
                    word.orth_)) != -1:
                (repl, dchar_change) = self.replace_synonym(word, dchar_change)
                if repl == word.orth_:
                    break  # no usable synonym; avoid looping forever
                cleaned_text = cleaned_text.replace(space(word.orth_),
                                                    space(repl), 1)

            while dchar_change != 0 and cleaned_text.find(space(
                    word.orth_)) != -1:
                (repl, dchar_change) = self.brit_am(word, dchar_change)
                if repl == word.orth_:
                    break  # no spelling variant applies; avoid looping forever
                cleaned_text = cleaned_text.replace(space(word.orth_),
                                                    space(repl), 1)

        print('d', dword_change)
        return cleaned_text
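A hypothetical invocation, assuming the repository's Processor/CleaningProcessor machinery, the YAML corpus files under src/, and an installed en_core_web_lg spaCy model; the input text and word target are made up:

# Hypothetical usage: rewrite a text toward a target word count.
texts = ["An entirely illustrative example text. It has exactly two sentences."]
processor = SpreadShredProcessor(texts)
print(processor.process_text(texts[0], word_length=10))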