import re

# project-level helpers assumed by this snippet: separator, czech_stemmer,
# tokenize, remove_stop_words, pos_tag and textrank
def summarize(text):
    # SPLIT TO PARAGRAPHS
    # keep every non-blank line that closes a paragraph, i.e. one that is
    # followed by a blank line or is the last line of the text
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1 or re.match(r'^\s*$', pre_paragraphs[i+1])):
            paragraphs.append(p)

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[czech_stemmer.cz_stem(word, aggressive=True) for word in sentence]
                               for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(sentences, keep_case=True, is_tokenized=False,
                                                         return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')

    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    summary = ''
    counter = 0
    summary_length = max(min(round(len(sentences) / 4), 15), 3)  # length between 3-15 sentences
    ranked_sentence_indexes = textrank(tokenized_sentences_without_stopwords, stopwords=[], top_n=summary_length)
    print(f'ranked_sentence_indexes: {ranked_sentence_indexes}')
    # always add the 1st sentence
    summary += f'{sentences[0]}\n'
    counter += 1
    if 0 in ranked_sentence_indexes:
        ranked_sentence_indexes.remove(0)
    # add also 2nd sentence if it is in top50%
    if 1 in ranked_sentence_indexes[:len(ranked_sentence_indexes) // 2]:
        summary += f'{sentences[1]}\n'
        counter += 1
        ranked_sentence_indexes.remove(1)
    for sentence_index in sorted(ranked_sentence_indexes[:summary_length - counter]):
        if counter == summary_length:
            break
        summary += f'{sentences[sentence_index]}\n'
        counter += 1
    return summary
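
# A minimal, self-contained sketch (not in the original code) of the selection
# rule above: sentence 0 is always kept, sentence 1 is promoted if it ranks in
# the top half, and the remaining slots are filled from the ranked list and
# emitted in document order.
def select_indexes(ranked_indexes, num_sentences):
    summary_length = max(min(round(num_sentences / 4), 15), 3)
    remaining = [i for i in ranked_indexes if i != 0]
    chosen = [0]
    if 1 in remaining[:len(remaining) // 2]:
        chosen.append(1)
        remaining.remove(1)
    chosen.extend(sorted(remaining[:summary_length - len(chosen)]))
    return chosen

# e.g. select_indexes([3, 0, 7, 1, 5, 2], num_sentences=24) -> [0, 1, 2, 3, 5, 7]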
Example #2
def preprocess_sentences(loaded_sentences):
    sentences = []
    # process every loaded text
    for i in range(len(loaded_sentences)):
        splitted = loaded_sentences[i].split()
        preprocessed_sentence = []
        for j in range(len(splitted)):
            formatted = to_string(splitted[j])
            without_stopwords = remove_stopwords(formatted)
            if without_stopwords == '':
                continue
            # cz_stem can stem lightly or aggressively, depending on its flag
            stemmed = stem.cz_stem(without_stopwords)
            preprocessed_sentence.append(stemmed)
        sentences.append(preprocessed_sentence)
    return sentences
Example #3
    def add_post(self, post):
        id = int(post[0])
        text = post[3]
        tokens = self.tokenizer.tokenize(text)

        self.tf[id] = {}

        for t in tokens:
            t = t.lower()
            # skip short words or stopwords
            if len(t) < 3 or t in self.stopwords:
                continue
            t = czech_stemmer.cz_stem(t)

            self.tf[id][t] = self.tf[id].get(t, 0.0) + 1.0
            # first occurrence of t in this post: bump its document frequency
            if self.tf[id][t] == 1.0:
                self.corpus[t] = self.corpus.get(t, 0.0) + 1.0

        # normalise term counts by the post's total token count
        for t in self.tf[id]:
            self.tf[id][t] /= float(len(tokens))
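
# Hedged sketch (not part of the original class): given the structures that
# add_post builds - self.tf[id][t] as the length-normalised term frequency and
# self.corpus[t] as the number of posts containing t - a TF-IDF weight could be
# derived roughly as below; num_posts (total indexed posts) is an assumption.
import math

def tf_idf_weight(tf, corpus, num_posts, post_id, term):
    if term not in corpus:
        return 0.0
    idf = math.log(num_posts / corpus[term])
    return tf.get(post_id, {}).get(term, 0.0) * idf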
Example #4
def preprocess_sentences(loaded_sentences,
                         use_stopwords=True,
                         use_stemm=False):
    sentences = []
    for i in range(len(loaded_sentences)):
        splitted = loaded_sentences[i].split()
        preprocessed_sentence = []
        for j in range(len(splitted)):
            formatted = to_string(splitted[j])
            without_stopwords = formatted
            if use_stopwords:
                without_stopwords = remove_stopwords(formatted)
            if without_stopwords == '':
                continue
            # stemming is optional, controlled by the use_stemm flag
            stemmed = without_stopwords
            if use_stemm:
                stemmed = stem.cz_stem(stemmed)
            preprocessed_sentence.append(stemmed)
        sentences.append(preprocessed_sentence)
    return sentences
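
# Hedged usage note: the two flags make each preprocessing step optional, so the
# same helper can yield raw, stopword-filtered, or stemmed token lists (assuming
# the project-level to_string, remove_stopwords and stem helpers used above):
#
#   raw_tokens     = preprocess_sentences(loaded_sentences, use_stopwords=False, use_stemm=False)
#   stemmed_tokens = preprocess_sentences(loaded_sentences, use_stopwords=True, use_stemm=True)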
Example #5
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]

    # Remove punctuation
    texts = [
        ''.join(c for c in x if c not in string.punctuation) for x in texts
    ]

    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]

    # Remove stopwords and stem the remaining words
    texts = [
        ' '.join([
            czech_stemmer.cz_stem(word) for word in x.split()
            if word not in stops
        ]) for x in texts
    ]

    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]

    return texts
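
# Hedged usage sketch (the stopword list is an illustrative assumption; actual
# stems depend on czech_stemmer.cz_stem): each input string comes back
# lowercased, with punctuation, digits and stopwords removed and the remaining
# words stemmed.
#
#   stops = ['a', 'se', 'na']
#   normalize_text(['Kočky se honily na dvoře, 2 psi štěkali!'], stops)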
Example #6
# assumed context for this snippet: `from sklearn import metrics`,
# `from czech_stemmer import cz_stem`, and parallel lists `forms` (surface
# word forms) and `lemmas` (gold lemmas) loaded elsewhere in the original script
def measure(lemmas, forms):
    print('rs', metrics.adjusted_rand_score(lemmas, forms))
    #print('mi', metrics.mutual_info_score(lemmas, forms),
    #        metrics.adjusted_mutual_info_score(lemmas, forms),
    #        metrics.normalized_mutual_info_score(lemmas, forms))


print('stem5')
stem5s = list()
for form in forms:
    stem5s.append(form[:5])
measure(lemmas, stem5s)

print('czstem light')
czstem = list()
for form in forms:
    czstem.append(cz_stem(form, aggressive=False))
measure(lemmas, czstem)

print('czstem aggressive')
czstem = list()
for form in forms:
    czstem.append(cz_stem(form, aggressive=True))
measure(lemmas, czstem)

print('lemma is form')
measure(lemmas, forms)

print('gold')
measure(lemmas, lemmas)
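
# Hedged illustration (not from the original script): adjusted_rand_score treats
# each distinct stem as a cluster label over the word forms, so a stemmer scores
# high exactly when it groups forms the same way the gold lemmas do.
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(['být', 'být', 'pes'], ['js', 'js', 'ps']))  # 1.0 - same grouping
print(adjusted_rand_score(['být', 'být', 'pes'], ['js', 'by', 'ps']))  # 0.0 - grouping lost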

import random
import re

import numpy as np

# project-level helpers assumed by this snippet: separator, czech_stemmer,
# tokenize, remove_stop_words, pos_tag, the *_feature scoring helpers,
# textrank and rbm
def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$',
                                                      pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[
            czech_stemmer.cz_stem(word, aggressive=False) for word in sentence
        ] for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')

    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    # 1. THEMATICITY FEATURE
    thematicity_feature_scores = thematicity_feature(
        tokenized_sentences_without_stopwords)

    # 2. SENTENCE POSITION FEATURE - NOTE: weak in practice
    sentence_position_scores = sentence_position_feature(len(sentences))

    # 3. SENTENCE LENGTH FEATURE
    sentence_length_scores = sentence_length_feature(tokenized_sentences)

    # 4. SENTENCE PARAGRAPH POSITION FEATURE

    # 5. PROPER_NOUN FEATURE
    proper_noun_scores = proper_noun_feature(tagged_sentences)

    # 6. NUMERALS FEATURE
    numerals_scores = numerals_feature(tokenized_sentences)

    # 7. NAMED ENTITIES FEATURE - very similar to PROPER_NOUN FEATURE

    # 8. TF_ISF FEATURE - NOTE: TextRank may work better than TF_ISF; tf_isf_orig is weak
    tf_isf_scores = tf_isf_orig_feature(tokenized_sentences_without_stopwords)

    # 9. CENTROID SIMILARITY FEATURE
    centroid_similarity_scores = centroid_similarity_feature(
        sentences, tf_isf_scores)

    # 10. UPPER-CASE FEATURE (not in the paper)
    upper_case_scores = upper_case_feature(tokenized_sentences)

    # 11. QUOTES FEATURE (not in the paper)
    quotes_scores = quotes_feature(sentences)

    # 12. REFERENCES FEATURE (not in the paper)
    references_scores = references_feature(tokenized_sentences)

    # 13. TEXTRANK FEATURE (not in the paper)
    textrank_scores = textrank.textrank(tokenized_sentences, True,
                                        '4-1-0.0001')

    # quotes_scores, references_scores and textrank_scores are computed above
    # but not included in the feature matrix
    feature_matrix = []
    feature_matrix.append(thematicity_feature_scores)
    feature_matrix.append(sentence_position_scores)
    feature_matrix.append(sentence_length_scores)
    feature_matrix.append(proper_noun_scores)
    feature_matrix.append(numerals_scores)
    feature_matrix.append(tf_isf_scores)
    feature_matrix.append(centroid_similarity_scores)
    feature_matrix.append(upper_case_scores)

    features = [
        '  thema', 'sen_pos', 'sen_len', '  propn', '    num', ' tf_isf',
        'cen_sim', '  upper'
    ]

    # transpose to a (num_sentences x num_features) matrix
    feature_matrix_2 = np.array(feature_matrix, dtype=float).T

    feature_sum = list(np.sum(feature_matrix_2, axis=1))

    print('=====Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{i + 1:2d}: {s[:30]}', end='|')
        for f_s in feature_matrix:
            print('{: .4f}'.format(round(f_s[i], 4)), end='|')
        print('{: .4f}'.format(round(feature_sum[i], 4)))

    print('Training rbm...')
    rbm_trained = rbm.test_rbm(dataset=feature_matrix_2,
                               learning_rate=0.1,
                               training_epochs=14,
                               batch_size=5,
                               n_chains=5,
                               n_hidden=len(features))
    # another implementation of rbm, from sklearn
    # rbm2 = BernoulliRBM(n_components=len(features), n_iter=14, batch_size=5, learning_rate=0.1)
    # rbm_trained = rbm2.fit_transform(feature_matrix_2)
    # print(rbm_trained)
    rbm_trained_sums = np.sum(rbm_trained, axis=1)

    print('=====RBM Enhanced Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{i + 1:2d}: {s[:30]}', end='|')
        for f_s in rbm_trained[i]:
            print('{: .4f}'.format(round(f_s, 4)), end='|')
        print('{: .4f}'.format(round(rbm_trained_sums[i], 4)))

    enhanced_feature_sum = []
    feature_sum = []

    feature_matrix_sums = np.sum(feature_matrix_2, axis=1)
    for i in range(len(rbm_trained_sums)):
        enhanced_feature_sum.append([rbm_trained_sums[i], i])
        feature_sum.append([feature_matrix_sums[i], i])

    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    enhanced_feature_sum.sort(key=lambda x: x[0])
    feature_sum.sort(key=lambda x: -1 * x[0])
    print('=====Sorted=====')
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # print('=====The text=====')
    # for x in range(len(sentences)):
    #     print(sentences[x])

    extracted_sentences_rbm = []
    extracted_sentences_rbm.append([sentences[0], 0])
    extracted_sentences_simple = []
    extracted_sentences_simple.append([sentences[0], 0])

    summary_length = max(min(round(len(sentences) / 4), 12),
                         3)  # length between 3-12 sentences
    for x in range(summary_length):
        if enhanced_feature_sum[x][1] != 0:
            extracted_sentences_rbm.append([
                sentences[enhanced_feature_sum[x][1]],
                enhanced_feature_sum[x][1]
            ])
        if feature_sum[x][1] != 0:
            extracted_sentences_simple.append(
                [sentences[feature_sum[x][1]], feature_sum[x][1]])

    extracted_sentences_rbm.sort(key=lambda x: x[1])
    extracted_sentences_simple.sort(key=lambda x: x[1])

    final_text_rbm = ''
    for i in range(len(extracted_sentences_rbm)):
        final_text_rbm += extracted_sentences_rbm[i][0] + '\n'
    final_text_simple = ''
    for i in range(len(extracted_sentences_simple)):
        final_text_simple += extracted_sentences_simple[i][0] + '\n'

    print('=====Extracted Final Text RBM=====')
    print(final_text_rbm)
    print()
    print('=====Extracted Final Text simple=====')
    print(final_text_simple)

    return final_text_rbm
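
# Hedged sketch of the sklearn alternative mentioned in the comments above:
# BernoulliRBM learns a hidden representation of the per-sentence feature
# matrix, and its row sums can be ranked the same way as rbm_trained_sums.
import numpy as np
from sklearn.neural_network import BernoulliRBM

def rbm_scores(feature_matrix_2, n_hidden=8):
    rbm = BernoulliRBM(n_components=n_hidden, n_iter=14, batch_size=5, learning_rate=0.1)
    hidden = rbm.fit_transform(feature_matrix_2)  # shape: (n_sentences, n_hidden)
    return np.sum(hidden, axis=1)                 # one enhanced score per sentence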