import re

# assumes project helpers: separator, czech_stemmer, tokenize,
# remove_stop_words, pos_tag and textrank


def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$', pre_paragraphs[i + 1])):
            paragraphs.append(p)

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i + 1}: {s}')

    # TOKENIZE (optionally with aggressive stemming)
    stem = False
    if stem:
        tokenized_sentences = [[czech_stemmer.cz_stem(word, aggressive=True)
                                for word in sentence]
                               for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(tokenized_sentences,
                                                              keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i + 1}: {' '.join(s)}''')
    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'#{i + 1}: {s}')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'#{i + 1}: {s}')

    summary = ''
    counter = 0
    summary_length = max(min(round(len(sentences) / 4), 15), 3)  # between 3 and 15 sentences
    ranked_sentence_indexes = textrank(tokenized_sentences_without_stopwords,
                                       stopwords=[], top_n=summary_length)
    print(f'ranked_sentence_indexes: {ranked_sentence_indexes}')

    # always include the 1st sentence
    summary += f'{sentences[0]}\n'
    counter += 1
    if 0 in ranked_sentence_indexes:  # guard: 0 may not be among the ranked indexes
        ranked_sentence_indexes.remove(0)

    # include the 2nd sentence as well if it ranks in the top 50%
    if 1 in ranked_sentence_indexes[:len(ranked_sentence_indexes) // 2]:
        summary += f'{sentences[1]}\n'
        counter += 1
        ranked_sentence_indexes.remove(1)

    for sentence_index in sorted(ranked_sentence_indexes[:summary_length - counter]):
        if counter == summary_length:
            break
        summary += f'{sentences[sentence_index]}\n'
        counter += 1
    return summary
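# A minimal usage sketch for summarize(); the sample text is made up and the
# helper modules listed above must be importable for this to run:
if __name__ == '__main__':
    sample = ('První věta článku. Druhá věta rozvíjí téma.\n'
              '\n'
              'Další odstavec pokračuje dalšími podrobnostmi.')
    print(summarize(sample))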
def preprocess_sentences(loaded_sentences):
    sentences = []
    # for quick testing, cap the range below (e.g. range(1000)) to load fewer texts
    for i in range(len(loaded_sentences)):
        splitted = loaded_sentences[i].split()
        preprocessed_sentence = []
        for j in range(len(splitted)):
            formatted = to_string(splitted[j])
            without_stopwords = remove_stopwords(formatted)
            if without_stopwords == '':
                continue
            # stemming can be aggressive or not, according to a flag
            stemmed = stem.cz_stem(without_stopwords)
            preprocessed_sentence.append(stemmed)
        sentences.append(preprocessed_sentence)
    return sentences
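# to_string() and remove_stopwords() are project helpers defined elsewhere; a
# minimal sketch of plausible implementations (the stopword list and the
# punctuation stripping are assumptions):
import string

CZECH_STOPWORDS = {'a', 'i', 'je', 'se', 'na', 'v', 'že'}  # hypothetical subset

def to_string(token):
    # normalize a raw token: lower-case it and strip surrounding punctuation
    return token.lower().strip(string.punctuation)

def remove_stopwords(word):
    # map stopwords to '' so the caller can skip them
    return '' if word in CZECH_STOPWORDS else word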
def add_post(self, post):
    id = int(post[0])
    text = post[3]
    tokens = self.tokenizer.tokenize(text)
    self.tf[id] = {}
    for t in tokens:
        t = t.lower()
        # skip short words or stopwords
        if len(t) < 3 or t in self.stopwords:
            continue
        t = czech_stemmer.cz_stem(t)
        self.tf[id][t] = self.tf[id].get(t, 0.0) + 1.0
        # on the first occurrence in this post, bump the term's document frequency
        if self.tf[id][t] == 1.0:
            self.corpus[t] = self.corpus.get(t, 0.0) + 1.0
    # normalize raw counts by the post length
    for t in self.tf[id]:
        self.tf[id][t] /= float(len(tokens))
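# add_post() fills self.tf (per-post term frequencies, normalized by post
# length) and self.corpus (document frequencies). A sketch of how the two
# tables could be combined into a TF-IDF weight; the method name and the
# num_posts parameter are assumptions, not part of the original class:
import math

def tf_idf(self, post_id, term, num_posts):
    tf = self.tf.get(post_id, {}).get(term, 0.0)
    df = self.corpus.get(term, 0.0)
    if tf == 0.0 or df == 0.0:
        return 0.0
    # log-scaled inverse document frequency
    return tf * math.log(num_posts / df)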
def preprocess_sentences(loaded_sentences, use_stopwords=True, use_stemm=False):
    sentences = []
    for i in range(len(loaded_sentences)):
        splitted = loaded_sentences[i].split()
        preprocessed_sentence = []
        for j in range(len(splitted)):
            formatted = to_string(splitted[j])
            without_stopwords = formatted
            if use_stopwords:
                without_stopwords = remove_stopwords(formatted)
            if without_stopwords == '':
                continue
            # stemming can be aggressive or not, according to a flag
            stemmed = without_stopwords
            if use_stemm:
                stemmed = stem.cz_stem(stemmed)
            preprocessed_sentence.append(stemmed)
        sentences.append(preprocessed_sentence)
    return sentences
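# Example call with both preprocessing steps switched on; the input sentences
# are made up and to_string/remove_stopwords/stem must be importable:
loaded = ['Toto je první věta .', 'A toto je druhá věta .']
preprocessed = preprocess_sentences(loaded, use_stopwords=True, use_stemm=True)
# -> a list of token lists, stopword-free and stemmed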
import string


def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]
    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
    # Remove stopwords and stem what remains
    texts = [' '.join(czech_stemmer.cz_stem(word)
                      for word in x.split() if word not in stops)
             for x in texts]
    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]
    return texts
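# Example usage with a hypothetical stopword list; cz_stem is called above
# without the aggressive flag, i.e. light stemming:
stops = ['a', 'je', 'to']
print(normalize_text(['Je to, pravda, jen PŘÍKLAD č. 1!'], stops))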
print('rs', metrics.adjusted_rand_score(lemmas, forms))
# print('mi', metrics.mutual_info_score(lemmas, forms),
#       metrics.adjusted_mutual_info_score(lemmas, forms),
#       metrics.normalized_mutual_info_score(lemmas, forms))

print('stem5')
stem5s = [form[:5] for form in forms]  # naive baseline: first five characters
measure(lemmas, stem5s)

print('czstem light')
czstem = [cz_stem(form, aggressive=False) for form in forms]
measure(lemmas, czstem)

print('czstem aggressive')
czstem = [cz_stem(form, aggressive=True) for form in forms]
measure(lemmas, czstem)

print('lemma is form')
measure(lemmas, forms)  # lower bound: no stemming at all

print('gold')
measure(lemmas, lemmas)  # upper bound: perfect lemmatization

import random
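# measure() is defined elsewhere in the project; judging by the calls above it
# wraps the sklearn clustering metrics. A minimal sketch - the exact set of
# metrics it prints is an assumption:
from sklearn import metrics

def measure(lemmas, predicted):
    # treat gold lemmas and predicted stems as two clusterings of the forms
    print('rs', metrics.adjusted_rand_score(lemmas, predicted))
    print('ami', metrics.adjusted_mutual_info_score(lemmas, predicted))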
import re

import numpy as np

# assumes project modules: separator, czech_stemmer, tokenize, remove_stop_words,
# pos_tag, rbm, textrank and the individual *_feature functions


def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$', pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i + 1}: {s}')

    # TOKENIZE (optionally with light stemming)
    stem = False
    if stem:
        tokenized_sentences = [[czech_stemmer.cz_stem(word, aggressive=False)
                                for word in sentence]
                               for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(tokenized_sentences,
                                                              keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i + 1}: {' '.join(s)}''')
    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'#{i + 1}: {s}')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'#{i + 1}: {s}')

    # 1. THEMATICITY FEATURE
    thematicity_feature_scores = thematicity_feature(tokenized_sentences_without_stopwords)
    # 2. SENTENCE POSITION FEATURE - NOTE: weak on its own
    sentence_position_scores = sentence_position_feature(len(sentences))
    # 3. SENTENCE LENGTH FEATURE
    sentence_length_scores = sentence_length_feature(tokenized_sentences)
    # 4. SENTENCE PARAGRAPH POSITION FEATURE
    # 5. PROPER_NOUN FEATURE
    proper_noun_scores = proper_noun_feature(tagged_sentences)
    # 6. NUMERALS FEATURE
    numerals_scores = numerals_feature(tokenized_sentences)
    # 7. NAMED ENTITIES FEATURE - very similar to the PROPER_NOUN FEATURE
    # 8. TF_ISF FEATURE - NOTE: maybe use TextRank instead; tf_isf_orig is mediocre
    tf_isf_scores = tf_isf_orig_feature(tokenized_sentences_without_stopwords)
    # 9. CENTROID SIMILARITY FEATURE
    centroid_similarity_scores = centroid_similarity_feature(sentences, tf_isf_scores)
    # 10. UPPER-CASE FEATURE (not in the paper)
    upper_case_scores = upper_case_feature(tokenized_sentences)
    # 11. QUOTES FEATURE (not in the paper)
    quotes_scores = quotes_feature(sentences)
    # 12. REFERENCES FEATURE (not in the paper)
    references_scores = references_feature(tokenized_sentences)
    # 13. TEXTRANK FEATURE (not in the paper)
    textrank_scores = textrank.textrank(tokenized_sentences, True, '4-1-0.0001')

    # assemble the feature matrix (features 4, 7 and 11-13 are computed but not used)
    feature_matrix = []
    feature_matrix.append(thematicity_feature_scores)
    feature_matrix.append(sentence_position_scores)
    feature_matrix.append(sentence_length_scores)
    feature_matrix.append(proper_noun_scores)
    feature_matrix.append(numerals_scores)
    feature_matrix.append(tf_isf_scores)
    feature_matrix.append(centroid_similarity_scores)
    feature_matrix.append(upper_case_scores)

    features = [' thema', 'sen_pos', 'sen_len', ' propn', ' num',
                ' tf_isf', 'cen_sim', ' upper']
    # transpose to a sentences x features matrix
    feature_matrix_2 = np.zeros((len(sentences), len(features)))
    for i in range(len(features)):
        for j in range(len(sentences)):
            feature_matrix_2[j][i] = feature_matrix[i][j]
    feature_sum = list(np.sum(feature_matrix_2, axis=1))

    print('=====Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{i + 1:2d}: {s[:30]}', end='|')
        for f_s in feature_matrix:
            print('{: .4f}'.format(round(f_s[i], 4)), end='|')
        print('{: .4f}'.format(round(feature_sum[i], 4)))

    print('Training rbm...')
    rbm_trained = rbm.test_rbm(dataset=feature_matrix_2,
                               learning_rate=0.1,
                               training_epochs=14,
                               batch_size=5,
                               n_chains=5,
                               n_hidden=len(features))
    # alternative RBM implementation, from sklearn:
    # rbm2 = BernoulliRBM(n_components=len(features), n_iter=14, batch_size=5, learning_rate=0.1)
    # rbm_trained = rbm2.fit_transform(feature_matrix_2)
    # print(rbm_trained)
    rbm_trained_sums = np.sum(rbm_trained, axis=1)

    print('=====RBM Enhanced Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{i + 1:2d}: {s[:30]}', end='|')
        for f_s in rbm_trained[i]:
            print('{: .4f}'.format(round(f_s, 4)), end='|')
        print('{: .4f}'.format(round(rbm_trained_sums[i], 4)))

    # pair each score with its sentence index
    enhanced_feature_sum = []
    feature_sum = []
    plain_sums = np.sum(feature_matrix_2, axis=1)
    for i in range(len(rbm_trained_sums)):
        enhanced_feature_sum.append([rbm_trained_sums[i], i])
        feature_sum.append([plain_sums[i], i])
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # note the asymmetry: enhanced scores are sorted ascending, plain scores descending
    enhanced_feature_sum.sort(key=lambda x: x[0])
    feature_sum.sort(key=lambda x: -1 * x[0])
    print('=====Sorted=====')
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # print('=====The text=====')
    # for x in range(len(sentences)):
    #     print(sentences[x])

    # always include the 1st sentence, then add the top-ranked ones
    extracted_sentences_rbm = [[sentences[0], 0]]
    extracted_sentences_simple = [[sentences[0], 0]]
    summary_length = max(min(round(len(sentences) / 4), 12), 3)  # between 3 and 12 sentences
    for x in range(summary_length):
        if enhanced_feature_sum[x][1] != 0:
            extracted_sentences_rbm.append([sentences[enhanced_feature_sum[x][1]],
                                            enhanced_feature_sum[x][1]])
        if feature_sum[x][1] != 0:
            extracted_sentences_simple.append([sentences[feature_sum[x][1]],
                                               feature_sum[x][1]])

    # restore the original sentence order
    extracted_sentences_rbm.sort(key=lambda x: x[1])
    extracted_sentences_simple.sort(key=lambda x: x[1])

    final_text_rbm = ''
    for i in range(len(extracted_sentences_rbm)):
        final_text_rbm += extracted_sentences_rbm[i][0] + '\n'
    final_text_simple = ''
    for i in range(len(extracted_sentences_simple)):
        final_text_simple += extracted_sentences_simple[i][0] + '\n'

    print('=====Extracted Final Text RBM=====')
    print(final_text_rbm)
    print()
    print('=====Extracted Final Text Simple=====')
    print(final_text_simple)
    return final_text_rbm
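# A usage sketch for the feature-based summarizer; the input file name is
# hypothetical and the project modules listed above must be available:
if __name__ == '__main__':
    with open('article.txt', encoding='utf-8') as f:
        print(summarize(f.read()))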