def ts():
    """Build a TextStats fixture from a fixed congressional-speech excerpt.

    Returns:
        text_stats.TextStats: stats computed over the stripped speech text.
    """
    speech = """ Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet. Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government. Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do. """
    # Strip the padding spaces before parsing, then wrap the parsed doc.
    return text_stats.TextStats(Doc(speech.strip(), lang="en"))
def setUp(self):
    """Create the shared fixtures used by each test case.

    Exposes ``self.doc`` (the parsed document) and ``self.ts`` (its
    TextStats wrapper); both names are read by the test methods.
    """
    self.doc = Doc(TEXT, lang='en')
    self.ts = text_stats.TextStats(self.doc)
def ts():
    """Return a TextStats instance built from the module-level TEXT sample."""
    # Parse and wrap in one expression; no intermediate names needed.
    return text_stats.TextStats(Doc(TEXT, lang='en'))
def setUp(self):
    """Create the shared fixtures used by each test case.

    Exposes ``self.doc`` and ``self.ts`` for the test methods.
    NOTE(review): ``lang='en_core_web_sm'`` names a spaCy model rather than
    the bare language code used elsewhere in this file ('en') — confirm this
    difference is intentional.
    """
    self.doc = Doc(TEXT, lang='en_core_web_sm')
    self.ts = text_stats.TextStats(self.doc)
def score_summary_2(summary_text):
    """Score a summarized piece of text.

    Combines several heuristics into one weighted score:
    inter-paragraph similarity, readability distance from the Ebert
    baseline, Coh-Metrix-style anaphor/person overlap, sentence
    subjectivity, intra-paragraph sentiment spread, a thesis-sentence
    penalty, and a boost for first-person authorial voice.

    Args:
        summary_text (str): the summary to evaluate; paragraphs are
            separated by newlines.

    Returns:
        float: the combined score (higher is better).
    """
    # Want high similarity between paragraphs
    inter_paragraph_similarities = []
    sentences = [i.text for i in NLP(summary_text).sents]

    # readability measures close to ebert baseline (relative distance)
    readability = abs(
        text_stats.TextStats(NLP(summary_text)).automated_readability_index
        - EBERT_READABILITY) / EBERT_READABILITY

    # Coh Metrix Indices
    anaphor_score = anaphor_overlap(summary_text)
    person_score = person_overlap(summary_text)

    # more subjective is better
    total_subjectivity = 0
    for i in sentences:
        total_subjectivity += TextBlob(i).sentiment[1]
    # FIX: guard against ZeroDivisionError when the text yields no sentences.
    subjectivity = total_subjectivity / len(sentences) if sentences else 0

    # thesis sentence doesn't have "this", "here", "it"
    # FIX: guard sentence indexing — a one-sentence summary previously raised
    # IndexError on sentences[1].
    thesis_markers = [" this ", " This ", " here ", " Here"]
    if sentences and sentences[0] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[0] for i in thesis_markers)
    elif len(sentences) > 1 and sentences[1] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[1] for i in thesis_markers)
    else:
        thesis_penalty = 0

    # Prefer expressions from the author
    author_count = 0
    for s in sentences:
        if any(i in s for i in ["I ", "I'd", "My"]):
            author_count += 1

    # iterate through the paragraphs
    # sentiment within a paragraph is similar
    paragraphs = summary_text.split('\n')
    for i in range(1, len(paragraphs)):
        if paragraphs[i - 1] not in [' ', '', '\n'] and paragraphs[i] not in [
                ' ', '', '\n'
        ]:
            inter_paragraph_similarities.append(
                similarity.word_movers(NLP(paragraphs[i - 1]),
                                       NLP(paragraphs[i])))

    # Largest sentiment spread inside any single paragraph.
    max_diff = 0
    for p in paragraphs:
        p_sent_min = None
        p_sent_max = None
        for s in p.split('.'):
            sent = TextBlob(s).sentiment[0]
            if p_sent_min is None:
                p_sent_min = sent
            if p_sent_max is None:
                p_sent_max = sent
            if sent < p_sent_min:
                p_sent_min = sent
            if sent > p_sent_max:
                p_sent_max = sent
        if max_diff < abs(p_sent_max - p_sent_min):
            max_diff = abs(p_sent_max - p_sent_min)
    max_diff = 1 - max_diff

    # FIX: a single-paragraph summary left inter_paragraph_similarities
    # empty and raised ZeroDivisionError here; fall back to 0.
    if inter_paragraph_similarities:
        avg_similarity = sum(inter_paragraph_similarities) / len(
            inter_paragraph_similarities)
    else:
        avg_similarity = 0

    # Make score
    score = (0.25 * avg_similarity) + \
        (0.20 * person_score) + \
        (0.15 * anaphor_score) + \
        (0.1 * max_diff) + \
        (0.05 * readability) + \
        (0.25 * subjectivity)

    # boost by person count
    score = score * (1 + (0.1 * author_count))
    score = score - (0.2 * thesis_penalty)
    return score
def ari(doc):
    """Compute an automated-readability-style grade score for *doc*.

    NOTE(review): the coefficients (0.4, 6, 27.4) differ from the commonly
    cited ARI formula (0.5 * words/sents + 4.71 * chars/words - 21.43) —
    confirm these constants are the intended variant.

    Args:
        doc: a parsed document accepted by ``ts.TextStats``.

    Returns:
        float: the readability score.
    """
    stats = ts.TextStats(doc)
    words_per_sentence = stats.n_words / stats.n_sents
    chars_per_word = stats.n_chars / stats.n_words
    return 0.4 * words_per_sentence + 6 * chars_per_word - 27.4
def flesch(doc):
    """Compute the Flesch Reading Ease score for *doc*.

    Uses the standard constants: 206.835 - 1.015 * (words/sentences)
    - 84.6 * (syllables/words). Higher scores indicate easier text.

    Args:
        doc: a parsed document accepted by ``ts.TextStats``.

    Returns:
        float: the Flesch Reading Ease score.
    """
    stats = ts.TextStats(doc)
    words_per_sentence = stats.n_words / stats.n_sents
    syllables_per_word = stats.n_syllables / stats.n_words
    return 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word