def lex_readability(self, text, mode='fre'):
    if mode == 'all':
        fre_score = textstat.flesch_reading_ease(text)
        fog_index = textstat.gunning_fog(text)
        fkg_index = textstat.flesch_kincaid_grade(text)
        dcr_score = textstat.dale_chall_readability_score(text)
        text_standard = textstat.text_standard(text, float_output=True)
        return fre_score, fog_index, fkg_index, dcr_score, text_standard
    if mode == 'fre':
        fre_score = textstat.flesch_reading_ease(text)
        return fre_score
    if mode == 'fog':
        fog_index = textstat.gunning_fog(text)
        return fog_index
    if mode == 'fkg':
        fkg_index = textstat.flesch_kincaid_grade(text)
        return fkg_index
    if mode == 'dcr':
        dcr_score = textstat.dale_chall_readability_score(text)
        return dcr_score
    if mode == 'text_std':
        text_standard = textstat.text_standard(text, float_output=True)
        return text_standard
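# Hedged usage sketch (not from the original repo): lex_readability is written as an
# instance method but never touches `self`, so for a quick check it can be called with
# None in place of self. The sample text below is made up for illustration.
import textstat

sample = ("The quick brown fox jumps over the lazy dog. "
          "Readability formulas estimate how hard this sentence is to read.")

print(lex_readability(None, sample, mode='fre'))  # single Flesch reading ease score
print(lex_readability(None, sample, mode='all'))  # tuple: (fre, fog, fkg, dcr, text_standard)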
def check_difficulty(self):
    text = self.textoutput
    # 0-30  = college
    # 50-60 = high school
    # 60+   = middle school/elementary school
    try:
        grade_level = textstat.text_standard(text)
        reading_ease = textstat.flesch_reading_ease(text)  # requires chart
        sentence_count = textstat.sentence_count(text)
        difficult_words = self.get_difficult_words(text)
        replacement_words = self.get_replacement_words(difficult_words)
        output = "Grade Level of Input Text: " + grade_level + "\n"
        # output = output + "Ease of Reading*: " + str(reading_ease) + "\n"
        output = output + "Sentence Count: " + str(sentence_count) + "\n"
        output = output + "Difficult Words Found: " + str(len(difficult_words)) + "\n"
        output = output + "Possible Replacements: " + "\n"
        for dw in replacement_words:
            output = output + dw + " -> "
            for word in replacement_words[dw]:
                output = output + word + ", "
            output = output + "\n"
        self.difficultyReport = output
    except:
        self.difficultyReport = "Error determining Difficulties"
def getReadabilityMetrics(test_data):
    '''
    For a given article IN TEXT FORMAT, returns its readability metrics.
    Uses the textstat library; please install it.
    '''
    metric = {
        "flesch_reading_ease": textstat.flesch_reading_ease(test_data),
        "smog_index": textstat.smog_index(test_data),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(test_data),
        "coleman_liau_index": textstat.coleman_liau_index(test_data),
        "automated_readability_index": textstat.automated_readability_index(test_data),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(test_data),
        "difficult_words": textstat.difficult_words(test_data),
        "linsear_write_formula": textstat.linsear_write_formula(test_data),
        "gunning_fog": textstat.gunning_fog(test_data),
        "text_standard": textstat.text_standard(test_data)
    }
    return metric
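# Hedged usage sketch (assumption, not part of the original source): feed a plain-text
# article into getReadabilityMetrics above and print each metric on its own line.
import textstat

article = ("Readability tests score a text by sentence length and word difficulty. "
           "Shorter sentences with common words usually score as easier to read.")

for name, value in getReadabilityMetrics(article).items():
    print(f"{name}: {value}")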
def readability(text):
    """
    Provides the readability grade for the text, using the Flesch reading ease
    score. The higher the score, the easier the text is to read.

    text: input text on which the score has to be calculated
    """
    score = textstat.flesch_reading_ease(text)
    grade = round(textstat.flesch_kincaid_grade(text))
    if score > 90:
        summary = "Very easy to read. Easily understood by an average 11-year-old student."
    elif score > 80:
        summary = "Easy to read. Conversational English for consumers."
    elif score > 70:
        summary = "Fairly easy to read."
    elif score > 60:
        summary = "Plain English. Easily understood by 13- to 15-year-old students."
    elif score > 50:
        summary = "Fairly difficult to read."
    elif score > 30:
        summary = "Difficult to read."
    else:
        summary = "Very difficult to read. Best understood by university graduates."
    return score, summary, grade
def analyze():
    print(request)
    str_to_read = request.data.decode("utf-8").strip()
    report = {
        "flesch-reading-ease": textstat.flesch_reading_ease(str_to_read),
        "smog-index": textstat.smog_index(str_to_read),
        "flesch-kincaid-grade": textstat.flesch_kincaid_grade(str_to_read),
        "coleman-liau-index": textstat.coleman_liau_index(str_to_read),
        "automated-readability-index": textstat.automated_readability_index(str_to_read),
        "dale-chall-readability-score": textstat.dale_chall_readability_score(str_to_read),
        "difficult-words": textstat.difficult_words(str_to_read),
        "linsear-write-formula": textstat.linsear_write_formula(str_to_read),
        "gunning-fog": textstat.gunning_fog(str_to_read),
        "text-standard": textstat.text_standard(str_to_read)
    }
    return decorate_response(jsonify(report))
def seven_test(processed_essay):
    """
    Readability scores assigned to every script on the basis of some
    predefined formulas.

    :param processed_essay:
    :return: flesch_score, gunning_index, kincaid_grade, liau_index,
             automated_readability_index, dale_readability_score,
             difficult_word, linsear_write
    """
    flesch_score = ["FS"]
    gunning_index = ["GI"]
    kincaid_grade = ["KG"]
    liau_index = ["LI"]
    automated_readability_index = ["ARI"]
    dale_readability_score = ["DLS"]
    difficult_word = ["DW"]
    linsear_write = ["LW"]
    for v in processed_essay:
        flesch_score.append(textstat.flesch_reading_ease(str(v)))
        gunning_index.append(textstat.gunning_fog(str(v)))
        kincaid_grade.append(textstat.flesch_kincaid_grade(str(v)))
        liau_index.append(textstat.coleman_liau_index(str(v)))
        automated_readability_index.append(textstat.automated_readability_index(str(v)))
        dale_readability_score.append(textstat.dale_chall_readability_score(str(v)))
        difficult_word.append(textstat.difficult_words(str(v)))
        linsear_write.append(textstat.linsear_write_formula(str(v)))
    return flesch_score, gunning_index, kincaid_grade, liau_index, automated_readability_index, dale_readability_score, difficult_word, linsear_write
def textstat_stats(text):
    doc_length = len(text.split())
    flesch_ease = ts.flesch_reading_ease(text)    # Flesch Reading Ease score
    flesch_grade = ts.flesch_kincaid_grade(text)  # Flesch-Kincaid grade level
    gfog = ts.gunning_fog(text)                   # FOG index, also indicates grade level
    # smog = ts.smog_index(text)                  # SMOG index, also a grade level; only useful on 30+ sentences
    auto_readability = ts.automated_readability_index(text)  # approximates the grade level needed to comprehend the text
    cl_index = ts.coleman_liau_index(text)        # grade level of the text using the Coleman-Liau formula
    lw_formula = ts.linsear_write_formula(text)   # grade level using the Linsear Write formula
    dcr_score = ts.dale_chall_readability_score(text)  # uses a lookup table of the most commonly used 3000 English words
    # text_standard = ts.text_standard(text, float_output=False)  # summary of all the grade-level functions
    syll_count = ts.syllable_count(text, lang='en_US')
    syll_count_scaled = syll_count / doc_length
    lex_count = ts.lexicon_count(text, removepunct=True)
    lex_count_scaled = lex_count / doc_length
    idx = ['flesch_ease', 'flesch_grade', 'gfog',
           'auto_readability', 'cl_index', 'lw_formula',
           'dcr_score',
           # 'text_standard',
           'syll_count', 'lex_count']
    return pd.Series([flesch_ease, flesch_grade, gfog,
                      auto_readability, cl_index, lw_formula,
                      dcr_score,
                      # text_standard,
                      syll_count_scaled, lex_count_scaled],
                     index=idx)
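# Hedged usage sketch (assumption): because textstat_stats returns a pd.Series, it can be
# applied row-wise to a text column to build a readability feature frame. The DataFrame
# and column name below are illustrative; `ts` is assumed to be `import textstat as ts`,
# and a textstat version that still accepts the lang= argument used above is assumed.
import pandas as pd
import textstat as ts

docs = pd.DataFrame({"text": [
    "The cat sat on the mat. It was warm.",
    "Notwithstanding the aforementioned considerations, the committee deliberated extensively."
]})
features = docs["text"].apply(textstat_stats)
print(features.head())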
def get_stats(text):
    fre = textstat.flesch_reading_ease(text)
    smog = textstat.smog_index(text)
    fkg = textstat.flesch_kincaid_grade(text)
    cli = textstat.coleman_liau_index(text)
    ari = textstat.automated_readability_index(text)
    dcr = textstat.dale_chall_readability_score(text)
    diff_words = textstat.difficult_words(text)
    lwf = textstat.linsear_write_formula(text)
    gunn_fog = textstat.gunning_fog(text)
    consolidated_score = textstat.text_standard(text)
    doc_length = len(text)  # think about excluding spaces?
    quote_count = text.count('"')

    stats = {
        "flesch_reading_ease": fre,
        "smog_index": smog,
        "flesch_kincaid_grade": fkg,
        "coleman_liau_index": cli,
        "automated_readability_index": ari,
        "dale_chall_readability_score": dcr,
        "difficult_words": diff_words,
        "linsear_write_formula": lwf,
        "gunning_fog": gunn_fog,
        "consolidated_score": consolidated_score,
        "doc_length": doc_length,
        "quote_count": quote_count
    }
    return stats
def feature_getter(text):
    try:
        text = text.decode('utf-8')
    except:
        pass
    text1 = re.sub(r'[^\x00-\x7F]+', ' ', text)
    ##text1 = re.sub('\n', '. ', text)
    text = text1
    features = []
    tokens = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        tokens.extend(nltk.word_tokenize(sentence))
    syllable_count = textstat.syllable_count(text, lang='en_US')
    word_count = textstat.lexicon_count(text, removepunct=True)
    flesch = textstat.flesch_reading_ease(text)
    readability = textstat.automated_readability_index(text)
    features.append(len(sentences))   # num_sentences
    features.append(syllable_count)   # num_syllables
    features.append(word_count)       # num_words
    features.append(flesch)           # Flesch reading ease
    features.append(readability)      # automated readability index
    return features
def score(full):
    st.header(textstat.flesch_reading_ease(full))
    st.write('Flesch Reading Ease Score')
    text = """90-100 Very Easy, 70-79 Fairly Easy, 60-69 Standard, 50-59 Fairly Difficult,
    30-49 Difficult, 0-29 Very Confusing"""
    st.write(text, key=1)

    st.header(textstat.smog_index(full))
    st.write('SMOG Index Score')
    text = "Returns the SMOG index of the given text. This is a grade formula, in that a score of 9.3 means that a ninth " \
           "grader would be able to read the document. Texts of fewer than 30 sentences are statistically invalid, " \
           "because the SMOG formula was normed on 30-sentence samples. textstat requires at least 3 sentences for a " \
           "result."
    st.write(text, key=2)

    st.header(textstat.dale_chall_readability_score(full))
    st.write('Dale-Chall Readability Score')
    text = """Different from other tests, since it uses a lookup table of the most commonly used 3000 English words.
    Thus it returns the grade level using the New Dale-Chall Formula.
    4.9 or lower: average 4th-grade student or lower
    5.0–5.9: average 5th or 6th-grade student
    6.0–6.9: average 7th or 8th-grade student
    7.0–7.9: average 9th or 10th-grade student
    8.0–8.9: average 11th or 12th-grade student
    9.0–9.9: average 13th to 15th-grade (college) student"""
    st.write(text, key=3)
def terms_and_weights(sample):
    sentences = list()
    file_path = f"data/Job Bulletins/{sample}"
    with open(file_path) as file:
        contents = file.read()
        # Score the bulletin text itself, not the path string.
        reading_score = textstat.flesch_reading_ease(contents)
        reading_score_2 = textstat.dale_chall_readability_score(contents)
        for line in contents.splitlines():
            for l in re.split(r"\.\s|\?\s|\!\s|\n", line):
                if l:
                    sentences.append(l)
    cvec = CountVectorizer(stop_words='english', min_df=3, max_df=0.5, ngram_range=(1, 2))
    sf = cvec.fit_transform(sentences)
    transformer = TfidfTransformer()
    transformed_weights = transformer.fit_transform(sf)
    weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({
        'term': cvec.get_feature_names(),
        'weight': weights
    })
    weights_df = weights_df.sort_values(by='weight', ascending=False).head(10)
    myList = {
        "term": weights_df.term.tolist(),
        "weight": weights_df.weight.tolist(),
        "scores": [reading_score, reading_score_2]
    }
    return jsonify(myList)
def extractFRSAllHTMLFiles():
    """
    Extract Flesch Reading Scores (FRS). The documents and indexes are read
    again because the FRS is computed on the original text, not the processed text.
    """
    Path1 = 'Gutenberg_English_Fiction_1k'
    Path2 = 'Gutenberg_English_Fiction_1k'
    HTMLFilesPath = 'Gutenberg_19th_century_English_Fiction'
    FRSScores = []
    badIndexes = []
    dataPath = os.path.join(os.getcwd(), Path1, Path2, HTMLFilesPath)
    data = pp.readIndexes()
    for i in range(len(data)):
        print(i)
        htmlFilePath = os.path.join(dataPath, data['book_id'][i])[:-5] + '-content.html'
        corpus = pp.readHTMLFile(htmlFilePath)
        if corpus:
            score = textstat.flesch_reading_ease(corpus)
            FRSScores.append(score)
        else:
            badIndexes.append(i)
    with open(FRSFile, 'wb') as f:
        pickle.dump(FRSScores, f)
def fleschscore() -> List:
    """Return the Flesch reading ease score for each policy."""
    flesch_list = []
    for text in policies['Policy']:
        flesch_list.append(textstat.flesch_reading_ease(text))
    return flesch_list
def readability(queries):
    scores = {
        'Flesch': [],
        'Smog': [],
        'Flesch grade': [],
        'Coleman': [],
        'Automated': [],
        'Dale': [],
        'Difficult': [],
        'Linsear': [],
        'Gunning': [],
        'Text Standard': []
    }
    for line in queries:
        # results = readability.getmeasures(line, lang='en')
        # frescores.append(results['readability grades']['FleschReadingEase'])
        scores['Flesch'].append(textstat.flesch_reading_ease(line))
        scores['Smog'].append(textstat.smog_index(line))
        scores['Flesch grade'].append(textstat.flesch_kincaid_grade(line))
        scores['Coleman'].append(textstat.coleman_liau_index(line))
        scores['Automated'].append(textstat.automated_readability_index(line))
        scores['Dale'].append(textstat.dale_chall_readability_score(line))
        scores['Difficult'].append(textstat.difficult_words(line))
        scores['Linsear'].append(textstat.linsear_write_formula(line))
        scores['Gunning'].append(textstat.gunning_fog(line))
        scores['Text Standard'].append(textstat.text_standard(line, float_output=True))
    return scores
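# Hedged usage sketch (assumption): the dict of lists returned by readability() maps
# directly onto a DataFrame, one row per query, which makes it easy to summarise.
# The sample queries below are made up.
import pandas as pd
import textstat

queries = [
    "Where can I find the nearest train station?",
    "Enumerate the statutory prerequisites for incorporation in this jurisdiction."
]
scores_df = pd.DataFrame(readability(queries))
print(scores_df)
print(scores_df.mean(numeric_only=True))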
def get_readibility(text, metric="flesch_kincaid_grade"):
    """
    Return a score which reveals a piece of text's readability level.

    Reference:
    https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    """
    if metric == "flesch_kincaid_grade":
        result = textstat.flesch_kincaid_grade(text)
    elif metric == "flesch_reading_ease":
        result = textstat.flesch_reading_ease(text)
    elif metric == "smog_index":
        result = textstat.smog_index(text)
    elif metric == "coleman_liau_index":
        result = textstat.coleman_liau_index(text)
    elif metric == "automated_readability_index":
        result = textstat.automated_readability_index(text)
    elif metric == "dale_chall_readability_score":
        result = textstat.dale_chall_readability_score(text)
    elif metric == "difficult_words":
        result = textstat.difficult_words(text)
    elif metric == "linsear_write_formula":
        result = textstat.linsear_write_formula(text)
    elif metric == "gunning_fog":
        result = textstat.gunning_fog(text)
    elif metric == "text_standard":
        result = textstat.text_standard(text)
    else:
        print("ERROR: Please select correct metric!")
        result = None
    return result
def test_changing_lang_clears_cache():
    textstat.set_lang("en_US")

    # Clear any cache and call reading ease
    textstat.flesch_reading_ease.cache_clear()
    textstat.flesch_reading_ease(short_test)

    # Check the cache has only been missed once
    assert textstat.flesch_reading_ease.cache_info().misses == 1

    # Change the language and recall reading ease
    textstat.set_lang("fr")
    textstat.flesch_reading_ease(short_test)

    # Check the cache hasn't been hit again
    assert textstat.flesch_reading_ease.cache_info().misses == 1
def addFleschReadingScore(self):
    """Compute a Flesch reading ease score for each row of the content column."""
    score = []
    for each in self.data['content'].items():
        each = self.removeHTMLTags(each[1])
        score.append(textstat.flesch_reading_ease(each))
    self.data['Flesch_Score'] = score
def get_readability_score(text, metric="flesch"):
    global tknzr, DIFFICULT
    text = text.replace("’", "'")
    # https://pypi.org/project/textstat/
    if metric == "flesch":
        return textstat.flesch_reading_ease(text)
    elif metric == "smog":
        return textstat.smog_index(text)
    elif metric == "coleman_liau_index":
        return textstat.coleman_liau_index(text)
    elif metric == "automated_readability_index":
        return textstat.automated_readability_index(text)
    elif metric == "dale_chall_readability_score":
        return textstat.dale_chall_readability_score(text)
    elif metric == "difficult_words":
        nb_difficult = 0
        nb_easy = 0
        for w in set(tknzr.tokenize(text.lower())):
            if w not in EASY_WORDS and len(w) >= 6:
                nb_difficult += 1
            else:
                nb_easy += 1
        return 100 * nb_difficult / (nb_difficult + nb_easy)
        # return textstat.difficult_words(text)  # /len(text.split())
    elif metric == "linsear_write_formula":
        return textstat.linsear_write_formula(text)
    elif metric == "gunning_fog":
        return textstat.gunning_fog(text)
    elif metric == "avg_word_length":
        words = tknzr.tokenize(text)
        words = [w for w in words if w not in misc_utils.PUNCT]
        if len(words) == 0:
            return 0
        return np.average([len(w) for w in words])
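# Hedged setup sketch (assumptions throughout): get_readability_score above relies on
# module-level globals. Here tknzr is assumed to be an NLTK TweetTokenizer and EASY_WORDS
# a set of common words (the real project likely uses a much larger easy-word list such
# as Dale-Chall); only metrics that do not need misc_utils are exercised.
from nltk.tokenize import TweetTokenizer
import textstat

tknzr = TweetTokenizer()
EASY_WORDS = {"the", "cat", "sat", "on", "mat", "it", "was", "very"}

sample = "The cat sat on the mat. It was extraordinarily comfortable."
print(get_readability_score(sample, metric="flesch"))
print(get_readability_score(sample, metric="difficult_words"))  # % of long, uncommon words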
def get_under_score_text(self, score):
    """Return the sentences whose readability falls below `score`;
    a very low score means complex words are being used in those sentences."""
    reading = self.content.apply(lambda x: flesch_reading_ease(x))
    x = [i for i in range(len(reading)) if reading[i] < score]
    # displacy(self.content.iloc[x].head())
    return self.content.iloc[x]
def main():
    df = pd.DataFrame(columns=['Utility', 'FK Score', 'FK Grade Level'])
    for x in glob.glob('pdfs/*.pdf'):
        try:
            text = parser.from_file(x)
            df = df.append(
                {
                    'Utility': str(x).split('\\')[1].split('.')[0],
                    'FK Score': textstat.flesch_reading_ease(text['content']),
                    'FK Grade Level': textstat.flesch_kincaid_grade(text['content'])
                },
                ignore_index=True)
        except:
            df = df.append(
                {
                    'Utility': str(x).split('\\')[1].split('.')[0],
                    'FK Score': 'N/A',
                    'FK Grade Level': 'N/A'
                },
                ignore_index=True)
    df.to_csv('data/results/readability_results.csv', encoding='utf-8')
def complexityFunction(text, level):
    words = text.split()
    selectedWords = []
    for i in words:
        if textstat.flesch_reading_ease(i) <= level:
            selectedWords.append(i)
    return selectedWords
def sentence_by_sentence_analysis(cleanedtext):
    blob = TextBlob(cleanedtext)
    split_text = blob.sentences
    df = pd.DataFrame((''.join(split_text[i]) for i in range(len(split_text))), columns=['Sentences'])
    df["Sentence Word Count"] = pd.DataFrame(len(df["Sentences"][i].split()) for i in range(len(df)))
    df["FS_GradeScore"] = pd.DataFrame((textstat.flesch_reading_ease(df["Sentences"][i]) for i in range(len(df))))
    df[["TextBlob_Polarity", "TextBlob_Subjectivity"]] = round(pd.DataFrame((split_text[i].sentiment for i in range(len(split_text)))) * 100, 1)
    return df
def score(text):
    a = textstat.flesch_reading_ease(text)
    b = textstat.flesch_kincaid_grade(text)
    c = textstat.gunning_fog(text)
    d = textstat.smog_index(text)
    e = textstat.coleman_liau_index(text)
    f = textstat.automated_readability_index(text)
    return a, b, c, d, e, f
def test_flesch_reading_ease():
    textstat.set_lang("en_US")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 64.75

    textstat.set_lang("de_DE")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 63.1

    textstat.set_lang("es_ES")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 84.37

    textstat.set_lang("fr_FR")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 80.31

    textstat.set_lang("it_IT")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 89.27

    textstat.set_lang("nl_NL")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 61.97

    textstat.set_lang("ru_RU")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 116.45
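# Hedged sketch (assumption, not part of the textstat test suite): as the test above shows,
# flesch_reading_ease depends on the language set via textstat.set_lang, so scoring texts
# in several languages means switching the language per call and restoring it afterwards.
import textstat

def flesch_for(text, lang="en_US"):
    # Score `text` under the given language, then restore English so later
    # calls keep the default behaviour.
    textstat.set_lang(lang)
    try:
        return textstat.flesch_reading_ease(text)
    finally:
        textstat.set_lang("en_US")

print(flesch_for("The cat sat on the mat.", "en_US"))
print(flesch_for("Le chat est assis sur le tapis.", "fr_FR"))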
def reading_level():
    if request.method != 'POST':
        print('Not a post request')
        return 'NOTAVALIDPATH:'
    sentence = request.get_json()['sentence']
    level = textstat.flesch_reading_ease(sentence)
    return json.dumps({'level': level})
def reducer(self, user, reviews):
    OUTPUT_PROTOCOL = CsvProtocol
    reviews = ' '.join(reviews)
    metric = textstat.flesch_reading_ease(reviews)
    writed = user + ', ' + str(metric) + '\n'
    data.write(writed)
    yield user, metric
def get_readability(text):
    # Takes in a string sentence and returns a numeric rating.
    # print(text, '\n')
    # flesch_reading_ease returns a score from 120 down to negative infinity;
    # higher scores are easier to read.
    rating = textstat.flesch_reading_ease(text)
    # print(rating, '\n')
    # text_standard returns the grade level needed to comprehend the text,
    # consolidating several different readability methods.
    # print(textstat.text_standard(text, float_output=False), '\n \n')
    return rating
def extractor(path_to_file):
    book_data = html_parser.extract_text(path_to_file)
    blob = TextBlob(book_data)
    sentiment_start, sentiment_end = start_end_sentiment(blob)
    sentences_count, avg_sentence_len, word_count, proper_noun_count = book_structure(blob, path_to_file)
    flesch_score = textstat.flesch_reading_ease(book_data)
    return sentiment_start, sentiment_end, sentences_count, avg_sentence_len, flesch_score, word_count, proper_noun_count
def do_datas():
    # logging.info('do_datas')

    # Save text statistics:
    # 1. nw  2. nvocab  3. nsyllable  4. nsentence  5. tone  6. readability

    ## 1. nw
    nw.append(len(words))

    ## 2. nvocab
    nvocab.append(len(vocab))

    ## 3. nsyllable
    n = textstat.syllable_count(contents)
    nsyllable.append(n)

    ## 4. nsentence
    n = textstat.sentence_count(contents)
    nsentence.append(n)

    ## 5. tone
    ### LM dictionary
    n_neg_lm.append(count_occurrence(words, lm_neg))
    n_pos_lm.append(count_occurrence(words, lm_pos))
    n_uctt_lm.append(count_occurrence(words, lm_uctt))
    n_lit_lm.append(count_occurrence(words, lm_lit))
    n_cstr_lm.append(count_occurrence(words, lm_cstr))
    n_modal1_lm.append(count_occurrence(words, lm_modal1))
    n_modal2_lm.append(count_occurrence(words, lm_modal2))
    n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation(words, lm_pos, gt_negation))

    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence(words, gi_neg))
    n_pos_gi.append(count_occurrence(words, gi_pos))
    n_negation_gi.append(count_negation(words, gi_pos, gt_negation))

    ### Henry dictionary
    n_neg_hr.append(count_occurrence(words, hr_neg))
    n_pos_hr.append(count_occurrence(words, hr_pos))
    n_negation_hr.append(count_negation(words, gi_pos, gt_negation))

    ## 6. readability
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)

    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)

    # RIX
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)

    f = textstat.gunning_fog(contents)
    fog.append(f)

    f = textstat.automated_readability_index(contents)
    ari.append(f)

    f = textstat.smog_index(contents)
    smog.append(f)
def doc_calc(self, article):
    """Compute readability, profanity, and sentiment features for an article."""
    flesch_ease = textstat.flesch_reading_ease(article)
    flesch_grade = textstat.flesch_kincaid_grade(article)
    gunning = textstat.gunning_fog(article)
    profanity = predict_prob([article])[0]
    polarity = TextBlob(article).sentiment.polarity
    return pd.Series([flesch_ease, flesch_grade, gunning, profanity, polarity])
def test_flesch_reading_ease():
    score = textstat.flesch_reading_ease(long_test)
    assert score == 64.75
import openpyxl
import textstat

wb = openpyxl.load_workbook('testing.xlsx')
ws = wb.get_sheet_by_name('testing_set')

for i in range(2, 591):
    f_essay = ws.cell(row=i, column=3)
    essay = f_essay.value
    score = textstat.flesch_reading_ease(essay)
    ws.cell(row=i, column=11).value = score

wb.save('testing.xlsx')
def getReadabilityScore(title):
    print(textstat.flesch_reading_ease(title[0]))