def preprocess_text(text):
    """Take a text, generate readability and syntactic features, and return them as a dict.

    Args:
        text (str): the text to be preprocessed.

    Returns:
        dict: a dictionary of feature names with associated values.
    """
    text = _simplify_punctuation(text)
    features = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "smog_index": textstat.smog_index(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(text),
        "difficult_words": textstat.difficult_words(text),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "gunning_fog": textstat.gunning_fog(text),
        "text_standard": textstat.text_standard(text, float_output=True),
        "mean_parse_tree_depth": get_mean_parse_tree_depth(text),
        "mean_ents_per_sentence": get_mean_ents_per_sentence(text),
    }
    features.update(get_mean_pos_tags(text))
    return features
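The helpers referenced above (`_simplify_punctuation`, `get_mean_parse_tree_depth`, `get_mean_ents_per_sentence`, `get_mean_pos_tags`) are defined elsewhere in the project. As an illustration only, a minimal sketch of what `get_mean_parse_tree_depth` might look like, assuming a spaCy pipeline is available:

import spacy

# Assumption: a small English pipeline is installed
# (python -m spacy download en_core_web_sm).
_nlp = spacy.load("en_core_web_sm")

def get_mean_parse_tree_depth(text):
    """Hypothetical helper: mean dependency-parse depth per sentence."""
    doc = _nlp(text)
    depths = []
    for sent in doc.sents:
        max_depth = 0
        for token in sent:
            depth = 0
            node = token
            while node.head is not node:  # walk up to the sentence root
                node = node.head
                depth += 1
            max_depth = max(max_depth, depth)
        depths.append(max_depth)
    return sum(depths) / len(depths) if depths else 0.0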
def get_readability_scores(self, doc):
    segment = doc.text
    readability_dict = {
        "automated_readability_index": textstat.automated_readability_index(segment),
        "coleman_liau_index": textstat.coleman_liau_index(segment),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(segment),
        "difficult_words": textstat.difficult_words(segment),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(segment),
        "flesch_reading_ease": textstat.flesch_reading_ease(segment),
        "gunning_fog": textstat.gunning_fog(segment),
        "linsear_write_formula": textstat.linsear_write_formula(segment),
        "smog_index": textstat.smog_index(segment),
        "text_standard": self._convert_text_standard_to_integer(
            textstat.text_standard(segment)
        ),
    }
    return readability_dict
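`_convert_text_standard_to_integer` is defined elsewhere on the same class. A rough sketch, assuming `textstat.text_standard` returns a string such as "8th and 9th grade" when `float_output` is left at its default, could simply pull out the first grade number:

import re

def _convert_text_standard_to_integer(self, text_standard):
    """Hypothetical method on the same class: extract the first grade number
    from a string like '8th and 9th grade'; return 0 if no digits are found."""
    match = re.search(r"\d+", text_standard)
    return int(match.group()) if match else 0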
def parse_comment(subreddit_name, body):
    # raw metrics
    sentences = ts.sentence_count(body)
    words = ts.lexicon_count(body)
    syllables = ts.syllable_count(body)
    # words with three or more syllables
    trisyllabic = ts.polysyllabcount(body)

    # derived readability metrics
    fk_grade = ts.flesch_kincaid_grade(body)
    smog = ts.smog_index(body)

    return (subreddit_name, sentences, words, syllables, trisyllabic,
            fk_grade, smog)
def get_score(text):
    # Normalize each textstat metric against precomputed corpus statistics
    # (the MEAN_*/STD_* module-level constants).
    scores = []
    scores.append((tst.avg_sentence_length(text) - MEAN_SL) / STD_SL)
    scores.append((tst.avg_letter_per_word(text) - MEAN_AL) / STD_AL)
    scores.append(tst.avg_sentence_per_word(text))
    scores.append((tst.sentence_count(text) - MEAN_SC) / STD_SC)
    scores.append((tst.flesch_kincaid_grade(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.flesch_reading_ease(text) - 50) / 50)
    scores.append((tst.smog_index(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.coleman_liau_index(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.automated_readability_index(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.dale_chall_readability_score(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.linsear_write_formula(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.gunning_fog(text) - MEAN_GRADE) / MEAN_GRADE)
    return scores
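The `tst` alias and the MEAN_*/STD_* constants are assumed to be defined at module level. A minimal usage sketch, with purely illustrative placeholder values (the real ones would be estimated from whatever reference corpus the project uses):

import textstat as tst

# Hypothetical corpus statistics, illustrative values only.
MEAN_SL, STD_SL = 15.0, 5.0    # average sentence length (words)
MEAN_AL, STD_AL = 4.5, 0.6     # average letters per word
MEAN_SC, STD_SC = 20.0, 12.0   # sentence count
MEAN_GRADE = 9.0               # typical grade-level score

feature_vector = get_score("Readable sentences make for readable documents.")
print(len(feature_vector))  # 12 normalized features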
def post(self, args):
    text = args['text']
    readability = {}
    readability["flesch_reading_ease"] = textstat.flesch_reading_ease(text)
    readability["flesch_kincaid_grade"] = textstat.flesch_kincaid_grade(text)
    readability["smog_index"] = textstat.smog_index(text)
    readability["coleman_liau_index"] = textstat.coleman_liau_index(text)
    readability["automated_readability_index"] = textstat.automated_readability_index(text)
    readability["dale_chall_readability_score"] = textstat.dale_chall_readability_score(text)
    readability["linsear_write_formula"] = textstat.linsear_write_formula(text)
    readability["gunning_fog"] = textstat.gunning_fog(text)
    readability["text_standard"] = textstat.text_standard(text)
    readability["difficult_words"] = textstat.difficult_words(text)
    return jsonify(readability)
sentences = sent_tokenize(text)
words = word_tokenize(text)
words_per_sentence = [word_tokenize(sent) for sent in sentences]
non_space_chars = re.sub(space_special_chars, '', text)

words_len = pd.Series([len(word) for word in words])
sentences_len = pd.Series([len(sent) for sent in sentences])
len_words_per_sentence = pd.Series([len(wps) for wps in words_per_sentence])

word_stats = words_len.describe()
word_stats.index = ['Word ' + i for i in word_stats.index]
sent_stats = sentences_len.describe()
sent_stats.index = ['Sentence ' + i for i in sent_stats.index]
wps_stats = len_words_per_sentence.describe()
wps_stats.index = ['Words per sentence ' + i for i in wps_stats.index]

info_dict['Name'] = name
info_dict['Total characters'] = len(non_space_chars)
# info_dict['Total sentences'] = len(sentences)
info_dict.update(word_stats.to_dict())
info_dict.update(sent_stats.to_dict())
info_dict.update(wps_stats.to_dict())
info_dict['Flesch-Kincaid'] = textstat.flesch_kincaid_grade(text)
info_dict['Gunning fog'] = textstat.gunning_fog(text)
info_dict['SMOG'] = textstat.smog_index(text)
info_dicts.append(info_dict)

df = pd.DataFrame(info_dicts)
df.to_csv(r'C:\Users\Krista\DocumentsRE _Call_re_potential_matter\code_results_contents_removed.csv')
def get_delta(self):
    return abs(
        textstat.smog_index(self.input_data)
        - textstat.smog_index(self.output_data)
    )
def get_score(self):
    # SMOG needs sentence boundaries; treat line breaks as sentence ends.
    self.input_data = self.input_data.replace("\n", ". ")
    return textstat.smog_index(self.input_data)
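These two methods appear to belong to a small scorer object that compares an input text with a transformed output. A minimal hypothetical wrapper, shown here only to make the calling context concrete:

import textstat

class SmogDelta:
    """Hypothetical wrapper: compare the SMOG grades of an input text and a
    rewritten output (e.g., before and after simplification)."""

    def __init__(self, input_data, output_data):
        self.input_data = input_data
        self.output_data = output_data

    def get_delta(self):
        return abs(textstat.smog_index(self.input_data)
                   - textstat.smog_index(self.output_data))

scorer = SmogDelta("The original passage, written in long sentences.",
                   "The short rewrite. It uses small words.")
print(scorer.get_delta())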
from textstat import textstat

if __name__ == '__main__':
    # Sample sentence used to exercise each textstat metric
    test_data = 'The quick brown fox jumps over the lazy dog'

    print(textstat.flesch_reading_ease(test_data))
    print(textstat.smog_index(test_data))
    print(textstat.flesch_kincaid_grade(test_data))
    print(textstat.coleman_liau_index(test_data))
    print(textstat.automated_readability_index(test_data))
    print(textstat.dale_chall_readability_score(test_data))
    print(textstat.difficult_words(test_data))
    print(textstat.linsear_write_formula(test_data))
    print(textstat.gunning_fog(test_data))
    print(textstat.text_standard(test_data))
def index():
    data = request.json
    print(f'Debug: {data}')

    unique_id = data['unique_id']
    process_language = data['process_language']
    message = data['message']

    matches_list = None
    if process_language:
        # LanguageTool takes a while to process
        language_tool = LanguageTool('en-US')
        matches: list[Match] = language_tool.check(message)
        matches_list = []
        for match in matches:
            matches_list.append(match_to_dict(match))
        print(f'Analysis finished: {matches_list}')

    sentences: list = splitter.split(text=message)

    return {
        'unique_id': unique_id,
        'text_statistics': {
            'lexicon_count': textstat.lexicon_count(message),
            'lexicon_count_ps': list_map(sentences, textstat.lexicon_count),
            'syllable_count': textstat.syllable_count(message),
            'syllable_count_ps': list_map(sentences, textstat.syllable_count),
            'sentences': sentences,
            'sentence_count': len(sentences),
            'readability': {
                'flesch_reading_ease': {
                    'score': textstat.flesch_reading_ease(message),
                    'sps': list_map(sentences, textstat.flesch_reading_ease)
                },
                'smog_index': {
                    'score': textstat.smog_index(message)
                },
                'flesch_kincaid_grade': {
                    'score': textstat.flesch_kincaid_grade(message),
                    'sps': list_map(sentences, textstat.flesch_kincaid_grade)
                },
                'coleman_liau_index': {
                    'score': textstat.coleman_liau_index(message),
                    'sps': list_map(sentences, textstat.coleman_liau_index)
                },
                'automated_readability_index': {
                    'score': textstat.automated_readability_index(message),
                    'sps': list_map(sentences, textstat.automated_readability_index)
                },
                'dale_chall_readability_score': {
                    'score': textstat.dale_chall_readability_score(message),
                    'sps': list_map(sentences, textstat.dale_chall_readability_score)
                },
                'difficult_words': {
                    'score': textstat.difficult_words(message),
                    'sps': list_map(sentences, textstat.difficult_words),
                    'words': textstat.difficult_words_list(message)
                },
                'linsear_write_formula': {
                    'score': round(textstat.linsear_write_formula(message), 2),
                    'sps': list_map(sentences, textstat.linsear_write_formula)
                },
                'gunning_fog': {
                    'score': textstat.gunning_fog(message),
                    'sps': list_map(sentences, textstat.gunning_fog)
                },
                'text_standard': {
                    'score': textstat.text_standard(message)
                }
            }
        },
        'language_tool': matches_list
    }
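`list_map` and `match_to_dict` are helpers assumed to exist elsewhere in the service; `match_to_dict` presumably serializes a LanguageTool `Match` into a JSON-safe dict, and `list_map` presumably just applies a metric to every sentence, roughly:

def list_map(items, fn):
    """Hypothetical helper: apply a metric function to each item (here, each
    sentence) and return the per-item results as a list."""
    return [fn(item) for item in items]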