def readability_scores(self, text):
    """Run a fixed set of ``textstat`` metrics on *text* and store each on ``self``.

    Every metric is called with ``text`` as its only argument and the result
    is assigned to the instance attribute named in the table below. Nothing
    is returned.
    """
    # (attribute set on self, textstat function that produces its value)
    metric_table = (
        ("ari", "automated_readability_index"),
        ("flesch_kincaid_grade", "flesch_kincaid_grade"),
        ("coleman_liau_index", "coleman_liau_index"),
        ("dale_chall_readability_score", "dale_chall_readability_score"),
        ("flesch_reading_ease", "flesch_reading_ease"),
        ("gunning_fog", "gunning_fog"),
        ("linsear_write_formula", "linsear_write_formula"),
        ("lix", "lix"),
        ("rix", "rix"),
        ("smog_index", "smog_index"),
        ("text_standard", "text_standard"),
    )
    for attribute_name, metric_name in metric_table:
        metric = getattr(textstat, metric_name)
        setattr(self, attribute_name, metric(text))
def test_lix(self):
    """``textstat.lix`` on the ``long_test`` fixture must equal 45.11."""
    expected_lix = 45.11
    self.assertEqual(expected_lix, textstat.lix(self.long_test))
def test_lix():
    """With the language set to ``en_US``, ``textstat.lix(long_test)`` must equal 45.11."""
    textstat.set_lang("en_US")
    expected_lix = 45.11
    actual_lix = textstat.lix(long_test)
    assert actual_lix == expected_lix
def get_redability_assessments(data_text: str) -> Optional[dict]:
    """Build the readability report for ``data_text``.

    The text is tokenized with NLTK, scored with ``textstat`` and with this
    module's own helpers, and the results are grouped into one dictionary.

    Args:
        data_text: Raw text to analyse.

    Returns:
        A dict with the keys ``readabilityGradeLevels``, ``readabilityScores``,
        ``textStatistics``, ``timings``, ``textComposition``,
        ``textReadabilityIssues``, ``textStyleIssues``, ``textDensityIssues``,
        ``textLanguageIssues`` and ``matches``. ``matches`` is the ``+``
        combination of the ``find_limit_offcet`` results for every issue
        category.
    """
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)
    unique_word_count = compute_unique_word_count(f_dist.most_common())
    # Every "\r\n" also contains "\n", so splitting on "\n" alone can never
    # yield fewer pieces than splitting on "\r\n"; one split is enough.
    paragraph_count = len(data_text.split('\n'))
    counts = Counter(tag for _, tag in pos_tags)

    # Values used by several sections below; computed once.
    sentence_total = len(divided_text)
    word_count = textstat.lexicon_count(data_text)
    syllables_per_word = textstat.avg_syllables_per_word(data_text)
    flesch_reading_ease = textstat.flesch_reading_ease(data_text)

    def tag_total(*tags):
        """Sum the POS-tag counts for the given tags (missing tags count as 0)."""
        return sum(counts.get(tag, 0) for tag in tags)

    # Readability Grade Levels
    # NOTE(review): the original author marked the calculations around the
    # forecast grade and the flesch entry with "need to check"; the
    # expressions are kept unchanged here and still need verification.
    forecast_grade = round(20 - (syllables_per_word / 10), 2)
    readability_grade_levels = dict(
        fleschKincaid=textstat.flesch_kincaid_grade(data_text),
        gunningFog=textstat.gunning_fog(data_text),
        colemanLiau=textstat.coleman_liau_index(data_text),
        smog=textstat.smog_index(data_text),
        ari=textstat.automated_readability_index(data_text),
        # Previously this declared key was never filled (it stayed 0) because
        # the value was written under the misspelled key below.
        forecastGradeLevel=forecast_grade,
        powersSumnerKearlGrade=round(
            textstat.avg_sentence_length(data_text) + syllables_per_word + 2.7971, 2),
        rix=textstat.rix(data_text),
        raygorReadability=count_raygor_readability(divided_text),
        fryReadability=count_fry_readability(divided_text),
        flesch=flesch_reading_ease,
        # Legacy misspelled key: kept so consumers that already read it keep
        # working. NOTE(review): the helpers below receive this dict too;
        # confirm which of the two keys they rely on before removing it.
        forcastGradeLevel=forecast_grade,
    )

    # Readability Scores
    readability_scores = dict(
        readableRating=count_average_grade_levels(readability_grade_levels),
        fleschReadingEase=flesch_reading_ease,
        cefrLevel=count_cefr_levels(readability_grade_levels),
        ieltsLevel=count_ielts_levels(readability_grade_levels),
        spacheScore=round(textstat.spache_readability(data_text), 2),
        newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text),
        lixReadability=textstat.lix(data_text),
        lensearWrite=textstat.linsear_write_formula(data_text),
    )

    # Text Statistics
    text_statistics = dict(
        characterCount=textstat.char_count(data_text),
        syllableCount=textstat.syllable_count(data_text),
        wordCount=word_count,
        uniqueWordCount=unique_word_count,
        sentenceCount=textstat.sentence_count(data_text),
        paragraphCount=paragraph_count,
    )

    # Timings
    timings_statistics = dict(
        readingTime=reading_time(word_count),
        speakingTime=speaking_time(word_count),
    )

    # Text Composition (keys of ``counts`` are the tags produced by nltk.pos_tag)
    text_composition = dict(
        adjectives=tag_total('JJ', 'JJR', 'JJS'),
        adverbs=tag_total('RB', 'RBR', 'RBS'),
        conjunctions=tag_total('CC'),
        determiners=tag_total('DT', 'PDT', 'WDT'),
        interjections=tag_total('UH'),
        nouns=tag_total('NN', 'NNS'),
        verbs=tag_total('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        properNouns=tag_total('NNP', 'NNPS'),
        prepositions=tag_total('IN'),
        pronouns=tag_total('PRP', 'PRP$', 'WP', 'WP$'),
        qualifiers=tag_total('RB'),
        unrecognised=tag_total(None),
        nonWords=tag_total('.', ',', ':'),
    )

    # Readability Issues
    (sentences_30_syllables, sentences_30_count,
     sentences_20_syllables, sentences_20_count) = count_sentences_syllables(divided_text)
    sentences_30_syllables = find_limit_offcet(
        data_text, sentences_30_syllables,
        "sentences_30_syllables", "sentences_30_syllables",
        "This sentence has more than 30 syllables. Consider rewriting it to be shorter "
        "or splitting it into smaller sentences.",
        "Readability Issues")
    sentences_20_syllables = find_limit_offcet(
        data_text, sentences_20_syllables,
        "sentences_20_syllables", "sentences_20_syllables",
        "This sentence has more than 20 syllables. Consider rewriting it to be shorter "
        "or splitting it into smaller sentences.",
        "Readability Issues")

    (words_12_letters, words_12_count,
     words_4_syllables, words_4_count) = words_sentence_syllables(divided_text)
    words_12_letters = find_limit_offcet(
        data_text, words_12_letters,
        "words_12_letters", "words_12_letters",
        "This word is more than 12 letters",
        "Readability Issues")
    words_4_syllables = find_limit_offcet(
        data_text, words_4_syllables,
        "words_4_syllables", "words_4_syllables",
        "This word is more than 4 syllables",
        "Readability Issues")

    # The list-valued entries are not filled in by this function; the located
    # issues are returned through ``matches`` instead.
    text_readability_issues = dict(
        sentences30SyllablesCount=sentences_30_count,
        sentences20SyllablesCount=sentences_20_count,
        sentences30Syllables=[],
        sentences20Syllables=[],
        words4SyllablesCount=words_4_count,
        words12LettersCount=words_12_count,
        words4Syllables=[],
        words12Letters=[],
    )

    # Writing Style Issues
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(
        data_text, passive_voises_return,
        "passive_voises", "passive_voises",
        "Too much of using passive voises",
        "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(
        data_text, adverbs_return,
        "adverbs", "adverbs",
        "Too much of using adverbs",
        "Writing Style Issues")
    text_style_issues = dict(
        passiveVoiceCount=len(passive_voises_return),
        passiveVoices=[],
        adverbsCount=len(adverbs_return),
        adverbs=[],
        clicheCount=0,
        cliches=[],
    )

    # Text Density Issues
    text_density_issues = dict(
        charactersPerWord=textstat.avg_character_per_word(data_text),
        syllablesPerWord=syllables_per_word,
        # No sentences (e.g. empty text) would otherwise divide by zero.
        wordsPerSentence=round(word_count / sentence_total, 2) if sentence_total else 0,
        # paragraph_count is at least 1 because str.split never returns an empty list.
        wordsPerParagraph=round(word_count / paragraph_count, 2),
        sentencesPerParagraph=round(sentence_total / paragraph_count, 2),
    )

    # Language Issues (placeholders; nothing in this function computes them)
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    matches_limit_offcet = (
        sentences_20_syllables + sentences_30_syllables
        + words_4_syllables + words_12_letters
        + passive_voises_return + adverbs_return
    )

    return dict(
        readabilityGradeLevels=readability_grade_levels,
        readabilityScores=readability_scores,
        textStatistics=text_statistics,
        timings=timings_statistics,
        textComposition=text_composition,
        textReadabilityIssues=text_readability_issues,
        textStyleIssues=text_style_issues,
        textDensityIssues=text_density_issues,
        textLanguageIssues=text_language_issues,
        matches=matches_limit_offcet,
    )
def test_lix(self):
    """``textstat.lix`` on the ``long_test`` fixture must equal 43.70851063829787."""
    expected_lix = 43.70851063829787
    self.assertEqual(expected_lix, textstat.lix(self.long_test))
def test_lix():
    """``textstat.lix(long_test)`` must equal 45.11."""
    expected_lix = 45.11
    actual_lix = textstat.lix(long_test)
    assert actual_lix == expected_lix