Exemple #1
0
def test_char_count():
    textstat.set_lang("en_US")
    count = textstat.char_count(long_test)
    count_spaces = textstat.char_count(long_test, ignore_spaces=False)

    assert count == 1750
    assert count_spaces == 2123
Exemple #2
0
def test_char_count():
    count = textstat.char_count(long_test)
    count_spaces = textstat.char_count(
        long_test, ignore_spaces=False
    )

    assert count == 1750
    assert count_spaces == 2123
 def __init__(self, text):
     self.__doc = self.preprocess(text)
     self.__docWords = self.getTotalWords()
     self.__totalWords = textstat.lexicon_count(self.__doc, removepunct=True)
     self.__totalCharacters = textstat.char_count(self.__doc, ignore_spaces=True)
     self.__totalSentences = self.getSentencesCount()
     self.__totalSyllables = self.getSyllablesCount()
     # self.__totalSyllables = textstat.syllable_count(self.__doc)
     self.__polySyllableCount = self.getPolySyllableCount()
Exemple #4
0
    def test_char_count(self):
        count = textstat.char_count(self.long_test)
        count_spaces = textstat.char_count(self.long_test, ignore_spaces=False)

        self.assertEqual(1750, count)
        self.assertEqual(2123, count_spaces)
def get_redability_assessments(data_text: str) -> Optional[dict]:
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)

    uniqueWordCount = compute_unique_word_count(f_dist.most_common())

    paragraphCount = max(len(data_text.split('\n')), len(data_text.split('\r\n')))

    counts = Counter(tag for word, tag in pos_tags)

    # Readability Grade Levels
    readability_grade_levels = dict(fleschKincaid=0, gunningFog=0, colemanLiau=0, smog=0,
                                    ari=0, forecastGradeLevel=0, powersSumnerKearlGrade=0, rix=0,
                                    raygorReadability=0, fryReadability=0, flesch=0)

    readability_grade_levels.update(fleschKincaid=textstat.flesch_kincaid_grade(data_text))
    readability_grade_levels.update(gunningFog=textstat.gunning_fog(data_text))
    readability_grade_levels.update(colemanLiau=textstat.coleman_liau_index(data_text))
    readability_grade_levels.update(smog=textstat.smog_index(data_text))
    readability_grade_levels.update(ari=textstat.automated_readability_index(data_text))
    readability_grade_levels.update(rix=textstat.rix(data_text))

    # need to check
    readability_grade_levels.update(forcastGradeLevel=round(20 - (textstat.avg_syllables_per_word(data_text) / 10), 2))

    readability_grade_levels.update(powersSumnerKearlGrade=round(textstat.avg_sentence_length(data_text) +
                                                                 textstat.avg_syllables_per_word(data_text) +
                                                                 2.7971, 2))
    readability_grade_levels.update(raygorReadability=count_raygor_readability(divided_text))
    readability_grade_levels.update(fryReadability=count_fry_readability(divided_text))
    # need to check

    readability_grade_levels.update(flesch=textstat.flesch_reading_ease(data_text))

    # Readability Scores
    readability_scores = dict(readableRating="", fleschReadingEase=0, cefrLevel='', ieltsLevel='', spacheScore=0,
                              newDaleChallScore=0, lixReadability=0, lensearWrite=0)
    readability_scores.update(readableRating=count_average_grade_levels(readability_grade_levels))
    readability_scores.update(fleschReadingEase=textstat.flesch_reading_ease(data_text))
    readability_scores.update(cefrLevel=count_cefr_levels(readability_grade_levels))
    readability_scores.update(ieltsLevel=count_ielts_levels(readability_grade_levels))
    readability_scores.update(spacheScore=round(textstat.spache_readability(data_text), 2))
    readability_scores.update(newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text))
    readability_scores.update(lixReadability=textstat.lix(data_text))
    readability_scores.update(lensearWrite=textstat.linsear_write_formula(data_text))

    # Text Statistics
    text_statistics = dict(characterCount=0, syllableCount=0, wordCount=0, uniqueWordCount=0,
                           sentenceCount=0, paragraphCount=0)
    text_statistics.update(characterCount=textstat.char_count(data_text))
    text_statistics.update(syllableCount=textstat.syllable_count(data_text))
    text_statistics.update(wordCount=textstat.lexicon_count(data_text))
    text_statistics.update(uniqueWordCount=uniqueWordCount)
    text_statistics.update(sentenceCount=textstat.sentence_count(data_text))
    text_statistics.update(paragraphCount=paragraphCount)

    # Timings
    timings_statistics = dict(readingTime=0, speakingTime=0)
    timings_statistics.update(readingTime=reading_time(textstat.lexicon_count(data_text)))
    timings_statistics.update(speakingTime=speaking_time(textstat.lexicon_count(data_text)))

    # Text Composition
    text_composition = dict(adjectives=0, adverbs=0, conjunctions=0, determiners=0, interjections=0, nouns=0, verbs=0,
                            properNouns=0, prepositions=0, pronouns=0, qualifiers=0, unrecognised=0, nonWords=0)

    text_composition.update(adjectives=counts.get('JJ', 0) + counts.get('JJR', 0) + counts.get('JJS', 0))
    text_composition.update(adverbs=counts.get('RB', 0) + counts.get('RBR', 0) + counts.get('RBS', 0))
    text_composition.update(conjunctions=counts.get('CC', 0))
    text_composition.update(determiners=counts.get('DT', 0) + counts.get('PDT', 0) + counts.get('WDT', 0))
    text_composition.update(interjections=counts.get('UH', 0))
    text_composition.update(nouns=counts.get('NN', 0) + counts.get('NNS', 0))
    text_composition.update(
        verbs=counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0) + counts.get('VBN', 0) + counts.get(
            'VBP', 0) + counts.get('VBZ', 0))
    text_composition.update(properNouns=counts.get('NNP', 0) + counts.get('NNPS', 0))
    text_composition.update(prepositions=counts.get('IN', 0))
    text_composition.update(
        pronouns=counts.get('PRP', 0) + counts.get('PRP$', 0) + counts.get('WP', 0) + counts.get('WP$', 0))
    text_composition.update(qualifiers=counts.get('RB', 0))
    text_composition.update(unrecognised=counts.get(None, 0))
    text_composition.update(nonWords=counts.get('.', 0) + counts.get(',', 0) + counts.get(':', 0))

    # Readability Issues
    text_readability_issues = dict(sentences30SyllablesCount=0, sentences20SyllablesCount=0,
                                   sentences30Syllables=[], sentences20Syllables=[],
                                   words4SyllablesCount=0, words12LettersCount=0,
                                   words4Syllables=[], words12Letters=[])

    sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count = count_sentences_syllables(
        divided_text)

    sentences_30_syllables = find_limit_offcet(data_text, sentences_30_syllables,
                                               "sentences_30_syllables",
                                               "sentences_30_syllables",
                                               "This sentence has more than 30 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")
    sentences_20_syllables = find_limit_offcet(data_text, sentences_20_syllables,
                                               "sentences_20_syllables",
                                               "sentences_20_syllables",
                                               "This sentence has more than 20 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")

    text_readability_issues.update(sentences30SyllablesCount=sentences_30_count,
                                   sentences20SyllablesCount=sentences_20_count)

    words_12_letters, words_12_count, words_4_syllables, words_4_count = words_sentence_syllables(divided_text)

    words_12_letters = find_limit_offcet(data_text, words_12_letters,
                                         "words_12_letters",
                                         "words_12_letters",
                                         "This word is more than 12 letters",
                                         "Readability Issues")
    words_4_syllables = find_limit_offcet(data_text, words_4_syllables,
                                          "words_4_syllables",
                                          "words_4_syllables",
                                          "This word is more than 4 syllables",
                                          "Readability Issues")

    text_readability_issues.update(words4SyllablesCount=words_4_count,
                                   words12LettersCount=words_12_count)

    # Writing Style Issues
    text_style_issues = dict(passiveVoiceCount=0, passiveVoices=[],
                             adverbsCount=0, adverbs=[],
                             clicheCount=0, cliches=[])
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(data_text, passive_voises_return,
                                              "passive_voises",
                                              "passive_voises",
                                              "Too much of using passive voises",
                                              "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(data_text, adverbs_return,
                                       "adverbs",  # writing_style_issues
                                       "adverbs",
                                       "Too much of using adverbs",
                                       "Writing Style Issues")
    text_style_issues.update(passiveVoiceCount=len(passive_voises_return),
                             adverbsCount=len(adverbs_return))

    # Text Density Issues
    text_density_issues = dict(charactersPerWord=0, syllablesPerWord=0, wordsPerSentence=0,
                               wordsPerParagraph=0, sentencesPerParagraph=0)

    text_density_issues.update(charactersPerWord=textstat.avg_character_per_word(data_text),
                               syllablesPerWord=textstat.avg_syllables_per_word(data_text),
                               wordsPerSentence=round(textstat.lexicon_count(data_text) / len(divided_text), 2),
                               wordsPerParagraph=round(textstat.lexicon_count(data_text) / paragraphCount, 2),
                               sentencesPerParagraph=round(len(divided_text) / paragraphCount, 2))

    # Language Issues
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    matches_limit_offcet = sentences_20_syllables + sentences_30_syllables + words_4_syllables + words_12_letters + \
                           passive_voises_return + adverbs_return

    return dict(readabilityGradeLevels=readability_grade_levels,
                readabilityScores=readability_scores,
                textStatistics=text_statistics,
                timings=timings_statistics,
                textComposition=text_composition,
                textReadabilityIssues=text_readability_issues,
                textStyleIssues=text_style_issues,
                textDensityIssues=text_density_issues,
                textLanguageIssues=text_language_issues,
                matches=matches_limit_offcet)
Exemple #6
0
#Read in data
data = pd.read_csv("INPUT_2.txt",
                   sep='\t',
                   header=None,
                   names=['id', 'target', 'tweet'],
                   encoding='utf-8')
data.drop(['id', 'target'], axis=1, inplace=True)

#Total number of tweets
print("Total number of tweets is ", len(data))

#Total number of characters
char = 0
i = 0
while i < len(data):
    char = char + textstat.char_count(data['tweet'][i])
    i += 1
print("Total number of characters is ", char)

#Total number of distinct words
twt = TweetTokenizer(strip_handles=True, reduce_len=True)
words = []
i = 0
while i < len(data):
    for word in twt.tokenize(data['tweet'][i]):
        words.append(word)
    i += 1

print("Total number of distinct words is ", len(set(words)))

#Avg number of of characters in each tweet
Exemple #7
0
def test_char_count():
    count = textstat.char_count(long_test)
    count_spaces = textstat.char_count(long_test, ignore_spaces=False)

    assert count == 1750
    assert count_spaces == 2123
Exemple #8
0
             characters_based_features['number_of_close_paranthesis'] += 1
         if k == '()':
             characters_based_features['number_of_pair_paranthesis'] += 1
         if k == '[':
             characters_based_features['number_of_open_squareBracket'] += 1
         if k == ']':
             characters_based_features['number_of_close_squareBracket'] += 1
         if k == '[]':
             characters_based_features['number_of_pair_squareBracket'] += 1
         if k == '{':
             characters_based_features['number_of_open_curlyBracket'] += 1
         if k == '}':
             characters_based_features['number_of_close_curlyBracket'] += 1
         if k == '{}':
             characters_based_features['number_of_pair_curlyBracket'] += 1
 characters_based_features['number_of_characters'] = textstat.char_count(
     i, ignore_spaces=False)
 characters_based_features['number_of_quotation'] = len(
     re.compile(r"\"").findall(i))
 characters_based_features['number_of_numbersigns'] = len(
     re.compile(r"\#").findall(i))
 characters_based_features['number_of_dollar'] = len(
     re.compile(r"\$").findall(i))
 characters_based_features['number_of_percent'] = len(
     re.compile(r"\%").findall(i))
 characters_based_features['number_of_ampersand'] = len(
     re.compile(r"\&").findall(i))
 characters_based_features['number_of_asterisk'] = len(
     re.compile(r"\*").findall(i))
 characters_based_features['number_of_plus'] = len(
     re.compile(r"\+").findall(i))
 characters_based_features['number_of_minus'] = len(