def preprocess_text(text):
    """Compute a feature dictionary for a piece of text.

    The text is first passed through ``_simplify_punctuation``; every
    feature is then computed on that result.

    Args:
        text (str): the text to be preprocessed.

    Returns:
        dict: feature names mapped to their values. Contains the textstat
        readability metrics, ``mean_parse_tree_depth``,
        ``mean_ents_per_sentence`` and whatever entries
        ``get_mean_pos_tags`` contributes.

    """
    cleaned = _simplify_punctuation(text)

    # textstat metrics that take only the text, in output order.
    single_argument_metrics = (
        "flesch_reading_ease",
        "smog_index",
        "flesch_kincaid_grade",
        "coleman_liau_index",
        "automated_readability_index",
        "dale_chall_readability_score",
        "difficult_words",
        "linsear_write_formula",
        "gunning_fog",
    )

    features = {}
    for metric in single_argument_metrics:
        features[metric] = getattr(textstat, metric)(cleaned)

    # text_standard is requested as a number rather than a grade string.
    features["text_standard"] = textstat.text_standard(
        cleaned, float_output=True)
    features["mean_parse_tree_depth"] = get_mean_parse_tree_depth(cleaned)
    features["mean_ents_per_sentence"] = get_mean_ents_per_sentence(cleaned)

    # Merge the part-of-speech features last so they extend the dict.
    features.update(get_mean_pos_tags(cleaned))

    return features
Exemple #2
0
 def get_readability_scores(self, doc):
     """Return the textstat readability metrics for ``doc.text``.

     Args:
         doc: object exposing the text to score as ``doc.text``.

     Returns:
         dict: metric name -> value. ``text_standard`` holds the result of
         ``self._convert_text_standard_to_integer`` applied to textstat's
         ``text_standard`` output; all other entries are the raw textstat
         return values.
     """
     segment_text = doc.text

     # Metrics stored exactly as textstat returns them, in output order.
     raw_metrics = (
         "automated_readability_index",
         "coleman_liau_index",
         "dale_chall_readability_score",
         "difficult_words",
         "flesch_kincaid_grade",
         "flesch_reading_ease",
         "gunning_fog",
         "linsear_write_formula",
         "smog_index",
     )

     scores = {}
     for metric in raw_metrics:
         scores[metric] = getattr(textstat, metric)(segment_text)

     scores["text_standard"] = self._convert_text_standard_to_integer(
         textstat.text_standard(segment_text))
     return scores
Exemple #3
0
 def form_valid(self, form):
     """Build readability statistics and a word table for the submitted text.

     Stores two results on the instance and then re-renders the form page:

     * ``self.stats`` -- a list of dicts with ``name``, ``value`` and (for
       some entries) ``desc`` keys, computed with ``textstat``.
     * ``self.words`` -- one dict per distinct base form with keys ``hw``
       (the base form), ``alts`` (other surface forms seen), ``count`` and
       ``freq`` (from ``wf.zipf_frequency``), sorted by ascending ``freq``.

     Args:
         form: form whose ``cleaned_data['text']`` is analysed.

     Returns:
         The result of ``self.render_to_response`` for this form's context.
     """
     text = form.cleaned_data['text']
     word_list = wf.tokenize(text, self.lang)

     # Compute the counts once; they feed several entries below.
     total_words = textstat.lexicon_count(text)
     difficult_count = textstat.difficult_words(text)
     # Text without any countable word (e.g. punctuation only) would
     # otherwise raise ZeroDivisionError when computing the percentage.
     if total_words:
         difficult_pct = 100 * difficult_count / total_words
     else:
         difficult_pct = 0

     self.stats = [
         { 'name': 'Flesch-Kincaid grade level',
           'value':  textstat.flesch_kincaid_grade(text),
           'desc': 'Based on avg sentence length and syllables per word.'},
         { 'name': 'Dale-Chall grade level',
           'value': textstat.dale_chall_readability_score_v2(text),
           'desc': 'Based on avg sentence length and percent difficult words.'},
         { 'name': 'Number of words',
           'value': total_words },
         { 'name': 'Number of sentences',
           'value': textstat.sentence_count(text) },
         { 'name': 'Average sentence length',
           'value': textstat.avg_sentence_length(text) },
         { 'name': 'Average syllables per word',
           'value': textstat.avg_syllables_per_word(text) },
         { 'name': 'Difficult words',
           'value': "%d (%d%%): %s" % (difficult_count,
                                       difficult_pct,
                                       ', '.join(textstat.difficult_words_list(text))) },
     ]

     # Group tokens by base form, counting occurrences and remembering
     # each distinct surface form that differs from the base.
     word_info = {}
     for word in word_list:
         base = base_form(word)
         entry = word_info.get(base)
         if entry is None:
             entry = {
                 'hw' : base,
                 'alts' : [],
                 'count' : 0,
                 'freq' : wf.zipf_frequency(base, self.lang)
             }
             word_info[base] = entry
         entry['count'] += 1
         if word != base and word not in entry['alts']:
             entry['alts'].append(word)

     # Lowest frequency value first.
     self.words = sorted(word_info.values(), key=lambda x: x.get('freq'))
     logger.debug('words: %s', self.words)
     # Don't do normal process of redirecting to success_url.  Just stay on this form page forever.
     return self.render_to_response(self.get_context_data(form=form))
 def post(self, args):
     """Return the textstat readability metrics for ``args['text']``.

     Args:
         args: mapping that provides the text to score under ``'text'``.

     Returns:
         The result of ``jsonify`` applied to a dict of metric name ->
         textstat value.
     """
     text = args['text']
     readability = {
         "flesch_reading_ease": textstat.flesch_reading_ease(text),
         "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
         "smog_index": textstat.smog_index(text),
         "coleman_liau_index": textstat.coleman_liau_index(text),
         "automated_readability_index":
             textstat.automated_readability_index(text),
         "dale_chall_readability_score":
             textstat.dale_chall_readability_score(text),
         "linsear_write_formula": textstat.linsear_write_formula(text),
         "gunning_fog": textstat.gunning_fog(text),
         "text_standard": textstat.text_standard(text),
         "difficult_words": textstat.difficult_words(text),
     }
     return jsonify(readability)
Exemple #5
0
from textstat import textstat
def main():
    """Print each textstat readability metric for a fixed sample sentence."""
    test_data = 'The quick brown fox jumps over the lazy dog'

    # File to be used to test the function
    print(textstat.flesch_reading_ease(test_data))
    print(textstat.smog_index(test_data))
    print(textstat.flesch_kincaid_grade(test_data))
    print(textstat.coleman_liau_index(test_data))
    print(textstat.automated_readability_index(test_data))
    print(textstat.dale_chall_readability_score(test_data))
    print(textstat.difficult_words(test_data))
    print(textstat.linsear_write_formula(test_data))
    print(textstat.gunning_fog(test_data))
    print(textstat.text_standard(test_data))


# Run only when executed as a script: previously the prints lived at module
# level while test_data was defined inside this guard, so importing the
# module raised NameError.
if __name__ == '__main__':
    main()
Exemple #6
0
def index():
    """Analyse the message in the JSON request body and return statistics.

    Reads ``unique_id``, ``process_language`` and ``message`` from
    ``request.json``. When ``process_language`` is truthy the message is
    also checked with ``LanguageTool('en-US')``.

    Returns:
        dict: ``unique_id``, a ``text_statistics`` dict (word and syllable
        counts, the split sentences, and textstat readability metrics for
        the whole message and, where computed, per sentence under ``sps``)
        and ``language_tool`` (the converted matches, or ``None`` when the
        language check was not requested).
    """
    data = request.json
    print(f'Debug: {data}')

    unique_id = data['unique_id']
    process_language = data['process_language']
    message = data['message']

    matches_list = None
    if process_language:
        # Language tool takes a while to process
        checker = LanguageTool('en-US')
        matches_list = [
            match_to_dict(found) for found in checker.check(message)
        ]
        print(f'Analysis finished: {matches_list}')

    sentences = splitter.split(text=message)

    def per_sentence(metric):
        # Apply one textstat metric to every sentence.
        return list_map(sentences, metric)

    def scored(metric):
        # Whole-message score first, then the per-sentence breakdown.
        return {'score': metric(message), 'sps': per_sentence(metric)}

    lexicon_count = textstat.lexicon_count(message)
    lexicon_count_ps = per_sentence(textstat.lexicon_count)
    syllable_count = textstat.syllable_count(message)
    syllable_count_ps = per_sentence(textstat.syllable_count)

    readability = {
        'flesch_reading_ease': scored(textstat.flesch_reading_ease),
        # No per-sentence breakdown is reported for smog_index.
        'smog_index': {'score': textstat.smog_index(message)},
        'flesch_kincaid_grade': scored(textstat.flesch_kincaid_grade),
        'coleman_liau_index': scored(textstat.coleman_liau_index),
        'automated_readability_index':
            scored(textstat.automated_readability_index),
        'dale_chall_readability_score':
            scored(textstat.dale_chall_readability_score),
        'difficult_words': {
            **scored(textstat.difficult_words),
            'words': textstat.difficult_words_list(message),
        },
        # Only the whole-message score is rounded; sentence scores are raw.
        'linsear_write_formula': {
            'score': round(textstat.linsear_write_formula(message), 2),
            'sps': per_sentence(textstat.linsear_write_formula),
        },
        'gunning_fog': scored(textstat.gunning_fog),
        # No per-sentence breakdown is reported for text_standard.
        'text_standard': {'score': textstat.text_standard(message)},
    }

    text_statistics = {
        'lexicon_count': lexicon_count,
        'lexicon_count_ps': lexicon_count_ps,
        'syllable_count': syllable_count,
        'syllable_count_ps': syllable_count_ps,
        'sentences': sentences,
        'sentence_count': len(sentences),
        'readability': readability,
    }

    return {
        'unique_id': unique_id,
        'text_statistics': text_statistics,
        'language_tool': matches_list,
    }