Example #1
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity},
    }

    return main
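
A minimal call sketch for the function above, assuming the imports it relies on (textstat and TextBlob from textblob) are available; the sample string is an illustrative assumption:

import textstat
from textblob import TextBlob

sample = "Readability metrics summarize how hard a text is to read. Short sentences tend to score as easier."
metrics = get_special_metrics(sample)
print(metrics["difficulty"]["flesch reading ease"], metrics["sentiments"]["polarity"])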
Example #2
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        'statistics': {
            'syllables': textstat.syllable_count(text),
            'words': textstat.lexicon_count(text),
            'characters': textstat.char_count(text),
            'polysyllables': textstat.polysyllabcount(text),
            'average letter per word': textstat.avg_letter_per_word(text),
            'average sentence length': textstat.avg_sentence_length(text),
            'average sentence per word': textstat.avg_sentence_per_word(text),
            'sentences': textstat.sentence_count(text)
        },
        'difficulty': {
            'flesch reading ease': textstat.flesch_reading_ease(text),
            'smog index': textstat.smog_index(text),
            'flesch kincaid grade': textstat.flesch_kincaid_grade(text),
            'coleman liau index': textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            'gunning fog': textstat.gunning_fog(text)
        },
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    }

    return main
Example #3
    def _calculate_scores(self, docs):
        docs_scores = []

        for doc in docs:
            scores = {}
            scores['chars'] = ts.char_count(doc)
            scores['words'] = ts.lexicon_count(doc)
            scores['sents'] = ts.sentence_count(doc)
            #scores['syllables'] = ts.syllable_count(doc)
            scores['avg_sent_length'] = ts.avg_sentence_length(doc)
            scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc)
            scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc)
            scores['flesch'] = ts.flesch_reading_ease(doc)
            #scores['smog'] = ts.smog_index(doc)
            #scores['coleman_liau'] = ts.coleman_liau_index(doc)
            scores['automated_readability'] = ts.automated_readability_index(
                doc)
            #scores['linsear'] = ts.linsear_write_formula(doc)
            #scores['difficult_words'] = ts.difficult_words(doc)
            scores['dale_chall'] = ts.dale_chall_readability_score(doc)
            #scores['gunning_fog'] = ts.gunning_fog(doc)
            scores['lix'] = ts.lix(doc)
            docs_scores.append(scores)

        return docs_scores
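
The same batch-scoring idea can be sketched outside the class (assuming import textstat as ts, which the ts alias above suggests); the documents are illustrative:

import textstat as ts

docs = ["First short document. Easy to read.",
        "A considerably longer document with many subordinate clauses complicates readability."]
docs_scores = [{'flesch': ts.flesch_reading_ease(doc),
                'automated_readability': ts.automated_readability_index(doc),
                'dale_chall': ts.dale_chall_readability_score(doc),
                'lix': ts.lix(doc)}
               for doc in docs]
print(docs_scores)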
Example #4
def composition(text, file):
    char_count = textstat.char_count(text)
    syll_count = textstat.syllable_count(text)
    lex_count = textstat.lexicon_count(text)
    sent_count = textstat.sentence_count(text)
    file.write(
        '\nChar count : %d\nSyllable count : %d \nLexicon count : %d \nSentence count : %d'
        % (char_count, syll_count, lex_count, sent_count))
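
A usage sketch, assuming textstat is imported and writing the summary to a hypothetical report file:

import textstat

with open("composition_report.txt", "w") as report:  # hypothetical output path
    composition("Two short sentences. Nothing difficult here.", report)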
Example #5
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
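
Because the function returns a 14-value tuple, callers typically unpack it positionally; a sketch, with an assumed sample string:

import textstat

(chars, words, syllables, sentences, avg_sent_len, avg_syll_per_word,
 avg_letters_per_word, avg_sent_per_word, fk_grade, smog, fog,
 difficult, dale_chall, polysyllables) = scores_cal_ori(
    "Readability formulas need a few sentences to be meaningful. Here are two of them.")
print(fk_grade, smog, dale_chall)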
Example #6
    def stats(self, text):
        test_data = text
        stats = {}
        stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
        stats['smog'] = textstat.smog_index(test_data)
        stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
        stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
        stats['automated'] = textstat.automated_readability_index(test_data)
        stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
        stats['difficult'] = textstat.difficult_words(test_data)
        stats['linsear'] = textstat.linsear_write_formula(test_data)
        stats['gunning_fog'] = textstat.gunning_fog(test_data)
        stats['standard'] = textstat.text_standard(test_data)
        stats['charcount'] = textstat.char_count(test_data)
        stats['lexicon count'] = textstat.lexicon_count(test_data)
        stats['syllable count'] = textstat.syllable_count(test_data)
        stats['sentence count'] = textstat.sentence_count(test_data)
        stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
        stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(test_data)
        stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
        stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(test_data)
        return stats
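
One detail worth noting: textstat.text_standard() returns a grade-band string by default, so stats['standard'] is the only non-numeric value in the returned dict. A quick standalone check (sample text is an assumption):

import textstat

sample = "The committee deliberated extensively before reaching a consensus. Afterwards, everyone went home."
print(textstat.text_standard(sample))                     # grade-band string, e.g. "12th and 13th grade"
print(textstat.text_standard(sample, float_output=True))  # numeric variant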
Example #7
    def updateData(self):

        # Full list of polarity scores
        self.polscore = self.sid.polarity_scores(self.text)

        ##### INDEX 0 IN DATA: Text Sentiment #####
        # [INDEX 0] Compound score (-1.0 - 1.0)             [INDEX 1] Negative connotation rating (0.0 - 1.0)
        # [INDEX 2] Positive connotation rating (0.0 - 1.0) [INDEX 3] Neutral connotation rating (0.0 - 1.0)
        self.data.append([
            self.polscore['compound'], self.polscore['neg'],
            self.polscore['pos'], self.polscore['neu']
        ])

        ##### INDEX 1 IN DATA: Sentence Info #####
        # [INDEX 0] Sentence count          [INDEX 1] Average sentence length
        # [INDEX 2] Syllable count          [INDEX 3] Overall word count
        # [INDEX 4] Character count         [INDEX 5] Character count without spaces
        # [INDEX 6] Avg letters per word    [INDEX 7] Avg syllables per word
        self.data.append([
            textstat.sentence_count(self.text),
            textstat.avg_sentence_length(self.text),
            textstat.syllable_count(self.text),
            len(self.splList),
            textstat.char_count(self.text, False),
            textstat.char_count(self.text, True),
            textstat.avg_letter_per_word(self.text),
            textstat.avg_syllables_per_word(self.text)
        ])

        ##### INDEX 2 IN DATA: Flesch Reading Ease #####
        # [INDEX 0] Pure score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 100
        self.freRaw = textstat.flesch_reading_ease(self.text)
        self.freStat = min(max(self.freRaw, 0), 100)
        self.data.append([
            round(self.freStat, 3),
            self.freGrade(self.freStat),
            round(abs(self.freStat - 100), 2)
        ])

        ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
        # [INDEX 0] Pure score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
        self.fkgStat = self.adjustScore(self.fkgRaw)
        self.data.append([
            round(self.fkgStat, 3),
            self.grade(self.fkgStat),
            round(self.fkgStat / 0.18, 2)
        ])

        ##### INDEX 4 IN DATA: Gunning FOG Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.fogRaw = textstat.gunning_fog(self.text)
        self.fogStat = self.adjustScore(self.fogRaw)
        self.data.append([
            round(self.fogStat, 3),
            self.grade(self.fogStat),
            round(self.fogStat / 0.18, 2)
        ])

        ##### INDEX 5 IN DATA: SMOG Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.smogRaw = textstat.smog_index(self.text)
        self.smogStat = self.adjustScore(self.smogRaw)
        self.data.append([
            round(self.smogStat, 3),
            self.grade(self.smogStat),
            round(self.smogStat / 0.18, 2)
        ])

        ##### INDEX 6 IN DATA: Automated Readability Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 14
        self.ariRaw = textstat.automated_readability_index(self.text)
        self.ariStat = min(max(self.ariRaw, 0), 14)
        self.data.append([
            round(self.ariStat, 3),
            self.ariGrade(ceil(self.ariStat)),
            round(self.ariStat / 0.14, 2)
        ])  #13

        ##### INDEX 7 IN DATA: Coleman-Liau Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.cliRaw = textstat.coleman_liau_index(self.text)
        self.cliStat = self.adjustScore(self.cliRaw)
        self.data.append([
            round(self.cliStat, 3),
            self.grade(self.cliStat),
            round(self.cliStat / 0.18, 2)
        ])

        ##### INDEX 8 IN DATA: Linsear Write Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.lwiRaw = textstat.linsear_write_formula(self.text)
        self.lwiStat = self.adjustScore(self.lwiRaw)
        self.data.append([
            round(self.lwiStat, 3),
            self.grade(self.lwiStat),
            round(self.lwiStat / 0.18, 2)
        ])

        ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 10
        self.dcrRaw = textstat.dale_chall_readability_score(self.text)
        self.dcrStat = min(max(self.dcrRaw, 0), 10)
        self.data.append([
            round(self.dcrStat, 3),
            self.daleChallGrade(self.dcrStat),
            round(self.dcrStat / 0.1, 2)
        ])

        ##### INDEX 10 IN DATA: Overall Score #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 20
        self.txtRaw = textstat.text_standard(self.text, True)
        self.txtStd = min(max(self.txtRaw, 0), 20)
        self.txtInfo = textstat.text_standard(self.text)
        self.data.append([
            round(self.txtStd, 3),
            self.txtGrade(self.txtStd, self.txtInfo),
            round(self.txtStd / 0.2, 2)
        ])

        return self.data
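
The helpers used above (adjustScore, grade, freGrade, ariGrade, daleChallGrade, txtGrade) are not shown. Judging from the min(max(raw, 0), N) clamping applied to the other indices, adjustScore presumably clamps a raw score to the 0-18 scale noted in the comments; a hypothetical sketch:

    def adjustScore(self, score):
        # Hypothetical helper (not part of the example): clamp a raw readability
        # score to the 0-18 scale, mirroring the min/max pattern used above.
        return min(max(score, 0), 18)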
Example #8
    def test_char_count(self):
        count = textstat.char_count(self.long_test)
        count_spaces = textstat.char_count(self.long_test, ignore_spaces=False)

        self.assertEqual(1750, count)
        self.assertEqual(2123, count_spaces)
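
The two assertions differ only because char_count() ignores spaces by default, so the explicit ignore_spaces=False call is the one that includes whitespace. A quick illustration:

import textstat

text = "one two three"
print(textstat.char_count(text))                       # 11: non-space characters (default)
print(textstat.char_count(text, ignore_spaces=False))  # 13: spaces included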
Example #9
def preprocess(x):
    print('PROCESSING ID: ' + str(x['id']))
    try:
        fvec = []
        fvec.append(int(x['id']))  # Append Article ID
        fvec.append(nnp_num(x['targetTitle']))
        if len(x['targetParagraphs']) > 0:
            fvec.append(
                ts.automated_readability_index(' '.join(
                    x['targetParagraphs'])))
            fvec.append(ts.avg_letter_per_word(' '.join(
                x['targetParagraphs'])))
            fvec.append(ts.avg_sentence_length(' '.join(
                x['targetParagraphs'])))
            fvec.append(
                ts.avg_sentence_per_word(' '.join(x['targetParagraphs'])))
            fvec.append(
                ts.avg_syllables_per_word(' '.join(x['targetParagraphs'])))
            fvec.append(ts.char_count(' '.join(x['targetParagraphs'])))
            fvec.append(ts.coleman_liau_index(' '.join(x['targetParagraphs'])))
            fvec.append(
                ts.dale_chall_readability_score(' '.join(
                    x['targetParagraphs'])))
            fvec.append(ts.difficult_words(' '.join(x['targetParagraphs'])))
            fvec.append(
                ts.flesch_kincaid_grade(' '.join(x['targetParagraphs'])))
            fvec.append(ts.flesch_reading_ease(' '.join(
                x['targetParagraphs'])))
            fvec.append(ts.gunning_fog(' '.join(x['targetParagraphs'])))
            fvec.append(ts.lexicon_count(' '.join(x['targetParagraphs'])))
            fvec.append(
                ts.linsear_write_formula(' '.join(x['targetParagraphs'])))
            fvec.append(ts.polysyllabcount(' '.join(x['targetParagraphs'])))
            fvec.append(ts.sentence_count(' '.join(x['targetParagraphs'])))
            fvec.append(ts.smog_index(' '.join(x['targetParagraphs'])))
            fvec.append(ts.syllable_count(' '.join(x['targetParagraphs'])))
            fvec.append(mean_wordlen(x['targetParagraphs']))
            fvec += ratio(x['targetParagraphs'])  #36
            fvec += ngram_feat(x['targetParagraphs'])  # 6
        else:
            fvec += [0] * 61
        if len(word_tokenize(' '.join(x['postText']))) > 0:
            fvec.append(max_wordlen(x['postText']))
            fvec.append(sw_ratio(' '.join(x['postText'])))
            fvec += ngram_feat(x['postText'])  #6
        else:
            fvec += [0] * 8
        fvec.append(len(word_tokenize(x['targetTitle'])))
        fvec.append(wlen_title(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'NNP'))
        fvec.append(int(num_start(x['targetTitle'])))
        fvec.append(in_num(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'VBZ'))
        fvec.append(pos_2gram(x['targetTitle'], 'IN', 'NNP'))
        fvec.append(wrb_num(x['targetTitle']))
        fvec.append(nnp_num(x['targetTitle']))
        fvec.append(int(wh_start(x['targetTitle'])))
        fvec.append(int(qm_exist(x['targetTitle'])))
        fvec.append(pos_thnn(x['targetTitle']))
        fvec.append(prp_count(x['targetTitle']))
        fvec.append(vbz_count(x['targetTitle']))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NNP', 'VBZ'))
        fvec.append(pos_2gram(x['targetTitle'], 'NN', 'IN'))
        fvec.append(pos_3gram(x['targetTitle'], 'NN', 'IN', 'NNP'))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', '.'))
        fvec.append(pos_2gram(x['targetTitle'], 'PRP', 'VBP'))
        fvec.append(wp_count(x['targetTitle']))
        fvec.append(dt_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'IN'))
        fvec.append(pos_3gram(x['targetTitle'], 'IN', 'NNP', 'NNP'))
        fvec.append(pos_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'IN', 'NN'))
        if len(x['targetKeywords']) > 0 and len(x['postText']) > 0:
            fvec.append(kw_post_match(x['targetKeywords'], x['postText']))
        else:
            fvec += [0] * 1
        fvec.append(comma_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'NNS'))
        fvec.append(pos_2gram(x['targetTitle'], 'IN', 'JJ'))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'POS'))
        fvec.append(wdt_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NN', 'NN'))
        fvec.append(pos_2gram(x['targetTitle'], 'NN', 'NNP'))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'VBD'))
        fvec.append(rb_count(x['targetTitle']))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NNP', 'NNP'))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NNP', 'NN'))
        fvec.append(rbs_count(x['targetTitle']))
        fvec.append(vbn_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'VBN', 'IN'))
        fvec.append(pos_2gram(x['targetTitle'], 'JJ', 'NNP'))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NN', 'NN'))
        fvec.append(pos_2gram(x['targetTitle'], 'DT', 'NN'))
        fvec.append(ex_exist(x['targetTitle']))
        fvec += ngram_feat(x['targetTitle'])  #6
    except Exception as e:
        print('EXCEPTION AT ID ' + str(x['id']))
        print(e)
        sys.exit()

    return fvec
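
preprocess() expects a single record carrying the fields it reads ('id', 'targetTitle', 'targetParagraphs', 'postText', 'targetKeywords'), with the helper functions referenced above (nnp_num, ngram_feat, word_tokenize, and so on) in scope. A hypothetical input record, with made-up values:

sample = {
    'id': 1,
    'targetTitle': 'Ten tricks readers reportedly cannot resist',
    'targetParagraphs': ['First paragraph of the linked article.',
                         'Second paragraph with a little more detail.'],
    'postText': ['You will not believe trick number seven'],
    'targetKeywords': 'tricks, readers',
}
feature_vector = preprocess(sample)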
        prn = len(
            re.findall(r'\b(he|she|it|his|hers|him|her|they|them|their)\b',
                       str(tokens),
                       flags=re.I))
        prn_density = round(prn / len(tokens_np), 5)
        try:
            prn_noun_ratio = round(prn / len(nouns), 2)
        except ZeroDivisionError:
            prn_noun_ratio = 0

        ## Readability features

        num_syllab = textstat.syllable_count(essay)
        avg_len_sent = textstat.avg_sentence_length(essay)
        #         avg_sent_per_word = textstat.avg_sentence_per_word(essay)
        #         num_polysyllab = textstat.polysyllabcount(essay)
        num_chars = textstat.char_count(essay, ignore_spaces=True)
        #         avg_syllab_per_word = textstat.avg_syllables_per_word(essay)

        fre = textstat.flesch_reading_ease(essay)
        fkg = textstat.flesch_kincaid_grade(essay)
        cli = textstat.coleman_liau_index(essay)
        ari = textstat.automated_readability_index(essay)
        dcrs = textstat.dale_chall_readability_score(essay)
        dw = textstat.difficult_words(essay)
        lwf = textstat.linsear_write_formula(essay)
        gf = textstat.gunning_fog(essay)

        ## Stages of negation (features to improve validity for AES in ELL contexts)

        stage1a = len(
            re.findall(