Beispiel #1
0
def calcFeatures(params):
    """Compute text metrics for one revision and store them in ``rev_xl``.

    ``params`` is an ``(index, rev)`` pair (packed into one argument so the
    function can be handed to a multiprocessing map).  ``rev`` is indexed by
    ``'revid'`` and ``'pageid'``.  When the revision's text file does not
    exist the function does nothing and returns ``None``.

    Side effects: prints progress, writes columns 14..35 of row ``index`` in
    the module-level ``rev_xl`` frame, and saves it to ``path`` whenever
    ``index`` is a multiple of 10.
    """
    global rev_xl
    index, rev = params
    rev_id = rev['revid']
    filename = (
        "insert data path of the 2015 data from https://figshare.com/articles/English_Wikipedia_Quality_Asssessment_Dataset/1375406"
        + str(rev_id))
    if not os.path.exists(filename):
        return

    print(rev_id)
    # Load the revision, strip the markup, then drop the ''' bold markers.
    text = util.cleanhtml(util.read_file(filename)).replace("'''", '')
    # The row at ``index`` must describe the same page as ``rev``.
    assert rev['pageid'] == rev_xl.iloc[index, 0]
    print("matched ", rev_id)

    # readcalc metrics first, then the Linsear Write score, then the number
    # of issues reported by the grammar checker.
    metrics = list(readcalc.ReadCalc(text).get_all_metrics())
    metrics.append(round(textstatistics().linsear_write_formula(text), 2))
    metrics.append(len(tool.check(text)))

    rev_xl.iloc[index, 14:36] = metrics
    print(rev_xl.iloc[index, :])

    # Periodic checkpoint of the whole frame.
    if index % 10 == 0:
        rev_xl.to_csv(path)
Beispiel #2
0
def grade_essay():
    """Grade the essay found under ``"essay"`` in the JSON request body.

    Returns a JSON response with two keys: ``"readability"`` (the ARI index
    rounded to 2 decimals) and ``"polarity"`` (TextBlob polarity multiplied
    by 10 and rounded to 1 decimal).
    """
    essay = flask.request.json["essay"]
    readability = round(readcalc.ReadCalc(essay).get_ari_index(), 2)
    scaled_polarity = round(TextBlob(essay).polarity * 10, 1)
    return flask.jsonify({
        "readability": readability,
        "polarity": scaled_polarity,
    })
Beispiel #3
0
    def score(eula):
        """Grade a EULA by its word count.

        Reads ``eula.text``, counts the words reported by ``ReadCalc`` and
        returns a result dict with the letter grade, numeric score (0-4),
        feedback entries and the word count itself.
        """
        words = len(readcalc.ReadCalc(eula.text).get_words())

        # Each band: (exclusive upper word bound, score, grade, rating, text).
        # Fewer than 1,200 words -> A, 1,200-1,699 -> B, 1,700-1,999 -> C,
        # 2,000-2,499 -> D; 2,500 words or more falls through to F.
        bands = (
            (1200, 4, 'A', 2, 'This EULA is short and succinct'),
            (1700, 3, 'B', 1, 'This EULA is relatively concise'),
            (2000, 2, 'C', 1, 'This EULA is a bit long'),
            (2500, 1, 'D', 0, 'This EULA is very long'),
        )
        points, grade, rating, remark = 0, 'F', 0, 'This EULA is too long'
        for limit, band_points, band_grade, band_rating, band_remark in bands:
            if words < limit:
                points, grade = band_points, band_grade
                rating, remark = band_rating, band_remark
                break

        feedback = [
            {'rating': 3, 'text': 'This EULA is {0} words long'.format(words)},
            {'rating': rating, 'text': remark},
        ]

        return {
            'name': 'Document Length',
            'grade': grade,
            'description': 'Counts the number of words in the EULA',
            'feedback': feedback,
            'score': points,
            'max': 4,
            'numwords': words
        }
Beispiel #4
0
def score():
    """Score the feature vector posted under ``"example"`` in the request.

    Returns a JSON response containing the second-column probability from
    ``PREDICTOR.predict_proba``, the essay text of the last row of
    ``completed_df`` whose ``features_together`` equals a key built from
    three slices of the example, that essay's ARI index (1 decimal) and its
    TextBlob polarity scaled by 10 (1 decimal).
    """
    example = flask.request.json["example"]
    probabilities = PREDICTOR.predict_proba(np.matrix(example))

    # The lookup key is the concatenation of three slices of the example.
    lookup_key = tuple(
        np.concatenate((example[9:37], example[41:45], example[48:54])))
    is_match = completed_df['features_together'] == lookup_key
    matches = pd.DataFrame(completed_df[' essay'].loc[is_match])
    text = matches[' essay'].values[-1]  # last matching essay wins

    calc = readcalc.ReadCalc(text)
    neighbor_polarity = round(TextBlob(text).polarity * 10, 1)
    user_readability = round(calc.get_ari_index(), 1)

    return flask.jsonify({
        "score": probabilities[0][1],
        "project_text": text,
        "user_readability": user_readability,
        "neighbor_polarity": neighbor_polarity,
    })
def compute_readability(df, col):
    '''Add readability measures for a text column to a dataframe.

    input:
    df = input dataframe (modified in place and also returned)
    col = column of the input dataframe whose text is scored

    One ReadCalc object is built per row and each metric is written into its
    own column of that row.
    '''
    # Output column name -> name of the ReadCalc getter that fills it.
    metric_getters = (
        ('smog_index', 'get_smog_index'),
        ('flesch_reading_ease', 'get_flesch_reading_ease'),
        ('flesch_kincaid_grade_level', 'get_flesch_kincaid_grade_level'),
        ('coleman_liau_index', 'get_coleman_liau_index'),
        ('gunning_fog_index', 'get_gunning_fog_index'),
        ('ari_index', 'get_ari_index'),
        ('lix_index', 'get_lix_index'),
        ('dale_chall_score', 'get_dale_chall_score'),
        ('dale_chall_known_fraction', 'get_dale_chall_known_fraction'),
    )

    for ind, row in df.iterrows():
        calc = readcalc.ReadCalc(row[col])
        for column, getter_name in metric_getters:
            df.loc[ind, column] = getattr(calc, getter_name)()
    return df
Beispiel #6
0
    def score(eula):
        """Grade a EULA by its Flesch-Kincaid grade level.

        Reads ``eula.text`` and returns a result dict with the letter grade,
        numeric score (out of 4) and two feedback entries: the measured
        reading level and a fixed reference statement.
        """
        max_score = 4
        rl = readcalc.ReadCalc(eula.text).get_flesch_kincaid_grade_level()

        # Each tier: (exclusive upper grade-level bound, score, grade, rating).
        # A level of 12 or above matches no tier and gets the F defaults.
        tiers = (
            (8, 4, 'A', 2),
            (10, 3, 'B', 2),
            (12, 2, 'C', 1),
        )
        points, grade, rating = 0, 'F', 0
        for ceiling, tier_points, tier_grade, tier_rating in tiers:
            if rl < ceiling:
                points, grade, rating = tier_points, tier_grade, tier_rating
                break

        feedback = [
            {
                'rating': rating,
                'text': "Your EULA has a reading level of {0:.0f}".format(rl)
            },
            {
                'rating': 3,
                'text': 'The average American has a reading level of grade 8'
            },
        ]

        return {
            'name': 'Plain Language',
            'description': 'Checks the reading level of the EULA (K-12)',
            'grade': grade,
            'score': points,
            'max': max_score,
            'feedback': feedback
        }
def calc_quality_measures(review):
    """Compute surface-level quality features for one review string.

    Returns a 14-tuple, in this order: num_chars, num_words, num_sentences,
    punct_dense, capital_dense, space_dense, num_misspellings,
    portion_misspell, avg_num_syllable, word_len_entropy, gunning_fog,
    flesch_kincaid, smog, pos_entropy.

    Defaults are 0 for the counts, 0.0 for the densities/portion and None for
    the remaining six values.  An empty ``review`` returns the defaults
    unchanged.  Any exception raised while computing is caught and printed
    together with the review; values assigned before the failure are kept and
    everything after it stays at its default, so the result may be partial.
    """
    # Defaults returned for empty input or for features not reached before an error.
    num_chars, num_words, num_sentences, punct_dense, capital_dense, space_dense, num_misspellings, portion_misspell, \
    avg_num_syllable, word_len_entropy, gunning_fog, flesch_kincaid, smog, pos_entropy = \
        0, 0, 0, 0.0, 0.0, 0.0, 0, 0.0, None, None, None, None, None, None

    if len(review) > 0:
        try:
            tokenized_review = word_tokenize(review)
            # Number of characters
            num_chars = len(review)
            # Punctuation density: share of characters found in `punctuation`
            # (presumably string.punctuation - defined outside this block)
            num_punct = len([ch for ch in review if ch in punctuation])
            punct_dense = num_punct / num_chars
            # Capitalization density: share of upper-case characters
            num_capital = len([ch for ch in review if ch.isupper()])
            capital_dense = num_capital / num_chars
            # NOTE(review): start_with_capital is computed but never used or returned
            start_with_capital = review[0].isupper()
            # Space Density (percent of all characters)
            num_space = len([ch for ch in review if ch == ' '])
            space_dense = num_space / num_chars
            # Number of out-of-vocabulary words
            # (not implemented; the bare string below only describes the idea)
            '''
            To identify out-of-vocabulary words, we construct multiple lists of the k most frequent words in Yahoo! Answers, with 
            several k values ranging between 50 and 5000. These lists are then used to calculate a set of “out-of-vocabulary” 
            features, where each feature assumes the list of top-k words for some k is the vocabulary. An example feature created 
            this way is “the fraction of words in an answer that do not appear in the top-1000 words of the collection
            '''
            # Average number of syllables per token
            avg_num_syllable = np.mean(
                [syllable_count(word) for word in tokenized_review])
            # Entropy of word lengths
            # NOTE(review): the raw per-token lengths are passed here, whereas
            # pos_entropy below passes a normalized frequency distribution -
            # confirm this is the intended input for `entropy`
            word_len_entropy = entropy(
                [len(word) for word in tokenized_review])
            # Number of tokens
            num_words = len(tokenized_review)
            # Number of sentences
            num_sentences = len(sent_tokenize(review))
            # Misspellings and typos - number of spelling mistakes
            # (len() of whatever spell.unknown returns for the token list)
            misspelled = spell.unknown(tokenized_review)
            num_misspellings = len(misspelled)
            # Raises ZeroDivisionError (caught below) if no tokens were produced
            portion_misspell = num_misspellings / num_words

            # Readability:
            calc_readability = readcalc.ReadCalc(review)
            # Gunning Fog Index (6-17) 17-difficult, 6-easy
            gunning_fog = calc_readability.get_gunning_fog_index()
            # gunning_fog = textstat.gunning_fog(review)
            # Flesch-Kincaid grade level
            # NOTE(review): the previous comment described a 0-100 scale
            # (0-difficult, 100-easy), which is the Flesch Reading Ease range;
            # the method called here is the grade-level variant - confirm
            # which metric is wanted
            flesch_kincaid = calc_readability.get_flesch_kincaid_grade_level()
            # flesch_kincaid = textstat.flesch_kincaid_grade(review)
            # SMOG Grading - need at least 30 sentences
            smog = calc_readability.get_smog_index()
            # smog = textstat.smog_index(review)

            # POS tags of the tokens (tag is the second item of each pair)
            pos_tags = [item[1] for item in pos_tag(tokenized_review)]
            # Entropy of the part-of-speech tag frequency distribution
            pos_count = list(Counter(pos_tags).values())
            pos_dist = np.array(pos_count) / sum(pos_count)
            pos_entropy = entropy(pos_dist)
            # Formality score - between 0 and 100%, 0 - completely contextualizes language,
            # completely formal language - 100
            # (disabled; kept for reference)
            # noun_freq = len([pos for pos in pos_tags if pos[:2] == 'NN']) / len(tokenized_review)
            # adjective_freq = len([pos for pos in pos_tags if pos[:2] == 'JJ']) / len(tokenized_review)
            # preposition_freq = len([pos for pos in pos_tags if pos[:2] == 'IN']) / len(tokenized_review)
            # article_freq = len([pos for pos in pos_tags if pos[:2] == 'DT']) / len(tokenized_review)
            # pronoun_freq = len([pos for pos in pos_tags if pos[:2] == 'PR']) / len(tokenized_review)
            # verb_freq = len([pos for pos in pos_tags if pos[:2] == 'VB']) / len(tokenized_review)
            # adverb_freq = len([pos for pos in pos_tags if pos[:2] == 'RB']) / len(tokenized_review)
            # interjection_freq = len([pos for pos in pos_tags if pos[:2] == 'UH']) / len(tokenized_review)
            # formality_score = (noun_freq + adjective_freq + preposition_freq + article_freq -
            #                    pronoun_freq - verb_freq - adverb_freq - interjection_freq + 100) / 2

        except Exception as e:
            # Best effort: report the failure and fall through with whatever
            # was computed so far.
            print('Exception: ' + str(e))
            print('Review: ' + str(review))

    return num_chars, num_words, num_sentences, punct_dense, capital_dense, space_dense, num_misspellings, \
           portion_misspell, avg_num_syllable, word_len_entropy, gunning_fog, flesch_kincaid, smog, pos_entropy
# Offline feature script: loads the merged project CSV, keeps the rows whose
# essay is a string, and adds one readability column per metric.
# Progress messages use single-argument print(...) so the script behaves the
# same under Python 2 and Python 3.
pd.options.mode.chained_assignment = None

df = pd.read_csv("../prediction_app/static/merged_data.csv")
print("done reading csv")

essay_df = df[['_projectid', 'RESP', ' essay']]
# Non-string essay cells (the log message below calls them "floats") are
# dropped; only rows holding actual text are scored.
has_text = essay_df[' essay'].map(lambda x: isinstance(x, str))
essay_df = essay_df[has_text].copy()
print("done throwing out floats")
# Force float division: under Python 2, int / int truncates, so the ratio
# would always print as 0 or 1.  Guard against an empty input file as well.
kept_fraction = float(len(essay_df)) / len(df) if len(df) else 0.0
print("percent remaining %s" % kept_fraction)
# Byte strings (the Python 2 `str` type) are decoded to text; values that are
# already text are passed through unchanged.
essay_df['new_essay'] = essay_df[' essay'].map(
    lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
print("done decoding")

# One column per readability metric.
# NOTE(review): each metric builds a fresh ReadCalc per essay, so every essay
# is analysed six times; computing all metrics from one ReadCalc per essay
# would avoid that if runtime becomes a problem.
essay_df['ari'] = essay_df['new_essay'].map(
    lambda x: readcalc.ReadCalc(x).get_ari_index())
print("done ari")
essay_df['coleman'] = essay_df['new_essay'].map(
    lambda x: readcalc.ReadCalc(x).get_coleman_liau_index())
print("done coleman")
essay_df['flesch_grade'] = essay_df['new_essay'].map(
    lambda x: readcalc.ReadCalc(x).get_flesch_kincaid_grade_level())
print("done flesch grade")
essay_df['flesch_ease'] = essay_df['new_essay'].map(
    lambda x: readcalc.ReadCalc(x).get_flesch_reading_ease())
print("done flesch ease")
essay_df['dale'] = essay_df['new_essay'].map(
    lambda x: readcalc.ReadCalc(x).get_dale_chall_score())
print("done dale")
essay_df['gunning'] = essay_df['new_essay'].map(
    lambda x: readcalc.ReadCalc(x).get_gunning_fog_index())
Beispiel #9
0
##Read readcalc
from readcalc import readcalc
from nltk.tokenize import RegexpTokenizer  # Regex handler
import re
#calc_1 = readcalc.ReadCalc("A ladder.")
#calc = readcalc.ReadCalc("""In the mid-1970’s, Walter Alvarez, a geologist, was studying Earth’s polarity. It had recently been learned that the orientation of the planet’s magnetic field reverses, so that every so often, in effect, south becomes north and vice versa. Alvarez and some colleagues had found that a certain formation of pinkish limestone in Italy, known as the scaglia rossa, recorded these occasional reversals. The limestone also contained the fossilized remains of millions of tiny sea creatures called foraminifera. Alvarez became interested in a thin layer of clay in the limestone that seemed to have been laid down around the end of the Cretaceous Period. Below the layer, certain species of foraminifera—or forams, for short—were preserved. In the clay layer, there were no forams. Above the layer, the earlier species disappeared and new forams appeared. Having been taught the uniformitarian view, which held that any apparent extinctions throughout geological time resulted from the incompleteness of the fossil record’ rather than an actual extinction, Alvarez was not sure what to make of the lacuna in geological time corresponding to the missing foraminifera, because the change looked very abrupt""")
# Sample passage about how living things adapt to their environment.
# NOTE(review): the literal contains irregular double spaces and a repeated
# fragment ("behavior  that makes it more likely to survive. It can"), which
# looks like a copy/paste artifact - confirm before relying on its scores.
text_4 = "Living things adapt to their environment so they  can  survive . An organism  adapts  when it develops  a  behavior  that makes it more likely to survive. It can  behavior  that makes it more likely to survive. It can  behavior also adapt by forming a physical characteristic or body  part that helps it survive. In a forest biome, some trees grow taller than the  other plants around them. This lets them reach the  sunlight. Growing taller is an adaptation that helps  trees survive. Shorter plants have adapted with their  behavior. They have learned to live in the shade with  less sunlight.  Animals in the forest have a wide variety of  adaptations. Monkeys have long tails. They can use  them almost like another hand. This helps them swing  quickly through the tops of trees. They can even  do this while holding their babies or gathering food.  Giraffes need to reach leaves at the tops of tall trees.  Having a long neck is an adaptation that allows them  to do this. Some animals adaptations prevent other animals  from wanting to eat them. A skunk’s horrible smell  makes larger animals choose something else to eat.  Even plants sometimes protect themselves in this  way. Roses and acacia trees both have dangerous  thorns. The thorns prevent animals from eating  their leaves."

# Sample passage about red kangaroos, written as a single-line triple-quoted string.
text_3 = """What Australian mammal can leap 25 feet in one hop and move for short periods at 35 miles an hour? The red kangaroo. A full grown male stands as tall as a six foot person and weighs 200 pounds. This is slightly bigger than the grey kangaroo, making it the world’s largest marsupial. What’s a marsupial? A mammal where the mother has a pouch for carrying, feeding and protecting her young. While a red kangaroo may be the largest marsupial, the newborn baby is tiny, under an inch long. After a few months of sleeping, nursing and growing in mom’s stomach pouch the young kangaroo (joey) begins to come out. But it hurries back to the pouch fast when frightened, hungry or cold. Eventually, the joey gets so big it hangs out of the pouch. Then, at eight months old, it stays out. But the joey remains close to mom until ready to live on its own. Red kangaroos are good swimmers. However, they are best known for their hopping abilities. Their long, powerful hind legs have big feet. Hopping moves them quickly over their grassy, shrubby and desert habitats. Meanwhile, a thick tail helps them balance and steer. What do red kangaroos eat? Grass, leaves and other vegetation. And guess what - they often regurgitate food and chew their cud just like a cow. The red kangaroo’s vegetarian diet provides much of its water. It can also go long periods without drinking. Staying in the shade, panting and limiting most activity to nighttime helps the red kangaroo conserve water and stay cool. Red kangaroos travel together in groups called mobs. Mobs include both males and females, with one male being dominant. Males show their dominance by “boxing” with other males. They balance on their tails and try pushing each other off balance with their forearms or by kicking their hind legs. This kicking ability, along with their sharp claws, can also be used by kangaroos to defend against Australia’s wild dog, the dingo. """
#text_3 = "See spot run very quickly"
#text_3_clean = re.sub("([’]\w)", "", text_3)

#text_3_clean = re.sub("(\d+\s)", "", text_3_clean)

# Readability calculator for text_3, built with keyword arguments.
# NOTE(review): preprocesshtml='justext' is requested although text_3 is a
# plain-text literal - confirm that HTML preprocessing is intended here.
calc_3 = readcalc.ReadCalc(preprocesshtml='justext', text=text_3)

# Disabled scratch code: manual lower-casing and regex tokenization of text_3.
#text = text_3.lower()
#tokenizer = RegexpTokenizer(r'\w+')
#tokens = tokenizer.tokenize(text)
#tokens
# Readability calculator for a minimal two-sentence sample.
calc_2 = readcalc.ReadCalc("The man is tall. He has a house.")


class text:
    def __init__(self, calc):
        self.num_sentences = len(calc.get_sentences())
        self.num_words = len(calc.get_words())

        def __get_number_chars(words):
            """
                Returns the total number of chars in the text.
Beispiel #10
0
# Output workbook (scores) and input workbook (tweets); the active sheet of
# each is used.
wb = Workbook()
sheet = wb.active
wbin = load_workbook(filename="yale_tweets.xlsx")
wbinsh = wbin.active

# Header row of the output sheet, one title per score column A..I.
for column, title in zip("ABCDEFGHI", (
        "Flesh Reading Ease",
        "Flesh Kincaid Grade Level",
        "Coleman Liau Index",
        "Gunning Fog Index",
        "SMOG Index",
        "ARI Index",
        "LIX Index",
        "Dale-Chall Score",
        "TTR Simple",
)):
    sheet[column + "1"] = title

# Input row i (column K) is scored into output row i + 1.  Only cells whose
# string form splits into more than 15 space-separated pieces are scored.
for i in range(1, 144332):
    tweet = wbinsh["K" + str(i)].value
    if len(str(tweet).split(" ")) <= 15:
        continue
    calc = readcalc.ReadCalc(tweet)
    tokenized = word_tokenize(str(tweet))
    out_row = str(i + 1)
    sheet["A" + out_row] = calc.get_flesch_reading_ease()
    sheet["B" + out_row] = calc.get_flesch_kincaid_grade_level()
    sheet["C" + out_row] = calc.get_coleman_liau_index()
    sheet["D" + out_row] = calc.get_gunning_fog_index()
    sheet["E" + out_row] = calc.get_smog_index()
    sheet["F" + out_row] = calc.get_ari_index()
    sheet["G" + out_row] = calc.get_lix_index()
    sheet["H" + out_row] = calc.get_dale_chall_score()
    sheet["I" + out_row] = ld.ttr(tokenized)

wb.save(filename="yale_scores.xlsx")
Beispiel #11
0
# Per-text feature lists; every list is aligned with Textos by position.
n_texts = len(Textos)

# Stop-word counts are appended to the list created earlier in the script.
nstopwords.extend(nSW(Textos[i], setSW) for i in range(n_texts))

#print(len(nstopwords))
#print(nstopwords);exit(1)

# Syllable count of each text (Brazilian Portuguese rules).
nsyllable = [
    textstat.syllable_count(Textos[i], lang='pt_BR') for i in range(n_texts)
]

#print(len(nsyllable))
#print(nsyllable)

# SMOG index of each text, truncated to an integer.
SMOGindex = [
    int(readcalc.ReadCalc(Textos[i]).get_smog_index())
    for i in range(n_texts)
]

#print(len(SMOGindex))
#print(SMOGindex);exit(1)

# Lexical diversity of each text, scaled by 1000 and truncated to an integer.
LD = [lexical_diversity(Textos[i]) for i in range(n_texts)]
LD = [int(1000 * value) for value in LD]

#print(len(LD))
#print(LD);exit(1)

#type-token ratio
Beispiel #12
0
def fogIndex(text):
    """Return the Gunning fog index that ReadCalc reports for ``text``."""
    calculator = readcalc.ReadCalc(text)
    return calculator.get_gunning_fog_index()