def calcFeatures(params): index, rev = params # Multiprocessing... global rev_xl filename = "insert data path of the 2015 data from https://figshare.com/articles/English_Wikipedia_Quality_Asssessment_Dataset/1375406" + str( rev['revid']) if (os.path.exists(filename)): print(rev['revid']) text = util.read_file(filename) text = util.cleanhtml(text) text = text.replace('\'\'\'', '') assert rev['pageid'] == rev_xl.iloc[index, 0] print("matched ", rev['revid']) calc = readcalc.ReadCalc(text) textual_score = list(calc.get_all_metrics()) text_stat = textstatistics() linsear_write_formula = round(text_stat.linsear_write_formula(text),2) textual_score.append(linsear_write_formula) grammar_score = len(tool.check(text)) textual_score.append(grammar_score) rev_xl.iloc[index, 14:36] = textual_score print(rev_xl.iloc[index, :]) if index % 10 == 0: rev_xl.to_csv(path)
def grade_essay():
    """Flask endpoint: score the posted essay for readability and sentiment.

    Expects a JSON body with an "essay" key.  Responds with JSON carrying
    "readability" (ARI index, 2 decimals) and "polarity" (TextBlob polarity
    scaled by 10, 1 decimal).
    """
    essay = flask.request.json["essay"]
    readability = round(readcalc.ReadCalc(essay).get_ari_index(), 2)
    sentiment = round(TextBlob(essay).polarity * 10, 1)
    return flask.jsonify({"readability": readability, "polarity": sentiment})
def score(eula):
    """Grade a EULA on document length.

    Word-count bands (best first): A < 1200, B < 1700, C < 2000, D < 2500,
    F otherwise.  Returns the standard scorer dict plus 'numwords'.
    """
    name = 'Document Length'
    description = 'Counts the number of words in the EULA'
    num_words = len(readcalc.ReadCalc(eula.text).get_words())

    feedback = [{
        'rating': 3,
        'text': 'This EULA is {0} words long'.format(num_words)
    }]

    # (exclusive upper bound, score, grade, feedback rating, feedback text)
    bands = [
        (1200, 4, 'A', 2, 'This EULA is short and succinct'),
        (1700, 3, 'B', 1, 'This EULA is relatively concise'),
        (2000, 2, 'C', 1, 'This EULA is a bit long'),
        (2500, 1, 'D', 0, 'This EULA is very long'),
        (float('inf'), 0, 'F', 0, 'This EULA is too long'),
    ]
    for limit, band_score, band_grade, rating, note in bands:
        if num_words < limit:
            result_score, grade = band_score, band_grade
            feedback.append({'rating': rating, 'text': note})
            break

    return {
        'name': name,
        'grade': grade,
        'description': description,
        'feedback': feedback,
        'score': result_score,
        'max': 4,
        'numwords': num_words
    }
def score():
    """Flask endpoint: predict a probability for the posted feature vector
    and return the stored essay whose features match, with its readability
    and sentiment.

    Expects JSON with an "example" key holding a flat numeric feature list.
    """
    data = flask.request.json
    x = np.matrix(data["example"])
    # Class probability from the pre-loaded global model.
    score = PREDICTOR.predict_proba(x)
    # Slice out the feature subranges used as the lookup key.
    # NOTE(review): the ranges 9:37 / 41:45 / 48:54 presumably mirror how
    # 'features_together' was built elsewhere — confirm against that code.
    a = data["example"][9:37]
    b = data["example"][41:45]
    c = data["example"][48:54]
    short_x = np.concatenate((a, b, c))
    short_x = tuple(short_x)
    # Select stored essays whose feature tuple matches exactly; .values[-1]
    # takes the last match (raises IndexError when there is no match).
    ind = pd.DataFrame(completed_df[' essay'].loc[completed_df['features_together'] == short_x])
    text = ind[' essay'].values[-1]
    calc = readcalc.ReadCalc(text)
    polarity = TextBlob(text).polarity
    polarity = round(polarity * 10, 1)  # scale TextBlob polarity by 10
    user_readability = round(calc.get_ari_index(), 1)
    # score[0][1]: presumably P(positive class) — confirm model class order.
    results = {"score": score[0][1],
               "project_text": text,
               "user_readability": user_readability,
               "neighbor_polarity": polarity}
    return flask.jsonify(results)
def compute_readability(df, col):
    """Add one column per readability metric to ``df``, computed from the
    text in column ``col`` of each row, and return the mutated frame.

    input: df = input dataframe
           col = column of the input dataframe for which the readability
                 scores will be calculated
    """
    # Output column -> ReadCalc getter, in the original column order.
    metric_getters = {
        'smog_index': 'get_smog_index',
        'flesch_reading_ease': 'get_flesch_reading_ease',
        'flesch_kincaid_grade_level': 'get_flesch_kincaid_grade_level',
        'coleman_liau_index': 'get_coleman_liau_index',
        'gunning_fog_index': 'get_gunning_fog_index',
        'ari_index': 'get_ari_index',
        'lix_index': 'get_lix_index',
        'dale_chall_score': 'get_dale_chall_score',
        'dale_chall_known_fraction': 'get_dale_chall_known_fraction',
    }
    for ind, row in df.iterrows():
        calc = readcalc.ReadCalc(row[col])
        for column, getter in metric_getters.items():
            df.loc[ind, column] = getattr(calc, getter)()
    return df
def score(eula):
    """Grade the EULA's Flesch-Kincaid reading level (K-12 scale).

    Bands: A < grade 8, B < 10, C < 12, else F (no D band).  Returns the
    standard scorer dict.
    """
    name = 'Plain Language'
    description = 'Checks the reading level of the EULA (K-12)'
    rl = readcalc.ReadCalc(eula.text).get_flesch_kincaid_grade_level()

    if rl < 8:
        score, grade, rating = 4, 'A', 2
    elif rl < 10:
        score, grade, rating = 3, 'B', 2
    elif rl < 12:
        score, grade, rating = 2, 'C', 1
    else:
        score, grade, rating = 0, 'F', 0

    feedback = [
        {
            'rating': rating,
            'text': "Your EULA has a reading level of {0:.0f}".format(rl)
        },
        {
            'rating': 3,
            'text': 'The average American has a reading level of grade 8'
        },
    ]
    return {
        'name': name,
        'description': description,
        'grade': grade,
        'score': score,
        'max': 4,
        'feedback': feedback
    }
def calc_quality_measures(review):
    """Compute a bundle of text-quality features for a single review string.

    Returns a 14-tuple: (num_chars, num_words, num_sentences, punct_dense,
    capital_dense, space_dense, num_misspellings, portion_misspell,
    avg_num_syllable, word_len_entropy, gunning_fog, flesch_kincaid, smog,
    pos_entropy).  Counts default to 0 and the derived metrics to None; any
    exception inside the try block is printed and whatever was computed so
    far is returned instead of raising.
    """
    num_chars, num_words, num_sentences, punct_dense, capital_dense, space_dense, num_misspellings, portion_misspell, \
        avg_num_syllable, word_len_entropy, gunning_fog, flesch_kincaid, smog, pos_entropy = \
        0, 0, 0, 0.0, 0.0, 0.0, 0, 0.0, None, None, None, None, None, None
    if len(review) > 0:
        try:
            tokenized_review = word_tokenize(review)
            # Number of characters
            num_chars = len(review)
            # Punctuation
            num_punct = len([ch for ch in review if ch in punctuation])
            punct_dense = num_punct / num_chars
            # Capitalization
            num_capital = len([ch for ch in review if ch.isupper()])
            capital_dense = num_capital / num_chars
            # NOTE(review): computed but never returned or used.
            start_with_capital = review[0].isupper()
            # Space Density (percent of all characters)
            num_space = len([ch for ch in review if ch == ' '])
            space_dense = num_space / num_chars
            # Number of out-of-vocabulary words
            ''' To identify out-of-vocabulary words, we construct multiple lists of the k most frequent words in Yahoo! Answers, with several k values ranging between 50 and 5000. These lists are then used to calculate a set of “out-of-vocabulary” features, where each feature assumes the list of top-k words for some k is the vocabulary. 
An example feature created this way is “the fraction of words in an answer that do not appear in the top-1000 words of the collection '''
            # Average number of syllables per word
            avg_num_syllable = np.mean(
                [syllable_count(word) for word in tokenized_review])
            # entropy of word lengths
            word_len_entropy = entropy(
                [len(word) for word in tokenized_review])
            # Word length
            num_words = len(tokenized_review)
            # Num sentences
            num_sentences = len(sent_tokenize(review))
            # Misspellings and typos - number of spelling mistakes
            misspelled = spell.unknown(tokenized_review)
            num_misspellings = len(misspelled)
            # NOTE(review): divides by the token count — ZeroDivisionError if
            # tokenization yields no words; swallowed by the except below.
            portion_misspell = num_misspellings / num_words
            # Readability:
            calc_readability = readcalc.ReadCalc(review)
            # Gunning Fog Index (6-17) 17-difficult, 6-easy
            gunning_fog = calc_readability.get_gunning_fog_index()
            # gunning_fog = textstat.gunning_fog(review)
            # Flesch Kincaid Formula (0-100) 0-difficult, 100-easy
            flesch_kincaid = calc_readability.get_flesch_kincaid_grade_level()
            # flesch_kincaid = textstat.flesch_kincaid_grade(review)
            # SMOG Grading - need at least 30 sentences
            smog = calc_readability.get_smog_index()
            # smog = textstat.smog_index(review)
            # POS - %Nouns, %Verbs
            pos_tags = [item[1] for item in pos_tag(tokenized_review)]
            # Entropy of the part-of-speech tags
            pos_count = list(Counter(pos_tags).values())
            pos_dist = np.array(pos_count) / sum(pos_count)
            pos_entropy = entropy(pos_dist)
            # Formality score - between 0 and 100%, 0 - completely contextualizes language,
            # completely formal language - 100
            # noun_freq = len([pos for pos in pos_tags if pos[:2] == 'NN']) / len(tokenized_review)
            # adjective_freq = len([pos for pos in pos_tags if pos[:2] == 'JJ']) / len(tokenized_review)
            # preposition_freq = len([pos for pos in pos_tags if pos[:2] == 'IN']) / len(tokenized_review)
            # article_freq = len([pos for pos in pos_tags if pos[:2] == 'DT']) / len(tokenized_review)
            # pronoun_freq = len([pos for pos in pos_tags if pos[:2] == 'PR']) / len(tokenized_review)
            # verb_freq = len([pos for pos in pos_tags if pos[:2] == 'VB']) / len(tokenized_review)
            # adverb_freq = len([pos for pos in pos_tags if pos[:2] == 'RB']) / len(tokenized_review)
            # interjection_freq = len([pos for pos in pos_tags if pos[:2] == 'UH']) / len(tokenized_review)
            # formality_score = (noun_freq + adjective_freq + preposition_freq + article_freq -
            #                    pronoun_freq - verb_freq - adverb_freq - interjection_freq + 100) / 2
        except Exception as e:
            # Best-effort extraction: log the failure and fall through so the
            # caller still gets a tuple of defaults/partial values.
            print('Exception: ' + str(e))
            print('Review: ' + str(review))
    return num_chars, num_words, num_sentences, punct_dense, capital_dense, space_dense, num_misspellings, \
        portion_misspell, avg_num_syllable, word_len_entropy, gunning_fog, flesch_kincaid, smog, pos_entropy
pd.options.mode.chained_assignment = None df = pd.read_csv("../prediction_app/static/merged_data.csv") print "done reading csv" essay_df = df[['_projectid', 'RESP', ' essay']] essay_df['new_essay'] = essay_df[' essay'].map(lambda x: type(x)) essay_df = essay_df[essay_df.new_essay == str] print "done throwing out floats" print "percent remaining", len(essay_df) / len(df) essay_df.new_essay = essay_df[' essay'].map(lambda x: x.decode('utf-8')) print "done decoding" essay_df['ari'] = essay_df['new_essay'].map( lambda x: readcalc.ReadCalc(x).get_ari_index()) print "done ari" essay_df['coleman'] = essay_df['new_essay'].map( lambda x: readcalc.ReadCalc(x).get_coleman_liau_index()) print "done coleman" essay_df['flesch_grade'] = essay_df['new_essay'].map( lambda x: readcalc.ReadCalc(x).get_flesch_kincaid_grade_level()) print "done flesch grade" essay_df['flesch_ease'] = essay_df['new_essay'].map( lambda x: readcalc.ReadCalc(x).get_flesch_reading_ease()) print "done flesch ease" essay_df['dale'] = essay_df['new_essay'].map( lambda x: readcalc.ReadCalc(x).get_dale_chall_score()) print "done dale" essay_df['gunning'] = essay_df['new_essay'].map( lambda x: readcalc.ReadCalc(x).get_gunning_fog_index())
##Read readcalc from readcalc import readcalc from nltk.tokenize import RegexpTokenizer # Regex handler import re #calc_1 = readcalc.ReadCalc("A ladder.") #calc = readcalc.ReadCalc("""In the mid-1970’s, Walter Alvarez, a geologist, was studying Earth’s polarity. It had recently been learned that the orientation of the planet’s magnetic field reverses, so that every so often, in effect, south becomes north and vice versa. Alvarez and some colleagues had found that a certain formation of pinkish limestone in Italy, known as the scaglia rossa, recorded these occasional reversals. The limestone also contained the fossilized remains of millions of tiny sea creatures called foraminifera. Alvarez became interested in a thin layer of clay in the limestone that seemed to have been laid down around the end of the Cretaceous Period. Below the layer, certain species of foraminifera—or forams, for short—were preserved. In the clay layer, there were no forams. Above the layer, the earlier species disappeared and new forams appeared. Having been taught the uniformitarian view, which held that any apparent extinctions throughout geological time resulted from the incompleteness of the fossil record’ rather than an actual extinction, Alvarez was not sure what to make of the lacuna in geological time corresponding to the missing foraminifera, because the change looked very abrupt""") text_4 = "Living things adapt to their environment so they can survive . An organism adapts when it develops a behavior that makes it more likely to survive. It can behavior that makes it more likely to survive. It can behavior also adapt by forming a physical characteristic or body part that helps it survive. In a forest biome, some trees grow taller than the other plants around them. This lets them reach the sunlight. Growing taller is an adaptation that helps trees survive. Shorter plants have adapted with their behavior. They have learned to live in the shade with less sunlight. 
Animals in the forest have a wide variety of adaptations. Monkeys have long tails. They can use them almost like another hand. This helps them swing quickly through the tops of trees. They can even do this while holding their babies or gathering food. Giraffes need to reach leaves at the tops of tall trees. Having a long neck is an adaptation that allows them to do this. Some animals adaptations prevent other animals from wanting to eat them. A skunk’s horrible smell makes larger animals choose something else to eat. Even plants sometimes protect themselves in this way. Roses and acacia trees both have dangerous thorns. The thorns prevent animals from eating their leaves." text_3 = """What Australian mammal can leap 25 feet in one hop and move for short periods at 35 miles an hour? The red kangaroo. A full grown male stands as tall as a six foot person and weighs 200 pounds. This is slightly bigger than the grey kangaroo, making it the world’s largest marsupial. What’s a marsupial? A mammal where the mother has a pouch for carrying, feeding and protecting her young. While a red kangaroo may be the largest marsupial, the newborn baby is tiny, under an inch long. After a few months of sleeping, nursing and growing in mom’s stomach pouch the young kangaroo (joey) begins to come out. But it hurries back to the pouch fast when frightened, hungry or cold. Eventually, the joey gets so big it hangs out of the pouch. Then, at eight months old, it stays out. But the joey remains close to mom until ready to live on its own. Red kangaroos are good swimmers. However, they are best known for their hopping abilities. Their long, powerful hind legs have big feet. Hopping moves them quickly over their grassy, shrubby and desert habitats. Meanwhile, a thick tail helps them balance and steer. What do red kangaroos eat? Grass, leaves and other vegetation. And guess what - they often regurgitate food and chew their cud just like a cow. 
The red kangaroo’s vegetarian diet provides much of its water. It can also go long periods without drinking. Staying in the shade, panting and limiting most activity to nighttime helps the red kangaroo conserve water and stay cool. Red kangaroos travel together in groups called mobs. Mobs include both males and females, with one male being dominant. Males show their dominance by “boxing” with other males. They balance on their tails and try pushing each other off balance with their forearms or by kicking their hind legs. This kicking ability, along with their sharp claws, can also be used by kangaroos to defend against Australia’s wild dog, the dingo. """ #text_3 = "See spot run very quickly" #text_3_clean = re.sub("([’]\w)", "", text_3) #text_3_clean = re.sub("(\d+\s)", "", text_3_clean) calc_3 = readcalc.ReadCalc(preprocesshtml='justext', text=text_3) #text = text_3.lower() #tokenizer = RegexpTokenizer(r'\w+') #tokens = tokenizer.tokenize(text) #tokens calc_2 = readcalc.ReadCalc("The man is tall. He has a house.") class text: def __init__(self, calc): self.num_sentences = len(calc.get_sentences()) self.num_words = len(calc.get_words()) def __get_number_chars(words): """ Returns the total number of chars in the text.
# Score every tweet in yale_tweets.xlsx (column K) with a battery of
# readability metrics and write the results to yale_scores.xlsx.
# Output rows are offset by one relative to the input (row 1 is headers).
wb = Workbook()
sheet = wb.active
wbin = load_workbook(filename="yale_tweets.xlsx")
wbinsh = wbin.active
# Header row of the output sheet.
sheet["A1"] = "Flesh Reading Ease"
sheet["B1"] = "Flesh Kincaid Grade Level"
sheet["C1"] = "Coleman Liau Index"
sheet["D1"] = "Gunning Fog Index"
sheet["E1"] = "SMOG Index"
sheet["F1"] = "ARI Index"
sheet["G1"] = "LIX Index"
sheet["H1"] = "Dale-Chall Score"
sheet["I1"] = "TTR Simple"
# NOTE(review): 144331 rows hard-coded — presumably the size of the input
# sheet; confirm against the workbook.
for i in range(1,144332):
    # Only score tweets with more than 15 whitespace-separated tokens;
    # presumably because readability formulas are noisy on very short text.
    if len(str(wbinsh["K"+str(i)].value).split(" "))> 15:
        calc = readcalc.ReadCalc(wbinsh["K"+str(i)].value)
        tokenized = word_tokenize(str(wbinsh["K"+str(i)].value))
        sheet["A"+str(i+1)] = calc.get_flesch_reading_ease()
        sheet["B"+str(i+1)] = calc.get_flesch_kincaid_grade_level()
        sheet["C"+str(i+1)] = calc.get_coleman_liau_index()
        sheet["D"+str(i+1)] = calc.get_gunning_fog_index()
        sheet["E"+str(i+1)] = calc.get_smog_index()
        sheet["F"+str(i+1)] = calc.get_ari_index()
        sheet["G"+str(i+1)] = calc.get_lix_index()
        sheet["H"+str(i+1)] = calc.get_dale_chall_score()
        sheet["I"+str(i+1)] = ld.ttr(tokenized)
# NOTE(review): save placed once after the loop — the flattened source is
# ambiguous about indentation; saving per row (144k saves) seems unintended.
wb.save(filename="yale_scores.xlsx")
# Stop-word counts per text (extends the list initialized earlier).
for texto in Textos:
    nstopwords.append(nSW(texto, setSW))

# Syllable counts (Brazilian Portuguese hyphenation rules).
nsyllable = [textstat.syllable_count(texto, lang='pt_BR') for texto in Textos]

# SMOG readability index, truncated to an integer.
SMOGindex = [int(readcalc.ReadCalc(texto).get_smog_index()) for texto in Textos]

# Lexical diversity, scaled by 1000 and truncated to an integer.
LD = [lexical_diversity(texto) for texto in Textos]
LD = [int(1000 * x) for x in LD]
#type-token ratio
def fogIndex(text):
    """Return the Gunning fog readability index of ``text``."""
    calc = readcalc.ReadCalc(text)
    return calc.get_gunning_fog_index()