def download(resource, article):
    article_text = download_article(article['url'])
    if article['title'] is not None and article_text is not None:
        print(article['url'])
        print(textstat.automated_readability_index(article_text))
        norm_score = int(
            textstat.automated_readability_index(article_text) / 15 * 100)
        vote = 0
        req = requests.get('http://localhost:3000/posts?title=' + article['title'])
        if len(req.json()) == 0:  # `is 0` compared identity, not value
            res = requests.post('http://localhost:3000/posts',
                                data={'title': article['title'], 'vote': 0})
        else:
            vote = req.json()[0]['vote']
        if norm_score < 65:
            easiness = "easy"
            s = 1
        elif norm_score < 75:
            easiness = "easy_medium"
            s = 2
        elif norm_score < 85:
            easiness = "medium"
            s = 3
        elif norm_score < 95:
            easiness = "medium_diff"
            s = 4
        else:
            easiness = "difficult"
            s = 5
        date = article['publishedAt'].split('T')[0].replace(" ", "")
        if article['urlToImage'] != '':
            return {
                'score': "X " + str(s),
                'resource': resource.upper().replace("-", " ").replace("THE", ""),
                'title': article['title'],
                'url': article['url'],
                'img': article['urlToImage'],
                'snippet': article['description'],
                'easiness': easiness,
                'date': date,
                'vote': vote
            }
def text_analytics(text):
    if textstat.sentence_count(text) != 0:
        lexicon = textstat.lexicon_count(text)    # word count
        sent = textstat.sentence_count(text)      # sentence count
        syll = textstat.syllable_count(text)      # syllable count
        flesch = textstat.flesch_reading_ease(text)         # Flesch score
        smog = textstat.smog_index(text)                    # SMOG index
        fog = textstat.gunning_fog(text)                    # FOG index
        dale = textstat.dale_chall_readability_score(text)  # grade level
        ari = textstat.automated_readability_index(text)    # grade level
        cl = textstat.coleman_liau_index(text)              # grade level
        # Interaction terms: each readability score scaled by the word,
        # sentence and syllable counts.
        flesch1 = lexicon * flesch
        flesch2 = sent * flesch
        flesch3 = syll * flesch
        smog1 = lexicon * smog
        smog2 = sent * smog
        smog3 = syll * smog
        fog1 = lexicon * fog
        fog2 = sent * fog
        fog3 = syll * fog
        dale1 = lexicon * dale
        dale2 = sent * dale
        dale3 = syll * dale
        ari1 = lexicon * ari
        ari2 = sent * ari
        ari3 = syll * ari
        cl1 = lexicon * cl
        cl2 = sent * cl
        cl3 = syll * cl
        x = [lexicon, sent, syll, flesch, smog, fog, dale, ari, cl,
             flesch1, flesch2, flesch3, smog1, smog2, smog3,
             fog1, fog2, fog3, dale1, dale2, dale3,
             ari1, ari2, ari3, cl1, cl2, cl3]
        return x
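# Smoke-test sketch for text_analytics(): the result is a 27-dimensional
# feature vector (9 base statistics plus 18 interaction terms), or None when
# no sentences are detected. The sample string is illustrative only.
feats = text_analytics("This is one sentence. Here is a second one. "
                       "Finally, a third sentence ends the sample.")
print(len(feats))  # 27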
def get_readability(df2):
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        df['flesch_reading_ease{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_reading_ease(x))
        df['smog_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.smog_index(x))
        df['flesch_kincaid_grade{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_kincaid_grade(x))
        df['coleman_liau_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.coleman_liau_index(x))
        df['automated_readability_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.automated_readability_index(x))
        df['dale_chall_readability_score{}'.format(i)] = df[col].apply(
            lambda x: textstat.dale_chall_readability_score(x))
        df['difficult_words{}'.format(i)] = df[col].apply(
            lambda x: textstat.difficult_words(x))
        df['linsear_write_formula{}'.format(i)] = df[col].apply(
            lambda x: textstat.linsear_write_formula(x))
        df['gunning_fog{}'.format(i)] = df[col].apply(
            lambda x: textstat.gunning_fog(x))
        df['text_standard{}'.format(i)] = df[col].apply(
            lambda x: textstat.text_standard(x))
    return df
def readability(text):
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print(str(textstat.flesch_reading_ease(text)) + "\n")
    print("Smog Index\n________________________\n\n")
    print(str(textstat.smog_index(text)) + "\n")
    print("Flesch Kincaid Grade\n________________________\n\n")
    print(str(textstat.flesch_kincaid_grade(text)) + "\n")
    print("Coleman Liau Index\n________________________\n\n")
    print(str(textstat.coleman_liau_index(text)) + "\n")
    print("ARI\n________________________\n\n")
    print(str(textstat.automated_readability_index(text)) + "\n")
    print("Dale Chall\n________________________\n\n")
    print(str(textstat.dale_chall_readability_score(text)) + "\n")
    print("Difficult Words\n________________________\n\n")
    print(str(textstat.difficult_words(text)) + "\n")
    print("Linsear Write Formula\n________________________\n\n")
    print(str(textstat.linsear_write_formula(text)) + "\n")
    print("Gunning Fog\n________________________\n\n")
    print(str(textstat.gunning_fog(text)) + "\n")
    print("Compiled Score\n_____________________________\n\n")
    print(str(textstat.text_standard(text)) + "\n")
def _get_reading_stats(no_code_text):
    """
    Returns reading level information
    :param no_code_text: String to analyse
    :return: list of details
    """
    group_by = 'Reading Level Analysis '
    results = []
    # Higher is better, scale 0 to 100
    results.append(TextFeature('Flesch Reading Ease', textstat.flesch_reading_ease(no_code_text), group_by))
    results.append(TextFeature('Flesch-Kincaid Grade Level', textstat.flesch_kincaid_grade(no_code_text), group_by))
    try:
        results.append(TextFeature('The Fog Scale (Gunning FOG formula)', textstat.gunning_fog(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('The Fog Scale (Gunning FOG formula)', "Undetermined", group_by))
    try:
        results.append(TextFeature('The SMOG Index', textstat.smog_index(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('The SMOG Index', "Undetermined", group_by))
    results.append(TextFeature('Automated Readability Index', textstat.automated_readability_index(no_code_text), group_by))
    results.append(TextFeature('The Coleman-Liau Index', textstat.coleman_liau_index(no_code_text), group_by))
    try:
        results.append(TextFeature('Linsear Write Formula', textstat.linsear_write_formula(no_code_text), group_by))
    except IndexError:
        results.append(TextFeature('Linsear Write Formula', "Undetermined", group_by))
    try:
        results.append(TextFeature('Dale Chall Readability Score', textstat.dale_chall_readability_score(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('Dale Chall Readability Score', "Undetermined", group_by))
    try:
        results.append(TextFeature('Readability Consensus', textstat.readability_consensus(no_code_text), group_by))
    except (TypeError, IndexError):
        results.append(TextFeature('Readability Consensus', "Undetermined; One of the tests above failed.", group_by))
    return results
def do_text_stats(self, text):
    ### Syllable Count
    syllable_count = textstat.syllable_count(text)
    ### Lexicon Count
    lexicon_count = textstat.lexicon_count(text, True)
    ### Sentence Count
    sentence_count = textstat.sentence_count(text)
    ### The Flesch Reading Ease formula
    # 90-100 : Very Easy      80-89 : Easy       70-79 : Fairly Easy
    # 60-69  : Standard       50-59 : Fairly Difficult
    # 30-49  : Difficult      0-29  : Very Confusing
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(text)
    except TypeError:
        flesch_reading_ease = None
    ### The Flesch-Kincaid Grade Level
    try:
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    except TypeError:
        flesch_kincaid_grade = None
    ### The Fog Scale (Gunning FOG Formula)
    gunning_fog = textstat.gunning_fog(text)
    ### The SMOG Index
    smog_index = textstat.smog_index(text)
    ### Automated Readability Index
    automated_readability_index = textstat.automated_readability_index(text)
    ### The Coleman-Liau Index
    try:
        coleman_liau_index = textstat.coleman_liau_index(text)
    except TypeError:
        coleman_liau_index = None
    ### Linsear Write Formula
    linsear_write_formula = textstat.linsear_write_formula(text)
    ### Dale-Chall Readability Score
    dale_chall_readability_score = textstat.dale_chall_readability_score(text)
    ### Readability Consensus based upon all the above tests
    try:
        text_standard = textstat.text_standard(text)
    except TypeError:
        text_standard = None
    return {
        "syllable_count": syllable_count,
        "lexicon_count": lexicon_count,
        "sentence_count": sentence_count,
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "gunning_fog": gunning_fog,
        "smog_index": smog_index,
        "automated_readability_index": automated_readability_index,
        "coleman_liau_index": coleman_liau_index,
        "linsear_write_formula": linsear_write_formula,
        "dale_chall_readability_score": dale_chall_readability_score,
        "text_standard": text_standard
    }
def _calculate_scores(self, docs):
    docs_scores = []
    for doc in docs:
        scores = {}
        scores['chars'] = ts.char_count(doc)
        scores['words'] = ts.lexicon_count(doc)
        scores['sents'] = ts.sentence_count(doc)
        # scores['syllables'] = ts.syllable_count(doc)
        scores['avg_sent_length'] = ts.avg_sentence_length(doc)
        scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc)
        scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc)
        scores['flesch'] = ts.flesch_reading_ease(doc)
        # scores['smog'] = ts.smog_index(doc)
        # scores['coleman_liau'] = ts.coleman_liau_index(doc)
        scores['automated_readability'] = ts.automated_readability_index(doc)
        # scores['linsear'] = ts.linsear_write_formula(doc)
        # scores['difficult_words'] = ts.difficult_words(doc)
        scores['dale_chall'] = ts.dale_chall_readability_score(doc)
        # scores['gunning_fog'] = ts.gunning_fog(doc)
        scores['lix'] = ts.lix(doc)
        docs_scores.append(scores)
    return docs_scores
def automated_readability_index(text):
    """
    :type text: Text
    :param text: The text to be analysed
    :rtype: float
    :returns: Automated Readability Index
    """
    return textstat.automated_readability_index(text.text)
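# Usage sketch for the wrapper above. It only assumes the argument exposes the
# raw string as a `.text` attribute; this dataclass is a hypothetical stand-in
# for whatever `Text` container the caller really uses.
from dataclasses import dataclass

@dataclass
class _SampleText:
    text: str

print(automated_readability_index(
    _SampleText("The quick brown fox jumps over the lazy dog.")))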
def vecify(v):
    return [ts.flesch_reading_ease(v),
            # ts.smog_index(v),
            ts.flesch_kincaid_grade(v),
            ts.coleman_liau_index(v),
            ts.automated_readability_index(v),
            ts.dale_chall_readability_score(v),
            ts.difficult_words(v),
            ts.linsear_write_formula(v),
            ts.gunning_fog(v)]
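# Sketch of vecify() as an ML feature extractor. It assumes `ts` is the
# textstat object, e.g. `from textstat.textstat import textstat as ts`.
features = vecify("Readability formulas estimate how hard a text is to read. "
                  "Most map onto US school grade levels. They rely on word "
                  "and sentence lengths.")
print(len(features))  # 8 scores; SMOG is commented out above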
def all_trad_scores(text):
    fre = textstat.flesch_reading_ease(text)
    fkg = textstat.flesch_kincaid_grade(text)
    smog = textstat.smog_index(text)
    cole = textstat.coleman_liau_index(text)
    ari = textstat.automated_readability_index(text)
    dale = textstat.dale_chall_readability_score(text)
    linsear = textstat.linsear_write_formula(text)
    gunning = textstat.gunning_fog(text)
    return [fre, fkg, smog, cole, ari, dale, linsear, gunning]
def reading_difficulty(self):
    diff_words = textstat.difficult_words(self.text) / self.nword
    flesch_kincaid = textstat.flesch_kincaid_grade(self.text)
    coleman_liau = textstat.coleman_liau_index(self.text)
    ari = textstat.automated_readability_index(self.text)
    dale_chall = textstat.dale_chall_readability_score(self.text)
    linsear = textstat.linsear_write_formula(self.text)
    gunning_fog = textstat.gunning_fog(self.text) - 6
    smog = textstat.smog_index(self.text)
    # Mean of seven grade-level estimates. Note that max() floors the result
    # at grade 12; if the intent is to cap at grade 12, min() is needed here.
    avg_grade = max(
        math.ceil((flesch_kincaid + coleman_liau + ari + dale_chall +
                   linsear + gunning_fog + smog) / 7), 12)
    return avg_grade, diff_words
def textstat_analysis(profile_text):
    fre = textstat.flesch_reading_ease(profile_text)
    smog = textstat.smog_index(profile_text)
    fkg = textstat.flesch_kincaid_grade(profile_text)
    coleman = textstat.coleman_liau_index(profile_text)
    ari = textstat.automated_readability_index(profile_text)
    dale = textstat.dale_chall_readability_score(profile_text)
    dw = textstat.difficult_words(profile_text)
    lwf = textstat.linsear_write_formula(profile_text)
    gf = textstat.gunning_fog(profile_text)
    rc = textstat.readability_consensus(profile_text)
    word_count = textstat.lexicon_count(profile_text)
    return (fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count)
def d():
    resp = table.scan(FilterExpression=Attr("fips").exists())
    sorted_by_fips = {'items': []}
    for item in resp['Items']:
        concat = " ".join(item['tweet'])
        # ARI grade level for the concatenated tweets (the original named
        # this `polarity`, but it is a readability score, not sentiment).
        readability = textstat.automated_readability_index(concat)
        sorted_by_fips['items'].append({
            'id': str(item['fips']),
            'rate': readability
        })
    return json.dumps(sorted_by_fips)
def get_readability(contents):
    readability = []
    readability.append(textstat.flesch_reading_ease(contents))
    readability.append(textstat.smog_index(contents))
    readability.append(textstat.flesch_kincaid_grade(contents))
    readability.append(textstat.automated_readability_index(contents))
    readability.append(textstat.dale_chall_readability_score(contents))
    readability.append(textstat.difficult_words(contents))
    readability.append(textstat.linsear_write_formula(contents))
    readability.append(textstat.gunning_fog(contents))
    readability.append(textstat.coleman_liau_index(contents))
    readability.append(textstat.text_standard(contents))
    return readability
def analyze_one(self, email):
    """ Analyzes a single email and stores results. """
    sents = tstat.sentence_count(email)
    self.sent_count.append(sents if sents > 0 else 1)
    if email and len(email) > 0:
        self.flesch_kincaid_grade.append(tstat.flesch_kincaid_grade(email))
        self.automated_readability_index.append(
            tstat.automated_readability_index(email))
        self.coleman_liau_index.append(tstat.coleman_liau_index(email))
        self.linsear_write_formula.append(
            tstat.linsear_write_formula(email))
        self.dale_chall_readability_score.append(
            tstat.dale_chall_readability_score(email))
def main():
    for arg in sys.argv[1:]:
        with open(arg) as f:
            text = f.read()
        with open(arg + '.readability.snip', 'w') as f:
            f.write("syllable_count : %s\n" % textstat.syllable_count(text))
            f.write("lexicon_count : %s\n" % textstat.lexicon_count(text))
            f.write("sentence_count : %s\n" % textstat.sentence_count(text))
            f.write("difficult_words : %s\n" % textstat.difficult_words(text))
            f.write("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
            f.write("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
            f.write("smog_index : %s\n" % textstat.smog_index(text))
            f.write("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
            f.write("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
            f.write("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
            f.write("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
def analyse_readbility(self, issue):
    """Analyse the readability of an issue body.

    :issue: the GitHub issue to analyse
    :returns: tuple of (dict of readability scores, message or None)
    """
    # An issue without a 'body' cannot be analysed: return sentinel scores
    # and a message asking for the text to be improved.
    if not issue.body:
        message = " - [ ] To improve the text in issue body.\n"
        dic_test_readbility = dict()
        dic_test_readbility['flesch'] = -1
        dic_test_readbility['ari'] = 100
        dic_test_readbility['dale-chall'] = 100
        return (dic_test_readbility, message)

    gfm = GithubMarkdown(issue.body)
    str_markdown = gfm.parse(issue.body)
    str_text = self.markdown_to_text(str_markdown)
    dic_test_readbility = dict()

    # Flesch Reading Ease score
    score_flesch = textstat.flesch_reading_ease(str_text)
    dic_test_readbility['flesch'] = score_flesch
    # Automated Readability Index (ARI)
    score_ari = textstat.automated_readability_index(str_text)
    dic_test_readbility['ari'] = score_ari
    # Dale-Chall Readability Score
    score_dale_chal = textstat.dale_chall_readability_score(str_text)
    dic_test_readbility['dale-chall'] = score_dale_chal

    if self._has_low_readbility(dic_test_readbility):
        message = ' - [ ] To improve the readability of the text.\n'
    else:
        message = None
    return (dic_test_readbility, message)
def run_textstat(text):
    # text = """Playing games has always been thought to be important to the
    # development of well-balanced and creative children; however, what part,
    # if any, they should play in the lives of adults has never been
    # researched that deeply. I believe that playing games is every bit as
    # important for adults as for children. Not only is taking time out to
    # play games with our children and other adults valuable to building
    # interpersonal relationships but is also a wonderful way to release
    # built up tension."""
    ts_flesch_reading_ease = textstat.flesch_reading_ease(text)
    ts_smog_index = textstat.smog_index(text)
    ts_flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    ts_coleman_liau_index = textstat.coleman_liau_index(text)
    ts_automated_readability_index = textstat.automated_readability_index(text)
    ts_dale_chall_readability_score = textstat.dale_chall_readability_score(text)
    ts_difficult_words = textstat.difficult_words(text)
    ts_linsear_write_formula = textstat.linsear_write_formula(text)
    ts_gunning_fog = textstat.gunning_fog(text)
    ts_text_standard = textstat.text_standard(text)
    return (ts_flesch_reading_ease, ts_smog_index, ts_flesch_kincaid_grade,
            ts_coleman_liau_index, ts_automated_readability_index,
            ts_dale_chall_readability_score, ts_difficult_words,
            ts_linsear_write_formula, ts_gunning_fog, ts_text_standard)
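# Smoke-test sketch for run_textstat(), reusing the theme of the commented-out
# sample above; the last tuple element is the text_standard consensus.
scores = run_textstat("Playing games matters for adults. It builds "
                      "relationships. It also releases built up tension.")
print(scores[-1])  # e.g. a grade-band string such as '6th and 7th grade'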
def predict_readability_level(text):
    # Convert the input text into a one-row data frame
    data = pd.DataFrame([text])
    # Create an empty data frame for features
    df = pd.DataFrame()
    # Get text features for all rows in data
    df[['commas_number', 'pronouns_number', 'modal_verbs_number',
        'personal_pronouns_number', 'wh_pronouns_number', 'function_words_number',
        'VB_tags_number', 'VBD_tags_number', 'VBG_tags_number', 'VBN_tags_number',
        'VBP_tags_number', 'nouns_number', 'proper_nouns_number', 'conjunctions_number',
        'adjectives_number', 'non_modal_verbs_number', 'interjections_number',
        'adverbs_number', 'determiners_number']] \
        = data[0].apply(lambda x: pd.Series(text_features(x)))
    # Calculate all readability equations for all rows in data
    df["Flesch_Reading_Ease_score"] = data[0].apply(lambda x: flesch_reading_ease(x))
    df["Flesch_Kincaid_Grade_Level"] = data[0].apply(lambda x: textstat.flesch_kincaid_grade(x))
    df["Fog_Scale"] = data[0].apply(lambda x: gunning_fog(x))
    df["SMOG_Index"] = data[0].apply(lambda x: smog_index(x))
    df["Automated_Readability_Index"] = data[0].apply(lambda x: textstat.automated_readability_index(x))
    df["Coleman_Liau_Index"] = data[0].apply(lambda x: textstat.coleman_liau_index(x))
    df["Linsear_Write_Formula"] = data[0].apply(lambda x: textstat.linsear_write_formula(x))
    df["Dale_Chall_Readability_Score"] = data[0].apply(lambda x: dale_chall_readability_score(x))
    # Get the text parameters used in the readability equations for all rows
    df[['Word_count', 'Sentence_count', 'Average_Sentence_length',
        'Syllable_Count', 'Average_syllables_per_words', 'poly_syllable_count',
        'Lexical_Count', 'average_poly_syllable', 'long_word', 'average_long_word',
        'average_word_length']] = data[0].apply(lambda x: pd.Series(text_param(x)))
    # Drop bad features
    df = df.drop('long_word', axis=1)
    df = df.drop('average_long_word', axis=1)
    # Load the readability model for prediction
    Readability_model = load_pkl("Readability_model.pkl")
    # Use the model to predict the right class based on the features
    result = Readability_model.predict(df)
    return str(result[0])
def automated_readability_index(text):
    score = textstat.automated_readability_index(text)
    level = 0
    if 0 < score < 6:
        level = 1
    elif 6 <= score < 8:
        level = 2
    elif 8 <= score < 10:
        level = 3
    elif 10 <= score < 11:
        level = 4
    elif 11 <= score < 12:
        level = 5
    elif 12 <= score < 13:
        level = 6
    elif 13 <= score:
        level = 7
    return level
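# Bucketing sketch: a short, simple sentence should land in a low level and a
# clause-heavy one higher (exact values depend on the textstat version).
print(automated_readability_index("The cat sat on the mat."))
print(automated_readability_index(
    "Notwithstanding considerable methodological heterogeneity, the "
    "meta-analysis demonstrated statistically significant improvements."))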
def analyze2(text):
    # Automatically reject whitespace-only input and URLs
    if text.isspace():
        return -1.0
    if text.startswith('http'):
        return -1.0
    # Analyze the text
    try:
        x = textstat.automated_readability_index(text)
    except Exception:
        return -1.0
    # Keep outputs valid
    if not isinstance(x, float):
        return -1.0
    if x < 0:
        return -1.0
    return x
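# Guard-behaviour sketch: whitespace and URL inputs short-circuit to -1.0;
# everything else yields a non-negative ARI float or -1.0.
assert analyze2("   ") == -1.0
assert analyze2("http://example.com") == -1.0
print(analyze2("This sentence should produce a real ARI score."))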
def get_feat_readability_metrics(self):
    # https://github.com/shivam5992/textstat
    try:
        test_data = self.webscrap.get_body()
        out = []
        out.append(textstat.flesch_reading_ease(test_data))
        out.append(textstat.smog_index(test_data))
        out.append(textstat.flesch_kincaid_grade(test_data))
        out.append(textstat.coleman_liau_index(test_data))
        out.append(textstat.automated_readability_index(test_data))
        out.append(textstat.dale_chall_readability_score(test_data))
        out.append(textstat.difficult_words(test_data))
        out.append(textstat.linsear_write_formula(test_data))
        out.append(textstat.gunning_fog(test_data))
        # out.append(textstat.text_standard(test_data))
        return out, False
    except Exception as e:
        config.logger.error(repr(e))
        return MISSING_FEATURE * 9, True
def lambda_handler(event, context):
    text = event['text']
    response = {}
    response['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    response['smog_index'] = textstat.smog_index(text)
    response['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    response['coleman_liau_index'] = textstat.coleman_liau_index(text)
    response['automated_readability_index'] = textstat.automated_readability_index(text)
    response['dale_chall_readability_score'] = textstat.dale_chall_readability_score(text)
    response['difficult_words'] = textstat.difficult_words(text)
    response['linsear_write_formula'] = textstat.linsear_write_formula(text)
    response['gunning_fog'] = textstat.gunning_fog(text)
    response['text_standard'] = textstat.text_standard(text)
    return respond(None, response)
def getFeatures(files):
    allStructuralLex = []
    allFileData = []
    allScores = []
    allFres = []
    allAri = []
    df = files
    rowNum = df.shape[0]
    for i in range(rowNum):
        structural_lex = []
        review = (df.iloc[i].reviewText).strip().split()
        summary = (df.iloc[i].summary).strip().split()
        row_data = np.append(review, summary)
        score = float(df.iloc[i].overall)
        fres = textstat.flesch_reading_ease(str(row_data))
        lor = lengthOfReview(str(row_data))
        sc = sentenceCount(str(row_data))
        cc = charCount(str(row_data))
        acc = allCapCount(str(row_data))
        qc = questionCount(str(row_data))
        bg = bigramCount(str(row_data))
        ari = textstat.automated_readability_index(str(row_data))
        review = [element.lower() for element in review]
        structural_lex.append(lor)
        structural_lex.append(sc)
        structural_lex.append(cc)
        structural_lex.append(acc)
        structural_lex.append(qc)
        structural_lex.append(bg)
        allFileData.append(review)
        allScores.append(score)
        allFres.append(fres)
        allAri.append(ari)
        allStructuralLex.append(structural_lex)
    return allFileData, allStructuralLex, allScores, allFres, allAri
def feature_readability(essay):
    syllable_count = textstat.syllable_count(essay)  # syllable count
    flesch_reading_ease = textstat.flesch_reading_ease(essay)  # readability score between 0 and 100
    smog_index = textstat.smog_index(essay)  # SMOG index: readability grade, more precise and easier to compute
    flesch_kincaid_index = textstat.flesch_kincaid_grade(essay)  # Flesch-Kincaid grade level
    coleman_liau_index = textstat.coleman_liau_index(essay)  # grade level of the text
    automated_readability_index = textstat.automated_readability_index(essay)  # ARI: approximate grade needed to understand the text
    dale_chall_readability_score = textstat.dale_chall_readability_score(essay)  # grade level, based on the most common English words
    difficult_words = textstat.difficult_words(essay)
    linsear_write_formula = textstat.linsear_write_formula(essay)  # grade level of the text
    gunning_fog = textstat.gunning_fog(essay)  # FOG index: reading difficulty of the text
    return (syllable_count, flesch_reading_ease, smog_index,
            flesch_kincaid_index, coleman_liau_index,
            automated_readability_index, dale_chall_readability_score,
            difficult_words, linsear_write_formula, gunning_fog)
def analyseText():
    values = request.get_json()
    required = ['inputText']
    if not all(k in values for k in required):
        return 'Missing values', 400
    text = values['inputText']
    result = {
        'syllable_count': textstat.syllable_count(text),
        'lexicon_count': textstat.lexicon_count(text),
        'sentence_count': textstat.sentence_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text)
    }
    return jsonify(result), 200
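# Invocation sketch for the JSON endpoint above, assuming it is registered as
# a POST route on a Flask app named `app` (route path and app name are
# assumptions; adjust to the real module):
#
#   with app.test_client() as client:
#       rv = client.post('/analyseText', json={'inputText': 'Text to score.'})
#       print(rv.get_json()['flesch_reading_ease'])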
def calculate_readability_measures(id):
    """ Count the words in doc and update the document. """
    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    # count = len(source['content'].split())
    try:
        measures = {
            'flesch': textstat.flesch_reading_ease(source['content']),
            'smog': textstat.smog_index(source['content']),
            'flesch_kincaid': textstat.flesch_kincaid_grade(source['content']),
            'coleman_liau': textstat.coleman_liau_index(source['content']),
            'readability': textstat.automated_readability_index(source['content']),
            'dale_chall': textstat.dale_chall_readability_score(source['content']),
            'difficult_words': textstat.difficult_words(source['content']),
            'linsear_write_formula': textstat.linsear_write_formula(source['content']),
            'gunning_fog': textstat.gunning_fog(source['content']),
            'consensus': textstat.readability_consensus(source['content']),
        }
        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception:
        # Swallow scoring errors so one bad document does not break indexing
        pass
def getContext(files):
    allFileData = []
    allScores = []
    allFres = []
    allAri = []
    df = files
    rowNum = df.shape[0]
    for i in range(rowNum):
        review = (df.iloc[i].reviewText).strip().split()
        summary = (df.iloc[i].summary).strip().split()
        row_data = np.append(review, summary)
        score = float(df.iloc[i].overall)
        fres = textstat.flesch_reading_ease(str(row_data))
        ari = textstat.automated_readability_index(str(row_data))
        review = [element.lower() for element in review]
        allFileData.append(review)
        allScores.append(score)
        allFres.append(fres)
        allAri.append(ari)
    return allFileData, allScores, allFres, allAri
def text_stats(corpus):
    tk = TweetTokenizer(preserve_case=False, reduce_len=True,
                        strip_handles=True).tokenize
    toks = [tk(entry) for entry in corpus]
    # Get function words
    with open('function_words.txt') as file:
        funcs = file.read().split(',')
    funcs = [f.strip() for f in funcs]
    amb = ambiguity(toks)  # calculate ambiguity
    matrix = [["Chars/Word", "Lexical Diversity", "Lexical Density",
               "Function Words", "Syllables", "ARI"]]
    for tokens, sentence in zip(toks, corpus):
        unique = set(tokens)
        avchar = 0
        lexdiv = 0
        lexden = 0
        nfunc = 0
        numsyl = 0
        ari = 0
        if len(sentence) > 1:
            lexdiv = len(unique) / len(tokens)  # lexical diversity
            lexden = len([x for x in tokens if x not in funcs]) / len(tokens)  # lexical density
            numsyl = textstat.syllable_count(sentence) / len(tokens) / 10  # syllables per token
            # may be a bit dodgy without punctuation
            ari = abs(textstat.automated_readability_index(sentence)) / 14  # Automated Readability Index
            for t in tokens:
                avchar += len(t) / len(tokens) / len(sentence)  # average number of chars
                if t in funcs:
                    nfunc += 1 / len(tokens)  # number of function words
        matrix.append([avchar, lexdiv, lexden, nfunc, numsyl, ari])
    matrix = [m + [a] for m, a in zip(matrix, amb)]
    return np.array(matrix)
def stats(self, text):
    test_data = text
    stats = {}
    stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
    stats['smog'] = textstat.smog_index(test_data)
    stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
    stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
    stats['automated'] = textstat.automated_readability_index(test_data)
    stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
    stats['difficult'] = textstat.difficult_words(test_data)
    stats['linsear'] = textstat.linsear_write_formula(test_data)
    stats['gunning_fog'] = textstat.gunning_fog(test_data)
    stats['standard'] = textstat.text_standard(test_data)
    stats['charcount'] = textstat.char_count(test_data)
    stats['lexicon count'] = textstat.lexicon_count(test_data)
    stats['syllable count'] = textstat.syllable_count(test_data)
    stats['sentence count'] = textstat.sentence_count(test_data)
    stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
    stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(test_data)
    stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
    stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(test_data)
    return stats
def process(data):
    cleaned = data.lower().strip()
    original = data.strip()
    fea1 = numOfWords(cleaned)
    # fea1 = fea1 / 10
    fea2 = numOfChar(cleaned)
    # fea2 = fea2 / 100
    fea3 = count(cleaned, string.punctuation)
    fea5 = numOfContUpperCase(original)
    fea4 = textstat.gunning_fog(data)
    fea6 = textstat.automated_readability_index(data)
    fea7 = textstat.linsear_write_formula(data)
    fea8 = textstat.difficult_words(data)
    fea9 = textstat.dale_chall_readability_score(data)
    # Quote and comma characters, scaled per 1000 characters
    fea10 = sum(data.count(c) for c in "'.\",’‘”“")
    fea10 = (fea10 / len(data)) * 1000
    fea11 = sum(data.count(d) for d in "1234567890")
    fea12 = sum(data.count(c) for c in "?!@#$%&")
    fea13 = data.count(":") + data.count(";")
    fea14 = data.count("—") + data.count("-") + data.count("_")
    fea15 = (fea10 / len(data)) * 100
    fea16 = sum(data.count(c) for c in "()[]{}")
    fea17 = data.count("*") + data.count("/")
    fea18 = data.count("?")
    fea19 = (fea10 + fea11 + fea12 + fea13 + fea14 + fea15 + fea16 +
             fea17 + fea18)
    res = np.array([[fea1, fea2, fea3, fea5, fea4, fea6, fea7, fea8, fea9,
                     fea10, fea11, fea12, fea13, fea14, fea15, fea16, fea17,
                     fea18, fea19]])
    return res
def get_readability(self, corpus, type='ari'):
    readability = None
    if type == 'ari':
        readability = textstat.automated_readability_index(corpus)
    elif type == 'flesch':
        readability = textstat.flesch_reading_ease(corpus)
    elif type == 'smog':
        readability = textstat.smog_index(corpus)
    elif type == 'flesch_kincaid':
        readability = textstat.flesch_kincaid_grade(corpus)
    elif type == 'coleman':
        readability = textstat.coleman_liau_index(corpus)
    elif type == 'dale_chall':
        readability = textstat.dale_chall_readability_score(corpus)
    elif type == 'difficult_words':
        readability = textstat.difficult_words(corpus)
    elif type == 'linsear':
        readability = textstat.linsear_write_formula(corpus)
    elif type == 'gunning_fog':
        readability = textstat.gunning_fog(corpus)
    elif type == 'readability_consensus':
        readability = textstat.readability_consensus(corpus)
    return readability
def __init__(self, path):
    """Create document instance for analysis.

    Opens and reads document to string raw_text. Textract interprets the
    document format and opens to plain text string (docx, pdf, odt, txt).

    Args:
        path (str): path to file to open, analyze, close

    Public attributes:
        -user: (str) optional string to set username.
        -path: (str) relative path to document.
        -abs_path: (str) the absolute path to the document.
        -file_name: (str) the file name with extension of document (base name).
        -mime: tbd
        -guessed_type: makes best guess of mimetype of document.
        -file_type: returns index[0] from guessed_type.
        -raw_text: (str) plain text extracted from .txt, .odt, .pdf, .docx, and .doc.
        -ptext: (str) raw text after a series of regex expressions to eliminate special characters.
        -text_no_feed: (str) ptext with most new line characters eliminated; \n\n stays intact.
        -sentence_tokens: list of all sentences in a comma separated list derived by nltk.
        -sentence_count: (int) count of sentences found in list.
        -passive_sentences: list of passive sentences identified by the passive module.
        -passive_sentence_count: count of the passive_sentences list.
        -percent_passive: (float) ratio of passive sentences to all sentences in percent form.
        -be_verb_analysis: (int) sum number of occurrences of each to-be verb (am, is, are, was, were, be, being, been).
        -be_verb_count: tbd
        -weak_sentences_all: (int) sum of be verb analysis.
        -weak_sentences_set: (set) set of all sentences identified as having to-be verbs.
        -weak_sentences_count: (int) count of items in weak_sentences_set.
        -weak_verbs_to_sentences: (float) proportion of sentences with to-be verbs to all sentences in percent (this might not be sound).
        -word_tokens: list of discrete words in text that breaks contractions up (default nltk tokenizer).
        -word_tokens_no_punct: list of all words in text including contractions but otherwise no punctuation.
        -no_punct: (str) full text string without sentence punctuation.
        -word_tokens_no_punct: uses white-space tokenizer to create a list of all words.
        -readability_flesch_re: (int) Flesch Reading Ease Score (numeric score) made by textstat module.
        -readability_smog_index: (int) grade level as determined by the SMOG algorithm made by textstat module.
        -readability_flesch_kincaid_grade: (int) Flesch-Kincaid grade level of reader made by textstat module.
        -readability_coleman_liau_index: (int) grade level of reader as made by textstat module.
        -readability_ari: (int) grade level of reader determined by the Automated Readability Index algorithm implemented by textstat.
        -readability_linser_write: grade level as determined by the Linsear Write algorithm implemented by textstat (attribute name keeps its original misspelling).
        -readability_dale_chall: (int) grade level based on Dale-Chall readability as determined by textstat.
        -readability_standard: composite grade level based on readability algorithms.
        -flesch_re_key: list for interpreting Flesch RE Score.
        -word_count: word count of document based on white-space tokenizer; this word count should be used.
        -page_length: (float) page length in decimal format given 250 words per page.
        -paper_count: (int) number of printed pages given 250 words per page.
        -parts_of_speech: words with parts-of-speech tags.
        -pos_counts: values in word, tag couple grouped in a list (Counter).
        -pos_total: (int) sum of pos_counts values.
        -pos_freq: (dict) word, ratio of whole.
        -doc_pages: (float) page length based on 250 words per page (warning: this is the second time this attribute is defined).
        -freq_words: word frequency count, not standardized, based on the correct word tokenizer (not ratio, just count).
        -modal_dist: count of auxiliary verbs based on word_tokens_no_punct.

    sentence_count (int): count of the sentence tokens
    passive_sentences (list): list of all sentences identified as passive
    passive_sentence_count (int): count of items in passive_sentences
    be_verb_count (int): count of "to be" verbs in text
    word_tokens_no_punct (list): words separated, stripped of punctuation, made lower case
    flesch_re_key (str): reading ease score to description
    freq_words (list or dict): frequency distribution of all words
    modal_dist (list): frequency distribution of aux verbs
    """
    self.user = ""
    self.path = path
    self.abs_path = os.path.abspath(self.path)
    if os.path.isfile(self.path):
        self.time_stamp = self.timestamp()
        self.file_name = os.path.basename(path)
        self.mime = MimeTypes()
        self.guessed_type = self.mime.guess_type(self.path)
        self.file_type = self.guessed_type[0]
        self.raw_text = textract.process(self.path, encoding="ascii")
        self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
        self.ptext = re.sub(u"\u2014", "--", self.ptext)
        self.ptext = re.sub(",", ",", self.ptext)
        self.ptext = re.sub("—", "--", self.ptext)
        self.ptext = re.sub("…", "...", self.ptext)
        self.text_no_feed = self.clean_new_lines(self.ptext)
        self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
        self.sentence_count = len(self.sentence_tokens)
        self.passive_sentences = passive(self.text_no_feed)
        self.passive_sentence_count = len(self.passive_sentences)
        self.percent_passive = (100 * (float(self.passive_sentence_count) /
                                       float(self.sentence_count)))
        self.percent_passive_round = round(self.percent_passive, 2)
        self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
        self.be_verb_count = self.be_verb_analysis[0]
        self.weak_sentences_all = self.be_verb_analysis[1]
        self.weak_sentences_set = set(self.weak_sentences_all)
        self.weak_sentences_count = len(self.weak_sentences_set)
        self.weak_verbs_to_sentences = 100 * float(
            self.weak_sentences_count) / float(self.sentence_count)
        self.weak_verbs_to_sentences_round = round(
            self.weak_verbs_to_sentences, 2)
        self.word_tokens = self.word_tokenize(self.text_no_feed)
        self.word_tokens_no_punct = \
            self.word_tokenize_no_punct(self.text_no_feed)
        self.no_punct = self.strip_punctuation(self.text_no_feed)
        # use this! It makes the text lower case and strips symbols
        self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)
        self.readability_flesch_re = \
            textstat.flesch_reading_ease(self.text_no_feed)
        self.readability_smog_index = \
            textstat.smog_index(self.text_no_feed)
        self.readability_flesch_kincaid_grade = \
            textstat.flesch_kincaid_grade(self.text_no_feed)
        self.readability_coleman_liau_index = \
            textstat.coleman_liau_index(self.text_no_feed)
        self.readability_ari = \
            textstat.automated_readability_index(self.text_no_feed)
        self.readability_linser_write = \
            textstat.linsear_write_formula(self.text_no_feed)
        self.readability_dale_chall = \
            textstat.dale_chall_readability_score(self.text_no_feed)
        self.readability_standard = \
            textstat.text_standard(self.text_no_feed)
        self.flesch_re_desc_str = self.flesch_re_desc(
            int(textstat.flesch_reading_ease(self.text_no_feed)))
        self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
        self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
        self.avg_syllables_per_word = textstat.avg_syllables_per_word(
            self.text_no_feed)
        self.avg_sentence_per_word = textstat.avg_sentence_per_word(
            self.text_no_feed)
        self.avg_sentence_length = textstat.avg_sentence_length(
            self.text_no_feed)
        self.avg_letter_per_word = textstat.avg_letter_per_word(
            self.text_no_feed)
        self.difficult_words = textstat.difficult_words(self.text_no_feed)
        self.rand_passive = self.select_random(self.passive_sentence_count,
                                               self.passive_sentences)
        # `weak_sentences_set` assumed here; the original referenced
        # `self.weak_sentences`, which is never defined.
        self.rand_weak_sentence = self.select_random(
            len(self.weak_sentences_set), self.weak_sentences_set)
        if self.word_tokens_no_punct:
            self.word_count = len(self.word_tokens_no_punct)
            self.page_length = float(self.word_count) / float(250)
            self.paper_count = int(math.ceil(self.page_length))
            self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
            self.pos_counts = Counter(
                tag for word, tag in self.parts_of_speech)
            self.pos_total = sum(self.pos_counts.values())
            self.pos_freq = dict(
                (word, float(count) / self.pos_total)
                for word, count in self.pos_counts.items())
            self.doc_pages = float(float(self.word_count) / float(250))
            self.freq_words = \
                self.word_frequency(self.word_tokens_no_punct)
            self.modal_dist = self.modal_count(self.word_tokens_no_punct)
            # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
            self.pos_count_dict = self.pos_counts.items()
            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate('IN', self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy('RB', self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy('NNP', self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(len(self.comma_sentences),
                                                    self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
def updateData(self):
    # Full list of polarity scores
    self.polscore = self.sid.polarity_scores(self.text)

    ##### INDEX 0 IN DATA: Text Sentiment #####
    # [INDEX 0] Compound score (0.0 - 1.0)            [INDEX 1] Negative connotation rating (0.0 - 1.0)
    # [INDEX 2] Positive connotation rating (0.0 - 1.0) [INDEX 3] Neutral connotation rating (0.0 - 1.0)
    self.data.append([self.polscore['compound'], self.polscore['neg'],
                      self.polscore['pos'], self.polscore['neu']])

    ##### INDEX 1 IN DATA: Sentence Info #####
    # [INDEX 0] Sentence count        [INDEX 1] Average sentence length
    # [INDEX 2] Syllable count        [INDEX 3] Overall word count
    # [INDEX 4] Character count       [INDEX 5] Character count without spaces
    # [INDEX 6] Avg letters per word  [INDEX 7] Avg syllables per word
    self.data.append([textstat.sentence_count(self.text),
                      textstat.avg_sentence_length(self.text),
                      textstat.syllable_count(self.text),
                      len(self.splList),
                      textstat.char_count(self.text, False),
                      textstat.char_count(self.text, True),
                      textstat.avg_letter_per_word(self.text),
                      textstat.avg_syllables_per_word(self.text)])

    # Each readability entry below stores:
    # [INDEX 0] Pure score  [INDEX 1] Approximate grade  [INDEX 2] Normalized (ratio) score

    ##### INDEX 2 IN DATA: Flesch Reading Ease #####
    # SCORE SCALE: 0 - 100
    self.freRaw = textstat.flesch_reading_ease(self.text)
    self.freStat = min(max(self.freRaw, 0), 100)
    self.data.append([round(self.freStat, 3),
                      self.freGrade(self.freStat),
                      round(abs(self.freStat - 100), 2)])

    ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
    # SCORE SCALE: 0 - 18
    self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
    self.fkgStat = self.adjustScore(self.fkgRaw)
    self.data.append([round(self.fkgStat, 3),
                      self.grade(self.fkgStat),
                      round(self.fkgStat / 0.18, 2)])

    ##### INDEX 4 IN DATA: Gunning FOG Index #####
    # SCORE SCALE: 0 - 18
    self.fogRaw = textstat.gunning_fog(self.text)
    self.fogStat = self.adjustScore(self.fogRaw)
    self.data.append([round(self.fogStat, 3),
                      self.grade(self.fogStat),
                      round(self.fogStat / 0.18, 2)])

    ##### INDEX 5 IN DATA: SMOG Index #####
    # SCORE SCALE: 0 - 18
    self.smogRaw = textstat.smog_index(self.text)
    self.smogStat = self.adjustScore(self.smogRaw)
    self.data.append([round(self.smogStat, 3),
                      self.grade(self.smogStat),
                      round(self.smogStat / 0.18, 2)])

    ##### INDEX 6 IN DATA: Automated Readability Index #####
    # SCORE SCALE: 0 - 14
    self.ariRaw = textstat.automated_readability_index(self.text)
    self.ariStat = min(max(self.ariRaw, 0), 14)
    self.data.append([round(self.ariStat, 3),
                      self.ariGrade(ceil(self.ariStat)),
                      round(self.ariStat / 0.14, 2)])

    ##### INDEX 7 IN DATA: Coleman-Liau Index #####
    # SCORE SCALE: 0 - 18
    self.cliRaw = textstat.coleman_liau_index(self.text)
    self.cliStat = self.adjustScore(self.cliRaw)
    self.data.append([round(self.cliStat, 3),
                      self.grade(self.cliStat),
                      round(self.cliStat / 0.18, 2)])

    ##### INDEX 8 IN DATA: Linsear Write Index #####
    # SCORE SCALE: 0 - 18
    self.lwiRaw = textstat.linsear_write_formula(self.text)
    self.lwiStat = self.adjustScore(self.lwiRaw)
    self.data.append([round(self.lwiStat, 3),
                      self.grade(self.lwiStat),
                      round(self.lwiStat / 0.18, 2)])

    ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
    # SCORE SCALE: 0 - 10
    self.dcrRaw = textstat.dale_chall_readability_score(self.text)
    self.dcrStat = min(max(self.dcrRaw, 0), 10)
    self.data.append([round(self.dcrStat, 3),
                      self.daleChallGrade(self.dcrStat),
                      round(self.dcrStat / 0.1, 2)])

    ##### INDEX 10 IN DATA: Overall Score #####
    # SCORE SCALE: 0 - 20
    self.txtRaw = textstat.text_standard(self.text, True)
    self.txtStd = min(max(self.txtRaw, 0), 20)
    self.txtInfo = textstat.text_standard(self.text)
    self.data.append([round(self.txtStd, 3),
                      self.txtGrade(self.txtStd, self.txtInfo),
                      round(self.txtStd / 0.2, 2)])

    return self.data
def get_textstats(text):
    return (textstat.sentence_count(text),
            textstat.automated_readability_index(text),
            textstat.flesch_reading_ease(text))
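# Quick smoke test for the tuple helper above (sample string illustrative):
sents, ari, fre = get_textstats("One short sentence. And then a second one.")
print(sents, ari, fre)  # sentence count, ARI grade, Flesch Reading Ease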
# main script
if __name__ == '__main__':
    print("TextStat Comparison Script")
    print("--------------------------")
    # Read in text from the command line.
    # This needs to be fixed to deal with/escape special characters.
    textToCheck = input("Please enter the text you would like to analyse: ")
    # Read in text from a file - but what format?
    print("\n\n")
    print("Results")
    print("==============================================")
    print("==============================================\n")
    print("Syllable Count: " + str(textstat.syllable_count(textToCheck)))
    # True is the default and removes punctuation before counting
    print("Lexicon Count: " + str(textstat.lexicon_count(textToCheck)))
    print("Sentence Count: " + str(textstat.sentence_count(textToCheck)))
    print("Flesch Reading Ease formula: " + str(textstat.flesch_reading_ease(textToCheck)))
    print("Flesch-Kincaid Grade Level: " + str(textstat.flesch_kincaid_grade(textToCheck)))
    print("Fog Scale (Gunning FOG Formula): " + str(textstat.gunning_fog(textToCheck)))
    print("SMOG Index: " + str(textstat.smog_index(textToCheck)))
    print("Automated Readability Index: " + str(textstat.automated_readability_index(textToCheck)))
    print("Coleman-Liau Index: " + str(textstat.coleman_liau_index(textToCheck)))
    print("Linsear Write Formula: " + str(textstat.linsear_write_formula(textToCheck)))
    print("Dale-Chall Readability Score: " + str(textstat.dale_chall_readability_score(textToCheck)))
    print("--------------------------------------------------------------")
    print("Readability Consensus based upon all the above tests: " + str(textstat.text_standard(textToCheck)))
    print("\n\n")
# Skip tweets which are not just context-based text.
if textstat.sentence_count(tweet) < 1:
    continue

flesch_kincaid_grade = textstat.flesch_kincaid_grade(tweet)
flesch_kincaid_grades.append(flesch_kincaid_grade)
flesch_kincaid_total_grade += flesch_kincaid_grade

gunning_fog_grade = textstat.gunning_fog(tweet)
gunning_fog_grades.append(gunning_fog_grade)
gunning_fog_total_grade += gunning_fog_grade

smog_index_grade = textstat.smog_index(tweet)
smog_index_grades.append(smog_index_grade)
smog_index_total_grade += smog_index_grade

ar_index_grade = textstat.automated_readability_index(tweet)
ar_index_grades.append(ar_index_grade)
ar_index_total_grade += ar_index_grade

cl_index_grade = textstat.coleman_liau_index(tweet)
cl_index_grades.append(cl_index_grade)
cl_index_total_grade += cl_index_grade

lwf_grade = textstat.linsear_write_formula(tweet)
lwf_grades.append(lwf_grade)
lwf_total_grade += lwf_grade

dcr_grade = textstat.dale_chall_readability_score(tweet)
dcr_grades.append(dcr_grade)
dcr_total_grade += dcr_grade
# print("The SMOG Index") # print("Texts of fewer than 30 sentences are statistically invalid, " # "because the SMOG formula was normed on 30-sentence samples.") # print("textstat requires atleast 3 sentences for a result.") # print(textstat.smog_index(test_data)) print("The Flesch-Kincaid Grade") # print(textstat.flesch_kincaid_grade(test_data)) flesch_kincaid_grade = textstat.flesch_kincaid_grade(test_data) print(flesch_kincaid_grade) print("The Coleman-Liau Index") # print(textstat.coleman_liau_index(test_data)) coleman_liau_index = textstat.coleman_liau_index(test_data) print(coleman_liau_index) print("Automated Readability Index (ARI)") # print(textstat.automated_readability_index(test_data)) automated_readability_index = textstat.automated_readability_index( test_data) print(automated_readability_index) # print("Dale-Chall Readability Score") # print(textstat.dale_chall_readability_score(test_data)) print("Linsear Write Formula") # print(textstat.linsear_write_formula(test_data)) linsear_write_formula = textstat.linsear_write_formula(test_data) print(linsear_write_formula) print("The Fog Scale (Gunning FOG Formula)") # print(textstat.gunning_fog(test_data)) gunning_fog = textstat.gunning_fog(test_data) print(gunning_fog) print( "---------------------------------Summary----------------------------------" )
#!/bin/python
import sys
import string
import os
from textstat.textstat import textstat

inputfile = ''
test_data = ""
script_name = sys.argv[0]
inputfile = sys.argv[1]
with open(inputfile) as myfile:
    test_data = "".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(var1 + ',' + var2 + ',' + var3 + ',' + var4 + ',' + var5 + ',' + var6 +
      ',' + var7 + ',' + var8 + ',' + var9 + ',' + var10 + ',' + var11 +
      ',' + var12 + ',' + var13)
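# Invocation sketch for the script above (file names are illustrative):
#
#   $ python readability_csv.py essay.txt
#
# prints one CSV line: the individual test scores, the readability consensus,
# then the syllable, lexicon and sentence counts.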
def extract_features_sub(text, dialogue=True):
    ## aggregate all dialogue, action
    # scenes = format_script(file_name)
    if len(text) > 0:
        try:
            language_complexity = {
                'flesch_reading_ease': textstat.flesch_reading_ease(text),
                'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
                'automated_readability_index': textstat.automated_readability_index(text)}
        except Exception:
            language_complexity = {
                'flesch_reading_ease': None,
                'flesch_kincaid_grade': None,
                'automated_readability_index': None}
    else:
        # badD.write(movie_name + "\n")
        language_complexity = {
            'flesch_reading_ease': 0,
            'flesch_kincaid_grade': 0,
            'automated_readability_index': 0}
    lexical_diversity = find_lex_d(text)
    sentiment = extract_senti_wordnet(text)
    inquirer_features = general_inquirer_features(text)
    final_features = {}
    final_features.update(language_complexity)
    final_features.update(lexical_diversity)
    final_features.update(sentiment)
    final_features.update(inquirer_features)
    curr_keys = [feature for feature in final_features]
    # Suffix every feature name with its source ("dialogue" or "action").
    if dialogue:
        new_keys = [feature + "_dialogue" for feature in final_features]
    else:
        new_keys = [feature + "_action" for feature in final_features]
    return convert(final_features, dict(zip(curr_keys, new_keys)))