import language_check
from textstat.textstat import textstat


def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using several
    readability scores and methods. Also calculates other factors, such as
    the ratio of typos to words.

    @param article_contents: a string containing the contents of an article
    @return language_analysis_dict: a dictionary containing the computed
        readability and error metrics
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading": textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid": textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau": textstat.coleman_liau_index(article_contents),
        "typos_to_words": len(tool.check(article_contents))
        / textstat.lexicon_count(article_contents),
        "percent_difficult_words": textstat.difficult_words(article_contents)
        / textstat.lexicon_count(article_contents),
    }
    return language_analysis_dict
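# A minimal hardening sketch (an addition, not from the original source):
# textstat.lexicon_count() returns 0 for empty or whitespace-only input, so
# the two ratio features in get_text_features() would raise
# ZeroDivisionError. One hedged way to guard the call:
def safe_text_features(article_contents):
    word_count = textstat.lexicon_count(article_contents)
    if word_count == 0:
        return None  # or a dict of sentinel values, depending on the caller
    return get_text_features(article_contents)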
def readability(text):
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print(str(textstat.flesch_reading_ease(text)) + "\n")
    print("Smog Index\n________________________\n\n")
    print(str(textstat.smog_index(text)) + "\n")
    print("Flesch Kincaid Grade\n________________________\n\n")
    print(str(textstat.flesch_kincaid_grade(text)) + "\n")
    print("Coleman Liau Index\n________________________\n\n")
    print(str(textstat.coleman_liau_index(text)) + "\n")
    print("ARI\n________________________\n\n")
    print(str(textstat.automated_readability_index(text)) + "\n")
    print("Dale Chall\n________________________\n\n")
    print(str(textstat.dale_chall_readability_score(text)) + "\n")
    print("Difficult Words\n________________________\n\n")
    print(str(textstat.difficult_words(text)) + "\n")
    print("Linsear Write Formula\n________________________\n\n")
    print(str(textstat.linsear_write_formula(text)) + "\n")
    print("Gunning Fog\n________________________\n\n")
    print(str(textstat.gunning_fog(text)) + "\n")
    compiled = textstat.text_standard(text)
    print("Compiled Score\n_____________________________\n\n")
    print(compiled + "\n")
    # return the compiled grade-level consensus
    return compiled
def get_readability(df2):
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    # Add one column per readability metric for each text column.
    for i, col in enumerate(text_feats):
        df['flesch_reading_ease{}'.format(i)] = df[col].apply(textstat.flesch_reading_ease)
        df['smog_index{}'.format(i)] = df[col].apply(textstat.smog_index)
        df['flesch_kincaid_grade{}'.format(i)] = df[col].apply(textstat.flesch_kincaid_grade)
        df['coleman_liau_index{}'.format(i)] = df[col].apply(textstat.coleman_liau_index)
        df['automated_readability_index{}'.format(i)] = df[col].apply(textstat.automated_readability_index)
        df['dale_chall_readability_score{}'.format(i)] = df[col].apply(textstat.dale_chall_readability_score)
        df['difficult_words{}'.format(i)] = df[col].apply(textstat.difficult_words)
        df['linsear_write_formula{}'.format(i)] = df[col].apply(textstat.linsear_write_formula)
        df['gunning_fog{}'.format(i)] = df[col].apply(textstat.gunning_fog)
        df['text_standard{}'.format(i)] = df[col].apply(textstat.text_standard)
    return df
def features_tokenized_difficultword_title(cls, row):
    data = row['review_title']
    token = cls.tokenizer.tokenize(data)
    if token is None or len(token) == 0:
        return -1
    return textstat.difficult_words(row['review_title'])
def vecify(v):
    return [ts.flesch_reading_ease(v),
            # ts.smog_index(v),
            ts.flesch_kincaid_grade(v),
            ts.coleman_liau_index(v),
            ts.automated_readability_index(v),
            ts.dale_chall_readability_score(v),
            ts.difficult_words(v),
            ts.linsear_write_formula(v),
            ts.gunning_fog(v)]
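# Usage sketch (assumed, not from the original): vecify() maps one document to
# a fixed-length feature vector, so a corpus becomes a matrix suitable for
# scikit-learn. `docs` is a hypothetical list of strings.
import numpy as np

X = np.array([vecify(doc) for doc in docs])  # shape: (len(docs), 8)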
def textstat_analysis(profile_text):
    fre = textstat.flesch_reading_ease(profile_text)
    smog = textstat.smog_index(profile_text)
    fkg = textstat.flesch_kincaid_grade(profile_text)
    coleman = textstat.coleman_liau_index(profile_text)
    ari = textstat.automated_readability_index(profile_text)
    dale = textstat.dale_chall_readability_score(profile_text)
    dw = textstat.difficult_words(profile_text)
    lwf = textstat.linsear_write_formula(profile_text)
    gf = textstat.gunning_fog(profile_text)
    rc = textstat.readability_consensus(profile_text)
    word_count = textstat.lexicon_count(profile_text)
    return (fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count)
def reading_difficulty(self):
    diff_words = textstat.difficult_words(self.text) / self.nword
    flesch_kincaid = textstat.flesch_kincaid_grade(self.text)
    coleman_liau = textstat.coleman_liau_index(self.text)
    ari = textstat.automated_readability_index(self.text)
    dale_chall = textstat.dale_chall_readability_score(self.text)
    linsear = textstat.linsear_write_formula(self.text)
    gunning_fog = textstat.gunning_fog(self.text) - 6
    smog = textstat.smog_index(self.text)
    # Average the seven grade-level scores; max() floors the result at 12.
    avg_grade = max(
        math.ceil((flesch_kincaid + coleman_liau + ari + dale_chall +
                   linsear + gunning_fog + smog) / 7), 12)
    return avg_grade, diff_words
def get_readability(contents):
    return [
        textstat.flesch_reading_ease(contents),
        textstat.smog_index(contents),
        textstat.flesch_kincaid_grade(contents),
        textstat.automated_readability_index(contents),
        textstat.dale_chall_readability_score(contents),
        textstat.difficult_words(contents),
        textstat.linsear_write_formula(contents),
        textstat.gunning_fog(contents),
        textstat.coleman_liau_index(contents),
        textstat.text_standard(contents),
    ]
def features_linsear_title(cls, row):
    data = row['review_title']
    token = cls.tokenizer.tokenize(data)
    if token is None or len(token) == 0:
        return 0
    # Alternatives tried, with their observed correlations:
    # val = textstat.smog_index(row['review_title'])                    # -0.00216594769872
    # val = textstat.dale_chall_readability_score(row['review_title'])  #  0.025131347883
    val = textstat.difficult_words(row['review_title'])                 #  0.0363778452286
    # val = textstat.linsear_write_formula(row['review_title'])         # -0.00557553587525
    # val = textstat.gunning_fog(row['review_title'])                   #  0.0202643684371
    # val = textstat.flesch_reading_ease(row['review_title'])           # -0.0207657385707
    # val = textstat.automated_readability_index(row['review_title'])   #  0.0142244017091
    return val if val is not None else 0
def main():
    for arg in sys.argv[1:]:
        with open(arg) as f:
            text = f.read()
        with open(arg + '.readability.snip', 'w') as f:
            f.write("syllable_count : %s\n" % textstat.syllable_count(text))
            f.write("lexicon_count : %s\n" % textstat.lexicon_count(text))
            f.write("sentence_count : %s\n" % textstat.sentence_count(text))
            f.write("difficult_words : %s\n" % textstat.difficult_words(text))
            f.write("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
            f.write("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
            f.write("smog_index : %s\n" % textstat.smog_index(text))
            f.write("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
            f.write("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
            f.write("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
            f.write("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
def features_difficultword_content(cls, row):
    data = row['review_content']
    token = cls.tokenizer.tokenize(data)
    if token is None or len(token) == 0:
        return 0
    # Alternatives tried, with their observed correlations:
    # val = textstat.smog_index(row['review_content'])                    #  0.0970957994444
    # val = textstat.dale_chall_readability_score(row['review_content'])  #  0.0655506963852
    val = textstat.difficult_words(row['review_content'])                 #  0.119689698366
    # val = textstat.linsear_write_formula(row['review_content'])         #  0.0393165149095
    # val = textstat.gunning_fog(row['review_content'])                   #  0.064893772836
    # val = textstat.flesch_reading_ease(row['review_content'])           # -0.000962802863895
    # val = textstat.automated_readability_index(row['review_content'])   #  0.0206780263383
    return val if val is not None else 0
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
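# A hedged refactoring sketch: returning fourteen positional values is easy to
# misorder at the call site. A dict keeps each metric labeled; this is an
# alternative shape, not the original API.
def scores_cal_dict(text):
    return {
        "char_count": textstat.char_count(text, ignore_spaces=True),
        "lexicon_count": textstat.lexicon_count(text, removepunct=True),
        "syllable_count": textstat.syllable_count(text),
        "sentence_count": textstat.sentence_count(text),
        "avg_sentence_length": textstat.avg_sentence_length(text),
        "avg_syllables_per_word": textstat.avg_syllables_per_word(text),
        "avg_letter_per_word": textstat.avg_letter_per_word(text),
        "avg_sentence_per_word": textstat.avg_sentence_per_word(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "smog_index": textstat.smog_index(text),
        "gunning_fog": textstat.gunning_fog(text),
        "difficult_words": textstat.difficult_words(text),
        "dale_chall": textstat.dale_chall_readability_score(text),
        "polysyllabcount": textstat.polysyllabcount(text),
    }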
def run_textstat(text):
    # text = """Playing games has always been thought to be important to the
    # development of well-balanced and creative children; however, what part,
    # if any, they should play in the lives of adults has never been
    # researched that deeply. I believe that playing games is every bit as
    # important for adults as for children. Not only is taking time out to
    # play games with our children and other adults valuable to building
    # interpersonal relationships but is also a wonderful way to release
    # built up tension."""
    ts_flesch_reading_ease = textstat.flesch_reading_ease(text)
    ts_smog_index = textstat.smog_index(text)
    ts_flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    ts_coleman_liau_index = textstat.coleman_liau_index(text)
    ts_automated_readability_index = textstat.automated_readability_index(text)
    ts_dale_chall_readability_score = textstat.dale_chall_readability_score(text)
    ts_difficult_words = textstat.difficult_words(text)
    ts_linsear_write_formula = textstat.linsear_write_formula(text)
    ts_gunning_fog = textstat.gunning_fog(text)
    ts_text_standard = textstat.text_standard(text)
    return (ts_flesch_reading_ease, ts_smog_index, ts_flesch_kincaid_grade,
            ts_coleman_liau_index, ts_automated_readability_index,
            ts_dale_chall_readability_score, ts_difficult_words,
            ts_linsear_write_formula, ts_gunning_fog, ts_text_standard)
def lambda_handler(event, context):
    text = event['text']
    response = {}
    response['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    response['smog_index'] = textstat.smog_index(text)
    response['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    response['coleman_liau_index'] = textstat.coleman_liau_index(text)
    response['automated_readability_index'] = textstat.automated_readability_index(text)
    response['dale_chall_readability_score'] = textstat.dale_chall_readability_score(text)
    response['difficult_words'] = textstat.difficult_words(text)
    response['linsear_write_formula'] = textstat.linsear_write_formula(text)
    response['gunning_fog'] = textstat.gunning_fog(text)
    response['text_standard'] = textstat.text_standard(text)
    return respond(None, response)
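# Invocation sketch (assumed): the handler only reads event['text'], and
# `respond` is defined elsewhere in the original source, so a local test can
# look like this.
event = {"text": "The quick brown fox jumps over the lazy dog."}
result = lambda_handler(event, context=None)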
def get_feat_readability_metrics(self):
    # https://github.com/shivam5992/textstat
    try:
        test_data = self.webscrap.get_body()
        out = []
        out.append(textstat.flesch_reading_ease(test_data))
        out.append(textstat.smog_index(test_data))
        out.append(textstat.flesch_kincaid_grade(test_data))
        out.append(textstat.coleman_liau_index(test_data))
        out.append(textstat.automated_readability_index(test_data))
        out.append(textstat.dale_chall_readability_score(test_data))
        out.append(textstat.difficult_words(test_data))
        out.append(textstat.linsear_write_formula(test_data))
        out.append(textstat.gunning_fog(test_data))
        # out.append(textstat.text_standard(test_data))
        return out, False
    except Exception as e:
        config.logger.error(repr(e))
        return MISSING_FEATURE * 9, True
def feature_readability(essay):
    syllable_count = textstat.syllable_count(essay)  # number of syllables
    flesch_reading_ease = textstat.flesch_reading_ease(essay)  # readability score between 0 and 100
    smog_index = textstat.smog_index(essay)  # SMOG index: a more precise, easier-to-compute readability measure
    flesch_kincaid_index = textstat.flesch_kincaid_grade(essay)  # grade-level score
    coleman_liau_index = textstat.coleman_liau_index(essay)  # grade level of the text
    automated_readability_index = textstat.automated_readability_index(essay)  # approximates the grade needed to comprehend the text
    dale_chall_readability_score = textstat.dale_chall_readability_score(essay)  # grade level based on the most common English words
    difficult_words = textstat.difficult_words(essay)
    linsear_write_formula = textstat.linsear_write_formula(essay)  # grade level of the text
    gunning_fog = textstat.gunning_fog(essay)  # fog index: reading difficulty of the text
    return (syllable_count, flesch_reading_ease, smog_index,
            flesch_kincaid_index, coleman_liau_index,
            automated_readability_index, dale_chall_readability_score,
            difficult_words, linsear_write_formula, gunning_fog)
def calculate_readability_measures(id):
    """ Count the words in doc and update the document. """
    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    # count = len(source['content'].split())
    try:
        measures = {
            'flesch': textstat.flesch_reading_ease(source['content']),
            'smog': textstat.smog_index(source['content']),
            'flesch_kincaid': textstat.flesch_kincaid_grade(source['content']),
            'coleman_liau': textstat.coleman_liau_index(source['content']),
            'readability': textstat.automated_readability_index(source['content']),
            'dale_chall': textstat.dale_chall_readability_score(source['content']),
            'difficult_words': textstat.difficult_words(source['content']),
            'linsear_write_formula': textstat.linsear_write_formula(source['content']),
            'gunning_fog': textstat.gunning_fog(source['content']),
            'consensus': textstat.readability_consensus(source['content']),
        }
        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception as err:
        pass
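# Compatibility sketch (an assumption about textstat's history; verify against
# your installed version): older releases exposed readability_consensus(),
# which newer releases renamed to text_standard(). A shim lets the function
# above run on either:
def consensus(text):
    if hasattr(textstat, "readability_consensus"):
        return textstat.readability_consensus(text)
    return textstat.text_standard(text)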
def process(data):
    cleaned = data.lower().strip()
    original = data.strip()
    fea1 = numOfWords(cleaned)   # fea1 = fea1 / 10
    fea2 = numOfChar(cleaned)    # fea2 = fea2 / 100
    fea3 = count(cleaned, string.punctuation)
    fea5 = numOfContUpperCase(original)
    fea4 = textstat.gunning_fog(data)
    fea6 = textstat.automated_readability_index(data)
    fea7 = textstat.linsear_write_formula(data)
    fea8 = textstat.difficult_words(data)
    fea9 = textstat.dale_chall_readability_score(data)
    # Quote/punctuation density per 1000 characters.
    fea10 = (data.count("\'") + data.count(".") + data.count("\"") +
             data.count(",") + data.count("’") + data.count("‘") +
             data.count("”") + data.count("“"))
    fea10 = (fea10 / len(data)) * 1000
    fea11 = sum(data.count(d) for d in "1234567890")
    fea12 = sum(data.count(c) for c in "?!@#$%&")
    fea13 = data.count(":") + data.count(";")
    fea14 = data.count("—") + data.count("-") + data.count("_")
    fea15 = (fea10 / len(data)) * 100
    fea16 = sum(data.count(c) for c in "()[]{}")
    fea17 = data.count("*") + data.count("/")
    fea18 = data.count("?")
    fea19 = (fea10 + fea11 + fea12 + fea13 + fea14 + fea15 + fea16 +
             fea17 + fea18)
    res = np.array([[fea1, fea2, fea3, fea5, fea4, fea6, fea7, fea8, fea9,
                     fea10, fea11, fea12, fea13, fea14, fea15, fea16, fea17,
                     fea18, fea19]])
    return res
def get_readability(self, corpus, type='ari'):
    readability = None
    if type == 'ari':
        readability = textstat.automated_readability_index(corpus)
    elif type == 'flesch':
        readability = textstat.flesch_reading_ease(corpus)
    elif type == 'smog':
        readability = textstat.smog_index(corpus)
    elif type == 'flesch_kincaid':
        readability = textstat.flesch_kincaid_grade(corpus)
    elif type == 'coleman':
        readability = textstat.coleman_liau_index(corpus)
    elif type == 'dale_chall':
        readability = textstat.dale_chall_readability_score(corpus)
    elif type == 'difficult_words':
        readability = textstat.difficult_words(corpus)
    elif type == 'linsear':
        readability = textstat.linsear_write_formula(corpus)
    elif type == 'gunning_fog':
        readability = textstat.gunning_fog(corpus)
    elif type == 'readability_consensus':
        readability = textstat.readability_consensus(corpus)
    return readability
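# A hedged alternative to the if/elif chain above: a dict dispatch table keeps
# metric names next to their functions and makes unknown kinds return None
# explicitly. A sketch only; the keys mirror the branch labels above.
READABILITY_FUNCS = {
    'ari': textstat.automated_readability_index,
    'flesch': textstat.flesch_reading_ease,
    'smog': textstat.smog_index,
    'flesch_kincaid': textstat.flesch_kincaid_grade,
    'coleman': textstat.coleman_liau_index,
    'dale_chall': textstat.dale_chall_readability_score,
    'difficult_words': textstat.difficult_words,
    'linsear': textstat.linsear_write_formula,
    'gunning_fog': textstat.gunning_fog,
}

def readability_metric(corpus, kind='ari'):
    func = READABILITY_FUNCS.get(kind)
    return func(corpus) if func else None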
def stats(self, text):
    test_data = text
    stats = {}
    stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
    stats['smog'] = textstat.smog_index(test_data)
    stats['flesch_kincaid'] = textstat.flesch_kincaid_grade(test_data)
    stats['coleman_liau'] = textstat.coleman_liau_index(test_data)
    stats['automated'] = textstat.automated_readability_index(test_data)
    stats['dale_chall'] = textstat.dale_chall_readability_score(test_data)
    stats['difficult'] = textstat.difficult_words(test_data)
    stats['linsear'] = textstat.linsear_write_formula(test_data)
    stats['gunning_fog'] = textstat.gunning_fog(test_data)
    stats['standard'] = textstat.text_standard(test_data)
    stats['char_count'] = textstat.char_count(test_data)
    stats['lexicon_count'] = textstat.lexicon_count(test_data)
    stats['syllable_count'] = textstat.syllable_count(test_data)
    stats['sentence_count'] = textstat.sentence_count(test_data)
    stats['avg_sentence_length'] = textstat.avg_sentence_length(test_data)
    stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(test_data)
    stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
    stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(test_data)
    return stats
def __init__(self, path):
    """
    Create document instance for analysis.

    Opens and reads document to string raw_text. Textract interprets the
    document format and opens to plain text string (docx, pdf, odt, txt).

    Args:
        path (str): path to file to open, analyze, close

    Public attributes:
    -user: (str) optional string to set username.
    -path: (str) relative path to document.
    -abs_path: (str) the absolute path to the document.
    -file_name: (str) the file name with extension of document (base name).
    -mime: tbd
    -guessed_type: makes best guess of mimetype of document.
    -file_type: returns index[0] from guessed_type.
    -raw_text: (str) plain text extracted from .txt, .odt, .pdf, .docx,
     and .doc.
    -ptext: (str) raw text after a series of regex expressions to eliminate
     special characters.
    -text_no_feed: (str) ptext with most new line characters eliminated;
     \\n\\n stays intact.
    -sentence_tokens: list of all sentences in a comma separated list
     derived by nltk.
    -sentence_count: (int) count of sentences found in list.
    -passive_sentences: list of passive sentences identified by the passive
     module.
    -passive_sentence_count: count of the passive_sentences list.
    -percent_passive: (float) ratio of passive sentences to all sentences
     in percent form.
    -be_verb_analysis: (int) sum number of occurrences of each to-be verb
     (am, is, are, was, were, be, being, been).
    -be_verb_count: tbd
    -weak_sentences_all: (int) sum of be verb analysis.
    -weak_sentences_set: (set) set of all sentences identified as having
     to-be verbs.
    -weak_sentences_count: (int) count of items in weak_sentences_set.
    -weak_verbs_to_sentences: (float) proportion of sentences with to-be
     verbs to all sentences in percent (this might not be sound).
    -word_tokens: list of discrete words in text that breaks contractions
     up (default nltk tokenizer).
    -word_tokens_no_punct: list of all words in text including contractions
     but otherwise no punctuation.
    -no_punct: (str) full text string without sentence punctuation.
    -word_tokens_no_punct: uses white-space tokenizer to create a list of
     all words.
    -readability_flesch_re: (int) Flesch Reading Ease Score (numeric score)
     made by textstat module.
    -readability_smog_index: (int) grade level as determined by the SMOG
     algorithm made by textstat module.
    -readability_flesch_kincaid_grade: (int) Flesch-Kincaid grade level of
     reader made by textstat module.
    -readability_coleman_liau_index: (int) grade level of reader as made by
     textstat module.
    -readability_ari: (int) grade level of reader determined by automated
     readability index algorithm implemented by textstat.
    -readability_linsear_write: grade level as determined by the Linsear
     Write algorithm implemented by textstat.
    -readability_dale_chall: (int) grade level based on Dale-Chall
     readability as determined by textstat.
    -readability_standard: composite grade level based on readability
     algorithms.
    -flesch_re_key: list for interpreting Flesch RE Score.
    -word_count: word count of document based on white-space tokenizer;
     this word count should be used.
    -page_length: (float) page length in decimal format given 250 words
     per page.
    -paper_count: (int) number of printed pages given 250 words per page.
    -parts_of_speech: words with parts of speech tags.
    -pos_counts: values in word, tag couple grouped in a list (Counter).
    -pos_total: (int) sum of pos_counts values
    -pos_freq: (dict) word, ratio of whole
    -doc_pages: (float) page length based on 250 words per page (warning,
     this is the second time this attribute is defined).
    -freq_words: word frequency count not standardized based on the correct
     word tokenizer (not ratio, just count).
    modal_dist: count of auxiliary verbs based on word_tokens_no_punct.
    sentence_count (int): Count the sentence tokens
    passive_sentences (list): List of all sentences identified as passive
    passive_sentence_count (int): count of items in passive_sentences
    be_verb_count (int): count "to be" verbs in text
    word_tokens_no_punct (list): words separated, stripped of punctuation,
     made lower case
    flesch_re_key (str): reading ease score to description
    freq_words (list or dict): frequency distribution of all words
    modal_dist (list): frequency distribution of aux verbs
    """
    self.user = ""
    self.path = path
    self.abs_path = os.path.abspath(self.path)
    if os.path.isfile(self.path):
        self.time_stamp = self.timestamp()
        self.file_name = os.path.basename(path)
        self.mime = MimeTypes()
        self.guessed_type = self.mime.guess_type(self.path)
        self.file_type = self.guessed_type[0]
        self.raw_text = textract.process(self.path, encoding="ascii")
        self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
        self.ptext = re.sub(u"\u2014", "--", self.ptext)
        self.ptext = re.sub(",", ",", self.ptext)
        self.ptext = re.sub("—", "--", self.ptext)
        self.ptext = re.sub("…", "...", self.ptext)
        self.text_no_feed = self.clean_new_lines(self.ptext)
        self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
        self.sentence_count = len(self.sentence_tokens)
        self.passive_sentences = passive(self.text_no_feed)
        self.passive_sentence_count = len(self.passive_sentences)
        self.percent_passive = (100 *
                                (float(self.passive_sentence_count) /
                                 float(self.sentence_count)))
        self.percent_passive_round = round(self.percent_passive, 2)
        self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
        self.be_verb_count = self.be_verb_analysis[0]
        self.weak_sentences_all = self.be_verb_analysis[1]
        self.weak_sentences_set = set(self.weak_sentences_all)
        self.weak_sentences_count = len(self.weak_sentences_set)
        self.weak_verbs_to_sentences = 100 * float(
            self.weak_sentences_count) / float(self.sentence_count)
        self.weak_verbs_to_sentences_round = round(
            self.weak_verbs_to_sentences, 2)
        self.word_tokens = self.word_tokenize(self.text_no_feed)
        self.word_tokens_no_punct = \
            self.word_tokenize_no_punct(self.text_no_feed)
        self.no_punct = self.strip_punctuation(self.text_no_feed)
        # use this! It makes lowercase and strips symbols
        self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)
        self.readability_flesch_re = \
            textstat.flesch_reading_ease(self.text_no_feed)
        self.readability_smog_index = \
            textstat.smog_index(self.text_no_feed)
        self.readability_flesch_kincaid_grade = \
            textstat.flesch_kincaid_grade(self.text_no_feed)
        self.readability_coleman_liau_index = \
            textstat.coleman_liau_index(self.text_no_feed)
        self.readability_ari = \
            textstat.automated_readability_index(self.text_no_feed)
        self.readability_linsear_write = \
            textstat.linsear_write_formula(self.text_no_feed)
        self.readability_dale_chall = \
            textstat.dale_chall_readability_score(self.text_no_feed)
        self.readability_standard = \
            textstat.text_standard(self.text_no_feed)
        self.flesch_re_desc_str = self.flesch_re_desc(
            int(textstat.flesch_reading_ease(self.text_no_feed)))
        self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
        self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
        self.avg_syllables_per_word = textstat.avg_syllables_per_word(
            self.text_no_feed)
        self.avg_sentence_per_word = textstat.avg_sentence_per_word(
            self.text_no_feed)
        self.avg_sentence_length = textstat.avg_sentence_length(
            self.text_no_feed)
        self.avg_letter_per_word = textstat.avg_letter_per_word(
            self.text_no_feed)
        self.difficult_words = textstat.difficult_words(self.text_no_feed)
        self.rand_passive = self.select_random(self.passive_sentence_count,
                                               self.passive_sentences)
        self.rand_weak_sentence = self.select_random(
            len(self.weak_sentences_all), self.weak_sentences_all)
        if self.word_tokens_no_punct:
            self.word_count = len(self.word_tokens_no_punct)
            self.page_length = float(self.word_count) / float(250)
            self.paper_count = int(math.ceil(self.page_length))
            self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
            self.pos_counts = Counter(
                tag for word, tag in self.parts_of_speech)
            self.pos_total = sum(self.pos_counts.values())
            self.pos_freq = dict(
                (word, float(count) / self.pos_total)
                for word, count in self.pos_counts.items())
            self.doc_pages = float(float(self.word_count) / float(250))
            self.freq_words = \
                self.word_frequency(self.word_tokens_no_punct)
            self.modal_dist = self.modal_count(self.word_tokens_no_punct)
            # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
            self.pos_count_dict = self.pos_counts.items()
            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate(
                'IN', self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy(
                'RB', self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy(
                'NNP', self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(
                len(self.comma_sentences), self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
sf['FRE_text'] = sf['content'].apply(lambda x: textstat.flesch_reading_ease(x))
sf['FRE_tagged_text'] = sf['FRE_text'].apply(
    lambda x: 1 if 90 <= x < 100 else
              2 if 80 <= x < 90 else
              3 if 70 <= x < 80 else
              4 if 60 <= x < 70 else
              5 if 50 <= x < 60 else
              6 if 30 <= x < 50 else 7)
sf['FK_text'] = sf['content'].apply(
    lambda x: int(textstat.flesch_kincaid_grade(x)))
sf['GFI_text'] = sf['content'].apply(lambda x: textstat.gunning_fog(x))
sf['SMI_text'] = sf['content'].apply(lambda x: textstat.smog_index(x))
sf['CLI_text'] = sf['content'].apply(lambda x: textstat.coleman_liau_index(x))
sf['ARI_text'] = sf['content'].apply(
    lambda x: int(textstat.automated_readability_index(x)))
sf['DC_text'] = sf['content'].apply(
    lambda x: textstat.dale_chall_readability_score(x))
sf['Difficult_text_wc'] = sf['content'].apply(
    lambda x: textstat.difficult_words(x))

# Hand-picked quantitative features - # of percentage occurrences
percent_pattern = re.compile(r'((?:|0|[1-9]\d\d?)(?:\.\d{1,3})?)%')
sf['Percent_occurrences'] = sf['content'].apply(
    lambda x: len(percent_pattern.findall(x)))

# Polarity feature extraction from news headlines
sf['Polarity_head'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
sf['Subjectivity_head'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
sf['Positive_head_wc'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Positive'])
sf['Negative_head_wc'] = sf['title'].apply(
    lambda x: lm.get_score(lm.tokenize(x))['Negative'])
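# An equivalent, hedged rewrite of the FRE banding lambda above using the
# standard-library bisect module. Note the original chain sends scores of 100
# or above to band 7, because every branch requires x < 100; this version
# treats anything >= 90 as band 1 instead, which may or may not be the intent.
from bisect import bisect_right

FRE_THRESHOLDS = [30, 50, 60, 70, 80, 90]
FRE_BANDS = [7, 6, 5, 4, 3, 2, 1]

def fre_band(score):
    return FRE_BANDS[bisect_right(FRE_THRESHOLDS, score)]

# sf['FRE_tagged_text'] = sf['FRE_text'].apply(fre_band)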
def predict_relevance(df):
    # Loading data into SFrame
    df[[a for a in df.columns.values]] = df[[a for a in df.columns.values]].astype(str)
    tf = gl.SFrame(data=df)
    tf = tf.unique()

    # Loading LDA model for topic modeling, the pysentiment module for
    # financial sentiment analysis, and the relevance prediction model
    lda = models.ldamodel.LdaModel.load('lda1.model')
    lm = py.LM()
    model = gl.load_model('relevance_model_64feat')

    # Building the LDA model using news articles
    tf['tokens'] = tf['content'].apply(lambda x: dc.tokenize_doc(x, 'STEM'))
    tokens_text = [
        unicode('|'.join(i), errors='replace').split('|') for i in tf['tokens']
    ]
    dictionary = corpora.Dictionary(tokens_text)
    corpus = [dictionary.doc2bow(text) for text in tokens_text]
    ldamat = lda[corpus]

    # Building LDA topic arrays per topic
    topic_arrays = np.zeros((30, len(ldamat)))
    for i, x in enumerate(ldamat):
        for topic_no, contrib in x:
            topic_arrays[topic_no, i] = contrib

    # Adding LDA topic arrays as feature columns as 'Tx'
    for i, x in enumerate(topic_arrays):
        tf['T' + str(i)] = gl.SArray(data=x, dtype=float)

    # Polarity feature extraction from content of news articles
    tf['Polarity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_text_wc'] = tf['content'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_text_rate'] = tf['Negative_text_wc'] / tf['Total_text_wc']
    tf['Positive_text_rate'] = tf['Positive_text_wc'] / tf['Total_text_wc']
    tf['Max_Polarity'] = tf['content'].apply(lambda x: max(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Min_Polarity'] = tf['content'].apply(lambda x: min(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Sentences_wc'] = tf['content'].apply(lambda x: len(sent_tokenize(x)))
    tf['Positive_sentrate'] = tf['Positive_text_wc'] / tf['Sentences_wc']
    tf['Negative_sentrate'] = tf['Negative_text_wc'] / tf['Sentences_wc']

    # Readability feature extraction from content of news articles
    tf['FRE_text'] = tf['content'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    tf['FRE_tagged_text'] = tf['FRE_text'].apply(
        lambda x: 1 if 90 <= x < 100 else
                  2 if 80 <= x < 90 else
                  3 if 70 <= x < 80 else
                  4 if 60 <= x < 70 else
                  5 if 50 <= x < 60 else
                  6 if 30 <= x < 50 else 7)
    tf['FK_text'] = tf['content'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_text'] = tf['content'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_text'] = tf['content'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_text'] = tf['content'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_text'] = tf['content'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_text'] = tf['content'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_text_wc'] = tf['content'].apply(
        lambda x: textstat.difficult_words(x))

    # Hand-picked quantitative features - # of percentage occurrences
    percent_pattern = re.compile(r'((?:|0|[1-9]\d\d?)(?:\.\d{1,3})?)%')
    tf['Percent_occurrences'] = tf['content'].apply(
        lambda x: len(percent_pattern.findall(x)))

    # Polarity feature extraction from news headlines
    tf['Polarity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_head_wc'] = tf['title'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_head_rate'] = tf['Negative_head_wc'] / tf['Total_head_wc']
    tf['Positive_head_rate'] = tf['Positive_head_wc'] / tf['Total_head_wc']

    # Readability feature extraction from news headlines
    tf['FRE_head'] = tf['title'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    tf['FRE_tagged_head'] = tf['FRE_head'].apply(
        lambda x: 1 if 90 <= x < 100 else
                  2 if 80 <= x < 90 else
                  3 if 70 <= x < 80 else
                  4 if 60 <= x < 70 else
                  5 if 50 <= x < 60 else
                  6 if 30 <= x < 50 else 7)
    tf['FK_head'] = tf['title'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_head'] = tf['title'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_head'] = tf['title'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_head'] = tf['title'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_head'] = tf['title'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_head'] = tf['title'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_head_wc'] = tf['title'].apply(
        lambda x: textstat.difficult_words(x))

    # Predicting relevance class using these features in sorted order of
    # confidence
    tf = tf.add_row_number()
    pred = model.classify(tf)
    pred = pred.add_row_number()
    relevant = pred[pred['class'] == 1]
    non_relevant = pred[pred['class'] == 0]
    if relevant.num_rows() > 10:
        relevant_news_out = tf.join(relevant).sort('probability',
                                                   ascending=False)[:10]
    else:
        relevant_news = relevant.sort('probability', ascending=False)
        req_num_non_relevant_news = 10 - relevant.num_rows()
        non_relevant_news = non_relevant.sort(
            'probability')[:req_num_non_relevant_news]
        relevant_news = relevant_news.append(non_relevant_news)
        relevant_news_out = tf.join(relevant_news)
    return relevant_news_out
    training_essays_df.set_value(
        i, "flesch_kincaid_grade",
        textstat.flesch_kincaid_grade(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "coleman_liau_index",
        textstat.coleman_liau_index(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "automated_readability_index",
        textstat.automated_readability_index(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "dale_chall_readability_score",
        textstat.dale_chall_readability_score(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "difficult_words",
        textstat.difficult_words(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "linsear_write_formula",
        textstat.linsear_write_formula(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "gunning_fog",
        textstat.gunning_fog(training_essays_df.iloc[i]['essay']))
    training_essays_df.set_value(
        i, "text_standard",
        textstat.text_standard(training_essays_df.iloc[i]['essay']))

writer = pd.ExcelWriter(sys.argv[2])
training_essays_df.to_excel(writer, sheet_name="results", index=False)
writer.save()
    # Build Dataset
    try:
        cur = {
            "title": title,
            "artist": artist,
            "year": year,
            "pos": pos,
            "lyrics": lyrics,
            "tags": get_tags(artist),
            "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
            "flesch_index": ts.flesch_reading_ease(lyrics_repl),
            "fog_index": ts.gunning_fog(lyrics_repl),
            "difficult_words": ts.difficult_words(lyrics_repl),
            "num_syllables": ts.syllable_count(lyrics_repl),
            "num_words": ts.lexicon_count(lyrics_repl, True),
            "num_lines": ts.sentence_count(lyrics_repl),
            "num_dupes": count_dupes(lyrics)
        }
        # print cur
        dataset.append(cur)
    except Exception, e:
        print e
except Exception, e:
    print "Exception occurred for " + artist + ' - ' + title
    print e

outfile = "years/" + str(year) + '.txt'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 31 15:25:10 2016

@author: megan
"""
from textstat.textstat import textstat as ts

fname = 'actbacFB.txt'
with open(fname, 'r', encoding='utf-8') as f:
    data = f.read().replace('\n', '')

total = ts.lexicon_count(data)
difficult = ts.difficult_words(data)
fkre = ts.flesch_reading_ease(data)
grade = ts.flesch_kincaid_grade(data)
overall = ts.text_standard(data)

print("Total words:", total)
print("Difficult words:", difficult)
print("FKRE:", fkre)
print("Grade:", grade)
print("Overall readability:", overall)
# "great booking book a hotel in ibiza." # test_data = "refund hotel in ewr of george area area nearby. 3-star belfast hotel in" test_data = "great rates. book at western blue casino hotel, bangkok. no reservation costs. great" print( "-------------------------Text Statistic-----------------------------------" ) print("Returns the number of syllables present in the given text.") print(textstat.syllable_count(test_data, lang='en_US')) print( "Calculates the number of words present in the text - punctuation removed") print(textstat.lexicon_count(test_data, removepunct=True)) print("Returns the number of sentences present in the given text.") print(textstat.sentence_count(test_data)) print("difficult words") print(textstat.difficult_words(test_data)) print( "-------------------------Readability Formula------------------------------" ) print("The Flesch Reading Ease Score") print(textstat.flesch_reading_ease(test_data)) print("The SMOG Index") print("Texts of fewer than 30 sentences are statistically invalid, " "because the SMOG formula was normed on 30-sentence samples.") print("textstat requires atleast 3 sentences for a result.") print(textstat.smog_index(test_data)) print("The Flesch-Kincaid Grade") print(textstat.flesch_kincaid_grade(test_data)) print("The Coleman-Liau Index") print(textstat.coleman_liau_index(test_data)) print("Automated Readability Index (ARI)")
def test_difficult_words(self):
    result = textstat.difficult_words(self.long_test)
    self.assertEqual(62, result)
# print(textstat.syllable_count(test_data, lang='en_US'))
num_syllables = textstat.syllable_count(test_data, lang='en_US')
print(num_syllables)
print("Calculates the number of words present in the text - punctuation removed")
# print(textstat.lexicon_count(test_data, removepunct=True))
num_words = textstat.lexicon_count(test_data, removepunct=True)
print(num_words)
print("Returns the number of sentences present in the given text.")
# print(textstat.sentence_count(test_data))
num_sentences = textstat.sentence_count(test_data)
print(num_sentences)
print("difficult words")
# print(textstat.difficult_words(test_data))
num_difficult_words = textstat.difficult_words(test_data)
print(num_difficult_words)
print("-------------------------Difficulty------------------------------")
print("The Flesch Reading Ease Score")
# print(textstat.flesch_reading_ease(test_data))
difficulty_score = textstat.flesch_reading_ease(test_data)
print(difficulty_score)
if 0 <= difficulty_score < 30:
    difficulty_label = "Confusing"
elif 30 <= difficulty_score < 50:
    difficulty_label = "Difficult"
elif 50 <= difficulty_score < 60:
#!/bin/python
import sys
from textstat.textstat import textstat

script_name = sys.argv[0]
inputfile = sys.argv[1]
with open(inputfile) as myfile:
    test_data = "".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(','.join([var1, var2, var3, var4, var5, var6, var7, var8, var9,
                var10, var11, var12, var13]))
import re
from textstat.textstat import textstat

raw_input("Please copy the lyrics to the two text files song1 and song2."
          "\nWhen complete hit enter to analyze.")
print ""
try:
    f = open('song1.txt')
    f_read = str(f.read())
    cleaned = re.sub("[\(\[].*?[\)\]]", "", f_read)
    if textstat.dale_chall_readability_score(cleaned) < 5:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | " + "Easily understood by 4th-grade students or lower."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 6:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
        print "Song #1 | " + "Easily understood by 5th-grade and 6th-grade students."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 7:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
        print "Song #1 | " + "Easily understood by 7th-grade and 8th-grade students."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 8:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
        print "Song #1 | " + "Easily understood by 9th-grade and 10th-grade students."
        f.close()
    elif textstat.dale_chall_readability_score(cleaned) < 9:
        print "Song #1 | Dale-Chall Score: " + str(textstat.dale_chall_readability_score(cleaned))
        print "Song #1 | # of Difficult Words: " + str(textstat.difficult_words(cleaned))
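# A hedged, table-driven version of the score-to-audience chain above. The
# snippet is truncated after the `< 9` branch, so the last two bands here are
# filled in from the standard Dale-Chall interpretation table, not from the
# original source.
DALE_CHALL_BANDS = [
    (5, "Easily understood by 4th-grade students or lower."),
    (6, "Easily understood by 5th-grade and 6th-grade students."),
    (7, "Easily understood by 7th-grade and 8th-grade students."),
    (8, "Easily understood by 9th-grade and 10th-grade students."),
    (9, "Easily understood by 11th-grade and 12th-grade students."),
    (10, "Easily understood by college students."),
]

def dale_chall_audience(score):
    for upper, description in DALE_CHALL_BANDS:
        if score < upper:
            return description
    return "Easily understood by college graduates."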
if os.path.isdir(FILE_OR_DIR):
    FILES = [os.path.join(FILE_OR_DIR, fn) for fn in os.listdir(FILE_OR_DIR)]
else:
    FILES = [FILE_OR_DIR]

for FILE in FILES:
    print 'Processing', FILE
    TEXT = read_file(FILE)
    print 'Flesch reading ease', textstat.flesch_reading_ease(TEXT)
    print 'Smog index', textstat.smog_index(TEXT)
    print 'Flesch Kincaid grade', textstat.flesch_kincaid_grade(TEXT)
    print 'Coleman Liau', textstat.coleman_liau_index(TEXT)
    print 'Automated readability index', textstat.automated_readability_index(TEXT)
    print 'Dale Chall readability score', textstat.dale_chall_readability_score(TEXT)
    print 'Difficult words', textstat.difficult_words(TEXT)
    print 'Linsear write formula', textstat.linsear_write_formula(TEXT)
    print 'Gunning fog', textstat.gunning_fog(TEXT)
    print 'Text standard', textstat.text_standard(TEXT)

    print '\nWords'
    WORDS = get_words(TEXT)
    get_word_stats(WORDS)

    print '\nWords no Stop Words'
    WORDS_NO_STOP = [w for w in WORDS if w not in stop]
    get_word_stats(WORDS_NO_STOP)

    print '\nSentences'
    SENTENCES = get_sentences(TEXT)
    get_sentence_stats(SENTENCES)
def percent_difficult_words(article):
    if textstat.lexicon_count(article) == 0:
        return 0
    return textstat.difficult_words(article) / textstat.lexicon_count(article)
def oov_words_diff(q1, q2):
    return textstat.difficult_words(q1) - textstat.difficult_words(q2)
    '../output_text/trump_out.txt',
    '../output_text/shakespeare_out.txt',
    '../output_text/drseuss_out.txt'
]
# input_file_names = ['../data_parsed/trump.txt',
input_file_names = [
    '../data_parsed/shakespeare.txt',
    '../data_parsed/drseuss.txt'
]

for i in range(0, len(input_file_names)):
    input_file_name = input_file_names[i]
    print(input_file_name)
    with open(input_file_name, 'r') as myfile:
        test_data = myfile.read().replace('\n', '')

    print "flesch_reading_ease: " + str(textstat.flesch_reading_ease(test_data))
    print "smog_index: " + str(textstat.smog_index(test_data))
    print "flesch_kincaid_grade: " + str(textstat.flesch_kincaid_grade(test_data))
    print "coleman_liau_index: " + str(textstat.coleman_liau_index(test_data))
    print "automated_readability_index: " + str(textstat.automated_readability_index(test_data))
    print "dale_chall_readability_score: " + str(textstat.dale_chall_readability_score(test_data))
    print "difficult_words: " + str(textstat.difficult_words(test_data))
    print "linsear_write_formula: " + str(textstat.linsear_write_formula(test_data))
    print "gunning_fog: " + str(textstat.gunning_fog(test_data))
    print "text_standard: " + str(textstat.text_standard(test_data))