def do_text_stats(self, text):
    """Compute the full textstat readability battery for *text*.

    Returns a dict of metric name -> value.  The four metrics that
    textstat may raise TypeError on are reported as None in that case;
    all other exceptions propagate unchanged.
    """
    def _metric_or_none(fn):
        # Some textstat functions raise TypeError on degenerate input;
        # report those metrics as None instead of failing the whole call.
        try:
            return fn(text)
        except TypeError:
            return None

    # Flesch Reading Ease interpretation:
    #   90-100 Very Easy | 80-89 Easy | 70-79 Fairly Easy | 60-69 Standard
    #   50-59 Fairly Difficult | 30-49 Difficult | 0-29 Very Confusing
    return {
        "syllable_count": textstat.syllable_count(text),
        "lexicon_count": textstat.lexicon_count(text, True),
        "sentence_count": textstat.sentence_count(text),
        "flesch_reading_ease": _metric_or_none(textstat.flesch_reading_ease),
        "flesch_kincaid_grade": _metric_or_none(textstat.flesch_kincaid_grade),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "automated_readability_index":
            textstat.automated_readability_index(text),
        "coleman_liau_index": _metric_or_none(textstat.coleman_liau_index),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "dale_chall_readability_score":
            textstat.dale_chall_readability_score(text),
        # Readability consensus based upon all the above tests.
        "text_standard": _metric_or_none(textstat.text_standard),
    }
def readability(text):
    """Print a suite of textstat readability scores for *text*.

    Returns the textstat "Compiled Score" (text_standard consensus).

    Fixes: the original used Python 2 `print` statements (a syntax error
    under Python 3) and ended with `return len(adjectives)`, where
    `adjectives` was never defined — an unconditional NameError.  It now
    returns the compiled score instead.
    """
    print("Readability\n=================================\n\n")
    # (section header, metric function) pairs, in the original print order.
    sections = [
        ("Flesch Reading Ease", textstat.flesch_reading_ease),
        ("Smog Index", textstat.smog_index),
        ("Flesch Kincaid Grade", textstat.flesch_kincaid_grade),
        ("Coleman Liau Index", textstat.coleman_liau_index),
        ("ARI", textstat.automated_readability_index),
        ("Dale Chall", textstat.dale_chall_readability_score),
        ("Difficult Words", textstat.difficult_words),
        ("Linsear Write Formula", textstat.linsear_write_formula),
        ("Gunning Fog", textstat.gunning_fog),
    ]
    for title, metric in sections:
        print(title + "\n________________________\n\n")
        print(str(metric(text)) + "\n")
    print("Compiled Score\n_____________________________\n\n")
    score = textstat.text_standard(text)
    print(str(score) + "\n")
    return score
def get_readability(df2):
    """Return a copy of *df2* with textstat readability columns appended.

    For every object-dtype (text) column, one new column per metric is
    added, named '<metric><column index>'.
    """
    df = df2.copy()
    # Metric functions keyed by the column-name prefix they produce.
    metrics = [
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('smog_index', textstat.smog_index),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('automated_readability_index', textstat.automated_readability_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('difficult_words', textstat.difficult_words),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('gunning_fog', textstat.gunning_fog),
        ('text_standard', textstat.text_standard),
    ]
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        for name, func in metrics:
            # apply() can take the metric directly; no lambda wrapper needed.
            df['{}{}'.format(name, i)] = df[col].apply(func)
    return df
def get_description_composite_grade_level(self):
    """Calculates the grade level of the repository's description using a
    variety of measures in textstat.

    Returns None when there is no description to analyze.
    """
    # Guard clause: nothing to score without a description.
    if not self.description:
        return None
    return textstat.text_standard(self.description)
def get_readme_composite_grade_level(self):
    """Calculates the grade level of the repository's readme using a
    variety of measures in textstat.

    Returns None when there is no readme to analyze.
    """
    # Guard clause: nothing to score without a readme.
    if not self.readme:
        return None
    return textstat.text_standard(self.readme)
def main():
    """Print the textstat grade consensus for each episode's combined scene text."""
    conn = psycopg2.connect(**db.config())
    cursor = conn.cursor()
    cursor.execute(
        'select season, episode, string_agg(text, \' \') from scene_text '
        'group by season, episode order by season, episode'
    )
    # Pull rows one at a time until the cursor is exhausted.
    while (row := cursor.fetchone()) is not None:
        season, episode, text = row
        print(season, episode, textstat.text_standard(text, True))
def complexityAlongtheText(text: str, chunk_length: int = 5) -> "tuple[float, float, str]":
    """Score text difficulty chunk-by-chunk.

    Splits *text* into sentences, scores each consecutive run of
    *chunk_length* sentences with textstat's composite grade level
    (float form), and tracks the hardest chunk.

    Fixes over the original:
      * the chunk slice hard-coded 5 sentences instead of *chunk_length*;
      * the "hardest chunk" logic compared a score against a list index
        and recorded the cursor *after* it had advanced, so the returned
        snippet never matched the hardest chunk — the score and the start
        index of the hardest chunk are now tracked separately;
      * the return annotation said Union where a 3-tuple is returned.

    Returns:
        (mean chunk score, whole-text score, hardest chunk snippet)
    """
    sentences = sent_tokenize(text)
    scores = []
    hardest_score = float("-inf")
    hardest_start = 0
    for start in range(0, len(sentences), chunk_length):
        chunk_text = " ".join(sentences[start:start + chunk_length])
        score = textstat.text_standard(chunk_text, float_output=True)
        scores.append(score)
        if score > hardest_score:
            hardest_score = score
            hardest_start = start
    # Reproduce the original snippet format: each sentence followed by a space.
    hard_snippet = "".join(
        s + " " for s in sentences[hardest_start:hardest_start + chunk_length]
    )
    return (
        np.mean(scores),
        textstat.text_standard(text, float_output=True),
        hard_snippet,
    )
def main():
    """Print the textstat grade consensus for the 25 characters with the most lines."""
    conn = psycopg2.connect(**db.config())
    cursor = conn.cursor()
    cursor.execute(
        'select * from all_lines_by_char where char_name in '
        '(select char_name from total_lines_by_char limit 25)'
    )
    # Pull rows one at a time until the cursor is exhausted.
    while (row := cursor.fetchone()) is not None:
        print(row[0], textstat.text_standard(row[1], True))
def _is_readable(phrase: str) -> bool:
    """
    Checks if a given phrase is readable

    :param phrase: The string to check
    :return: True if readable, false if not
    """
    textstat.set_lang("en")
    # Readable means the composite grade level exceeds 6.
    return textstat.text_standard(phrase, float_output=True) > 6
def get_stats(sentence):
    """Return combined textstat counts plus the (grammar-fixed) grade standard."""
    syllables = textstat.syllable_count(sentence)
    words = textstat.lexicon_count(sentence, True)
    sentences = textstat.sentence_count(sentence)
    # With no sentences, textstat's standard is meaningless; use the placeholder.
    standard = (textstat.text_standard(sentence)
                if sentences > 0 else EMPTY_TEXT_STANDARD)
    return combine(syllables, words, sentences, fix_grammar_errors(standard))
def main():
    """Print the textstat grade consensus for each episode's combined scene text."""
    conn = psycopg2.connect(**db.config())
    cursor = conn.cursor()
    cursor.execute(
        'select season, episode, string_agg(text, \' \') from scene_text '
        'group by season, episode order by season, episode'
    )
    # iter(callable, sentinel) calls fetchone() until it returns None.
    for season, episode, text in iter(cursor.fetchone, None):
        print(season, episode, textstat.text_standard(text, True))
def get_readability(contents):
    """Return the standard textstat score battery for *contents* as a list.

    Order is fixed: flesch_reading_ease, smog_index, flesch_kincaid_grade,
    automated_readability_index, dale_chall_readability_score,
    difficult_words, linsear_write_formula, gunning_fog,
    coleman_liau_index, text_standard.
    """
    return [
        textstat.flesch_reading_ease(contents),
        textstat.smog_index(contents),
        textstat.flesch_kincaid_grade(contents),
        textstat.automated_readability_index(contents),
        textstat.dale_chall_readability_score(contents),
        textstat.difficult_words(contents),
        textstat.linsear_write_formula(contents),
        textstat.gunning_fog(contents),
        textstat.coleman_liau_index(contents),
        textstat.text_standard(contents),
    ]
def analyse_json(json_text):
    """Build a per-witness readability DataFrame from a transcript JSON string.

    Each witness's spoken sections are concatenated and scored with
    textstat.  Witnesses with no recorded text still get a row carrying
    only the file location and name.

    Fix: the original had an `else` branch for transcripts without a
    'witnesses' key that referenced the undefined names `i` and `p` and
    therefore always raised NameError; such transcripts now simply yield
    an empty DataFrame.
    """
    df_witnesses = pd.DataFrame(
        columns=['html_file_location', 'witness_name', 'syllable_count',
                 'lexicon_count', 'sentence_count', 'syllables_per_word',
                 'gunning_fog', 'smog_index', 'text_standard'],
        index=[])
    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']
        # Gather every spoken section under its witness.
        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type'] == 'witness':
                witness = witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])
        for i, p in enumerate(witnesses):
            # Missing 'all_text' and empty text get the same sparse row.
            witness_text = '\n\n'.join(witnesses[p].get('all_text', []))
            if len(witness_text) > 0:
                stats_data = {
                    'html_file_location': trscrpt['html_file_location'],
                    'witness_name': p,
                    'syllable_count': textstat.syllable_count(witness_text),
                    'lexicon_count': textstat.lexicon_count(witness_text),
                    'sentence_count': textstat.sentence_count(witness_text),
                    'syllables_per_word':
                        textstat.avg_syllables_per_word(witness_text),
                    'gunning_fog': textstat.gunning_fog(witness_text),
                    'smog_index': textstat.smog_index(witness_text),
                    'text_standard': textstat.text_standard(witness_text)}
                df_witnesses.loc['witness_%i' % i] = stats_data
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = \
                    trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
    return df_witnesses
def run_textstat(text):
    """Run the standard textstat battery on *text*.

    Returns a 10-tuple, in order: flesch_reading_ease, smog_index,
    flesch_kincaid_grade, coleman_liau_index, automated_readability_index,
    dale_chall_readability_score, difficult_words, linsear_write_formula,
    gunning_fog, text_standard.
    """
    return (
        textstat.flesch_reading_ease(text),
        textstat.smog_index(text),
        textstat.flesch_kincaid_grade(text),
        textstat.coleman_liau_index(text),
        textstat.automated_readability_index(text),
        textstat.dale_chall_readability_score(text),
        textstat.difficult_words(text),
        textstat.linsear_write_formula(text),
        textstat.gunning_fog(text),
        textstat.text_standard(text),
    )
def lambda_handler(event, context):
    """AWS Lambda entry point: score event['text'] with the textstat battery."""
    text = event['text']
    # Build the whole response in one literal; evaluation order matches
    # the original assignment order.
    response = {
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog_index': textstat.smog_index(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'automated_readability_index':
            textstat.automated_readability_index(text),
        'dale_chall_readability_score':
            textstat.dale_chall_readability_score(text),
        'difficult_words': textstat.difficult_words(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'text_standard': textstat.text_standard(text),
    }
    return respond(None, response)
def stats(self, text):
    """Return a dict of textstat readability scores and raw text counts."""
    # Dict literal preserves the original computation order and keys
    # (including the historical mixed-style key names).
    return {
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog': textstat.smog_index(text),
        'flesch kincaid': textstat.flesch_kincaid_grade(text),
        'coleman Liau': textstat.coleman_liau_index(text),
        'automated': textstat.automated_readability_index(text),
        'dale chall': textstat.dale_chall_readability_score(text),
        'difficult': textstat.difficult_words(text),
        'linsear': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'standard': textstat.text_standard(text),
        'charcount': textstat.char_count(text),
        'lexicon count': textstat.lexicon_count(text),
        'syllable count': textstat.syllable_count(text),
        'sentence count': textstat.sentence_count(text),
        'avg sentence length': textstat.avg_sentence_length(text),
        'avg_syllables_per_word': textstat.avg_syllables_per_word(text),
        'avg_letter_per_word': textstat.avg_letter_per_word(text),
        'avg_sentence_per_word': textstat.avg_sentence_per_word(text),
    }
# main script
# Fix: converted from Python 2 (print statements, raw_input) to Python 3;
# all output text is unchanged.
if __name__ == '__main__':
    print("TextStat Comparison Script")
    print("--------------------------")
    # read in text from the command line
    # This needs to be fixed to deal/escape special characters
    textToCheck = input("Please enter the text you would like to analyse: ")
    # read in text from a file- but what format?
    print("\n\n")
    print("Results")
    print("==============================================")
    print("==============================================\n")
    print("Syllable Count: " + str(textstat.syllable_count(textToCheck)))
    # True is the default and removes punctuation before counting.
    print("Lexicon Count: " + str(textstat.lexicon_count(textToCheck)))
    print("Sentence Count: " + str(textstat.sentence_count(textToCheck)))
    print("Flesch Reading Ease formula: " + str(textstat.flesch_reading_ease(textToCheck)))
    print("Flesch-Kincaid Grade Level: " + str(textstat.flesch_kincaid_grade(textToCheck)))
    print("Fog Scale (Gunning FOG Formula): " + str(textstat.gunning_fog(textToCheck)))
    print("SMOG Index: " + str(textstat.smog_index(textToCheck)))
    print("Automated Readability Index: " + str(textstat.automated_readability_index(textToCheck)))
    print("Coleman-Liau Index: " + str(textstat.coleman_liau_index(textToCheck)))
    print("Linsear Write Formula: " + str(textstat.linsear_write_formula(textToCheck)))
    print("Dale-Chall Readability Score: " + str(textstat.dale_chall_readability_score(textToCheck)))
    print("--------------------------------------------------------------")
    print("Readability Consensus based upon all the above tests: " + str(textstat.text_standard(textToCheck)))
    print("\n\n")
print(linsear_write_formula) print("The Fog Scale (Gunning FOG Formula)") # print(textstat.gunning_fog(test_data)) gunning_fog = textstat.gunning_fog(test_data) print(gunning_fog) print( "---------------------------------Summary----------------------------------" ) print("Readability Consensus based upon all the above tests") print( "Based upon all the above tests, " "returns the estimated school grade level required to understand the text." ) # print(textstat.text_standard(test_data)) school_grade_level = textstat.text_standard(test_data) print(school_grade_level) # l = [flesch_kincaid_grade, coleman_liau_index, automated_readability_index, linsear_write_formula, gunning_fog] # grade_level_avg = sum(l) / len(l) # print(grade_level_avg) print( "--------------------------------------------------------------------------" ) print( "\n------------------------------------------------------------------------------" ) print("Save every Thing on CSV file") list_row = [
def __init__(self, path):
    """Create a document instance for analysis.

    Reads *path* via textract (txt, odt, pdf, docx, doc — TODO confirm
    supported set against textract config), normalizes the raw text, and
    precomputes sentence/word tokenizations, passive-voice and "to be"
    verb statistics, textstat readability scores, word/page counts
    (250 words per page), part-of-speech distributions, and punctuation
    usage examples.

    Args:
        path (str): relative path to the document to open and analyze.

    Notable attributes (set only when *path* is an existing file):
        raw_text / ptext / text_no_feed: progressively cleaned text.
        sentence_tokens, sentence_count: sentence tokenization.
        passive_sentences, percent_passive: passive-voice analysis.
        be_verb_count, weak_sentences_set: "to be" verb analysis.
        readability_*: textstat readability scores.
        word_count, page_length, paper_count: size at 250 words/page.
        parts_of_speech, pos_counts, pos_freq: POS tag distribution.
        commas/semicolons (+ *_sentences, *_example): punctuation usage.
        lint_suggestions: lint() output on the raw text.

    Fix: the original computed rand_weak_sentence from
    ``self.weak_sentences`` — an attribute that is never assigned (the
    analysis stores ``weak_sentences_all``/``weak_sentences_set``) — so
    the call always raised AttributeError.  It now samples from
    ``weak_sentences_all``, matching how rand_passive samples from
    passive_sentences.
    """
    self.user = ""
    self.path = path
    self.abs_path = os.path.abspath(self.path)
    if os.path.isfile(self.path):
        self.time_stamp = self.timestamp()
        self.file_name = os.path.basename(path)
        self.mime = MimeTypes()
        self.guessed_type = self.mime.guess_type(self.path)
        self.file_type = self.guessed_type[0]
        self.raw_text = textract.process(self.path, encoding="ascii")
        # Normalize curly quotes, em dashes and ellipses to ASCII forms.
        self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
        self.ptext = re.sub(u"\u2014", "--", self.ptext)
        self.ptext = re.sub(",", ",", self.ptext)
        self.ptext = re.sub("—", "--", self.ptext)
        self.ptext = re.sub("…", "...", self.ptext)
        self.text_no_feed = self.clean_new_lines(self.ptext)
        self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
        self.sentence_count = len(self.sentence_tokens)
        self.passive_sentences = passive(self.text_no_feed)
        self.passive_sentence_count = len(self.passive_sentences)
        self.percent_passive = (100 *
                                (float(self.passive_sentence_count) /
                                 float(self.sentence_count)))
        self.percent_passive_round = round(self.percent_passive, 2)
        self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
        self.be_verb_count = self.be_verb_analysis[0]
        self.weak_sentences_all = self.be_verb_analysis[1]
        self.weak_sentences_set = set(self.weak_sentences_all)
        self.weak_sentences_count = len(self.weak_sentences_set)
        self.weak_verbs_to_sentences = 100 * float(
            self.weak_sentences_count) / float(self.sentence_count)
        self.weak_verbs_to_sentences_round = round(
            self.weak_verbs_to_sentences, 2)
        self.word_tokens = self.word_tokenize(self.text_no_feed)
        self.word_tokens_no_punct = \
            self.word_tokenize_no_punct(self.text_no_feed)
        self.no_punct = self.strip_punctuation(self.text_no_feed)
        # use this! It makes text lower case and strips symbols
        self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)

        self.readability_flesch_re = \
            textstat.flesch_reading_ease(self.text_no_feed)
        self.readability_smog_index = \
            textstat.smog_index(self.text_no_feed)
        self.readability_flesch_kincaid_grade = \
            textstat.flesch_kincaid_grade(self.text_no_feed)
        self.readability_coleman_liau_index = \
            textstat.coleman_liau_index(self.text_no_feed)
        self.readability_ari = \
            textstat.automated_readability_index(self.text_no_feed)
        self.readability_linser_write = \
            textstat.linsear_write_formula(self.text_no_feed)
        self.readability_dale_chall = \
            textstat.dale_chall_readability_score(self.text_no_feed)
        self.readability_standard = \
            textstat.text_standard(self.text_no_feed)
        self.flesch_re_desc_str = self.flesch_re_desc(
            int(textstat.flesch_reading_ease(self.text_no_feed)))
        self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
        self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
        self.avg_syllables_per_word = textstat.avg_syllables_per_word(
            self.text_no_feed)
        self.avg_sentence_per_word = textstat.avg_sentence_per_word(
            self.text_no_feed)
        self.avg_sentence_length = textstat.avg_sentence_length(
            self.text_no_feed)
        self.avg_letter_per_word = textstat.avg_letter_per_word(
            self.text_no_feed)
        self.difficult_words = textstat.difficult_words(self.text_no_feed)
        self.rand_passive = self.select_random(self.passive_sentence_count,
                                               self.passive_sentences)
        # BUGFIX: was self.weak_sentences (never assigned) — see docstring.
        self.rand_weak_sentence = self.select_random(
            len(self.weak_sentences_all), self.weak_sentences_all)
        if self.word_tokens_no_punct:
            self.word_count = len(self.word_tokens_no_punct)
            self.page_length = float(self.word_count) / float(250)
            self.paper_count = int(math.ceil(self.page_length))
            self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
            self.pos_counts = Counter(
                tag for word, tag in self.parts_of_speech)
            self.pos_total = sum(self.pos_counts.values())
            self.pos_freq = dict(
                (word, float(count) / self.pos_total)
                for word, count in self.pos_counts.items())
            self.doc_pages = float(float(self.word_count) / float(250))
            self.freq_words = \
                self.word_frequency(self.word_tokens_no_punct)
            self.modal_dist = self.modal_count(self.word_tokens_no_punct)
            self.pos_count_dict = self.pos_counts.items()
            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate(
                'IN', self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy(
                'RB', self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy(
                'NNP', self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(
                len(self.comma_sentences), self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
def text_proc(corpus, urlDat=None, WORD_LIM=30, verbose=False):
    """Tokenize *corpus* and fill *urlDat* with readability/sentiment metrics.

    Returns the urlDat dict (created fresh if not supplied), or {} when
    the text is a scraper error page or shorter than WORD_LIM tokens.

    Fix: urlDat previously defaulted to a mutable dict ({}) that the
    function mutates, so results leaked between calls that relied on the
    default; it now defaults to None and a fresh dict is made per call.
    Dead commented-out code has been removed.
    """
    if urlDat is None:
        urlDat = {}
    if type(corpus) is type(str()) and corpus not in str("Redirecting"):
        # Bail out on scraper error pages.
        if str("some error has occurred while processing your request"
               ) in corpus:
            return {}
        if str("We apologize for the inconvenience...") in corpus:
            return {}
        corpus = corpus.replace("/", " ")  # remove characters that nltk can't read
        corpus = corpus.lower()
        corpus = corpus.replace(u"\xa0", u" ")
        corpus = corpus.replace(u"\\", u" ")
        corpus, this_is_science = extract_science_block(corpus)
        if "semantic" in urlDat.keys():
            if urlDat["semantic"]:
                this_is_science = True
        urlDat["science"] = this_is_science
        # NOTE(review): corpus is a str here, so this iterates characters and
        # the list is always empty (a single char never has len > 16) —
        # probably meant to iterate word tokens; confirm intent before changing.
        urlDat["big_words"] = [word for word in corpus if len(word) > 16]
        ignoreSingleSentences = 1
        corpus = cleanup_pretagger_all(corpus)
        if verbose:
            st.text("pretagger all")
            st.text(type(corpus))
        tokens = word_tokenize(corpus)
        if verbose:
            st.text("token input")
            st.text(tokens)
        tokens = [t for t in tokens if t not in not_want_list]
        wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
            tokens, ignoreSingleSentences=1)
        # Too little text to score meaningfully.
        if len(tokens) < WORD_LIM:
            return {}
        if len(tokens) >= WORD_LIM:
            remainingText = " ".join(remainingText)
            remainingText = remainingText.lower()
            urlDat["standard"] = textstat.text_standard(remainingText,
                                                        float_output=True)
            if wc > 0 and sc > 0:
                if "semantic" in urlDat.keys() or urlDat["standard"] > 95:
                    urlDat["fre_unbiased"] = freeAlongtheText(
                        corpus, chunk_length=512)
                fre = FRE(wc, sc, sylCount)
                if "semantic" in urlDat.keys():
                    if urlDat["semantic"]:
                        # NDC index and percentage of difficult words.
                        ndc = NDC(remainingText, wc, sc)
                meanv, total, hard_snippet = complexityAlongtheText(
                    corpus, chunk_length=256)
                urlDat["standard_unbiased"] = meanv
                if "semantic" in urlDat.keys():
                    urlDat["hard_snippet"] = hard_snippet
            tokens = [w.lower() for w in tokens if w.isalpha()]
            tokens = [w.lower() for w in tokens]  # make everything lower case
            urlDat["wcount"] = textstat.lexicon_count(str(tokens))
            word_lim = bool(urlDat["wcount"] > WORD_LIM)
            if len(tokens):
                if "semantic" in urlDat.keys():
                    urlDat["tokens"] = tokens
                lexicon = textstat.lexicon_count(corpus, True)
                # It's harder to have a good unique ratio in a long document,
                # as 'and', 'the' and 'a' will dominate.
                urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens))
                urlDat["unique_words"] = len(set(tokens))
                # Sentiment and subjectivity analysis.
                testimonial = TextBlob(corpus)
                urlDat["sp"] = testimonial.sentiment.polarity
                urlDat["ss"] = testimonial.sentiment.subjectivity
                urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity)
                urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity)
                urlDat["gf"] = textstat.gunning_fog(corpus)
        if "standard" in urlDat.keys():
            if urlDat["standard"] == 0:
                if verbose:
                    st.text("gets here")
    return urlDat
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 31 15:25:10 2016

@author: megan
"""
from textstat.textstat import textstat as ts

fname = 'actbacFB.txt'

# Read the corpus and collapse it onto a single line.
with open(fname, 'r', encoding='utf-8') as f:
    data = f.read().replace('\n', '')

# Compute each readability metric, then report them in the same order.
total = ts.lexicon_count(data)
difficult = ts.difficult_words(data)
fkre = ts.flesch_reading_ease(data)
grade = ts.flesch_kincaid_grade(data)
overall = ts.text_standard(data)

for label, value in [
    ("Total words:", total),
    ("Difficult words:", difficult),
    ("FKRE:", fkre),
    ("Grade:", grade),
    ("Overall readability", overall),
]:
    print(label, value)
def text_proc(corpus, urlDat=None, WORD_LIM=100):
    """Tokenize *corpus* and fill *urlDat* with readability/conciseness metrics.

    Computes word counts, publication/science clue checks, uniqueness,
    compression-based info density, sentiment, gunning fog, the textstat
    composite standard, and a combined "penalty" score.

    Fix: urlDat previously defaulted to a mutable dict ({}) that the
    function mutates, so results leaked between calls that relied on the
    default; it now defaults to None and a fresh dict is made per call.
    """
    if urlDat is None:
        urlDat = {}
    # remove unreadable characters
    if type(corpus) is str and str('privacy policy') not in corpus:
        corpus = corpus.replace("-", " ")  # remove characters that nltk can't read
        textNum = re.findall(r'\d', corpus)  # numbers nltk cannot analyze
        tokens = word_tokenize(corpus)
        stop_words = stopwords.words('english')
        # Keep only words that are NOT stopwords, lower-cased.
        tokens = [word for word in tokens if not word in stop_words]
        tokens = [w.lower() for w in tokens]
        urlDat['wcount'] = textstat.lexicon_count(str(tokens))
        # Word limits filter out product/merchandise pages, which otherwise
        # dominate scraped search results.
        word_lim = bool(urlDat['wcount'] > WORD_LIM)
        urlDat['tokens'] = tokens
        if 'big_model' in urlDat.keys():
            urlDat['perplexity'] = perplexity(corpus, urlDat['big_model'])
        else:
            urlDat['perplexity'] = None
        urlDat['publication'] = publication_check(str(tokens))[1]
        urlDat['clue_words'] = clue_words(str(tokens))[1]
        if str('link') in urlDat.keys():
            urlDat['clue_links'] = clue_links(urlDat['link'])[1]
            temp = len(urlDat['clue_words']) + len(
                urlDat['publication']) + len(urlDat['clue_links'])
            if temp > 10 and str('wiki') not in urlDat['link']:
                urlDat['science'] = True
            else:
                urlDat['science'] = False
            if str('wiki') in urlDat['link']:
                urlDat['wiki'] = True
            else:
                urlDat['wiki'] = False
        not_empty = bool(len(tokens) != 0)
        if not_empty and word_lim:
            tokens = [w.lower() for w in tokens if w.isalpha()]
            # The larger the unique-to-repeated word ratio, the more
            # colourful the language.
            lexicon = textstat.lexicon_count(corpus, True)
            urlDat['uniqueness'] = len(set(tokens)) / float(len(tokens))
            # Big deltas mean redundancy/sparse information density.
            urlDat['info_density'] = comp_ratio(corpus)
            # Sentiment and subjectivity analysis.
            testimonial = TextBlob(corpus)
            urlDat['sp'] = testimonial.sentiment.polarity
            urlDat['ss'] = testimonial.sentiment.subjectivity
            urlDat['sp_norm'] = np.abs(testimonial.sentiment.polarity)
            urlDat['ss_norm'] = np.abs(testimonial.sentiment.subjectivity)
            urlDat['gf'] = textstat.gunning_fog(corpus)
            # Metric explanations: https://github.com/shivam5992/textstat
            urlDat['standard'] = textstat.text_standard(corpus,
                                                        float_output=True)
            # Good writing is readable, objective and concise: balance
            # concision, low obfuscation and expressiveness.
            wc = float(1.0 / urlDat['wcount'])
            urlDat['scaled'] = wc * urlDat['standard']
            urlDat['conciseness'] = urlDat['wcount'] * (urlDat['uniqueness']) + \
                urlDat['wcount'] * (urlDat['info_density'])
            urlDat['conciseness'] = bi_log_value(urlDat['conciseness'])
            if urlDat['perplexity'] is not None:
                urlDat['perplexity'] = bi_log_value(urlDat['perplexity'])
                penalty = (urlDat['standard'] + urlDat['conciseness'] +
                           urlDat['scaled'] + urlDat['perplexity']) / 4.0
            else:
                penalty = (urlDat['standard'] + urlDat['conciseness'] +
                           urlDat['scaled']) / 3.0
            urlDat['penalty'] = penalty
    return urlDat
def text_proc(corpus, urlDat=None, WORD_LIM=100):
    """Populate ``urlDat`` with readability and sentiment metrics for *corpus*.

    Parameters
    ----------
    corpus : str
        Raw document text.
    urlDat : dict, optional
        Result dict to populate; a fresh dict is created when omitted.
        (The original signature used a mutable default ``{}``, which is
        shared between calls and silently accumulates state — fixed here.)
    WORD_LIM : int, optional
        Minimum word count; shorter documents only get the cheap fields.

    Returns
    -------
    dict
        ``urlDat`` with 'wcount', 'english', 'clue_words' and 'clue_links'
        always set, plus uniqueness/info_density/sentiment/readability and
        'penalty' when the text is non-empty, English, and above WORD_LIM.
    """
    if urlDat is None:
        urlDat = {}
    # Remove characters that nltk can't read.
    corpus = corpus.replace("-", " ")
    tokens = word_tokenize(corpus)
    tokens = [w.lower() for w in tokens]  # make everything lower case
    # NOTE(review): counting over str(tokens) (the list's repr) is kept
    # verbatim from the original, which flagged it as "the kind of change
    # that might break everything" — confirm before altering.
    urlDat['wcount'] = textstat.lexicon_count(str(tokens))
    # Word limits filter out product-merchandise websites, which otherwise
    # dominate scraped search-engine results.
    word_lim = bool(urlDat['wcount'] > WORD_LIM)
    try:
        urlDat['english'] = english_check(corpus)
        urlDat['clue_words'] = clue_words(corpus)
        urlDat['clue_links'] = clue_links(urlDat['link'])
    except Exception:
        # The post-modern essay generator is so obfuscated that ENGLISH
        # classification fails, so this criterion is relaxed.
        urlDat['english'] = True
        urlDat['clue_words'] = (False, [])
        urlDat['clue_links'] = (False, [])
    if tokens and urlDat['english'] and word_lim:
        # Keep alphabetic tokens only (already lower-cased above).
        tokens = [w for w in tokens if w.isalpha()]
        # The larger the ratio of unique words to repeated words, the more
        # colourful the language; long documents are naturally penalised
        # because 'and', 'the' and 'a' dominate.
        urlDat['uniqueness'] = len(set(tokens)) / float(len(tokens))
        # Compression ratio as a proxy for information density: different
        # scientific concepts incur different degrees of irreducible
        # complexity that plain grade-level analysis cannot accommodate.
        urlDat['info_density'] = comp_ratio(corpus)
        # Sentiment and subjectivity analysis.
        testimonial = TextBlob(corpus)
        urlDat['sp'] = testimonial.sentiment.polarity
        urlDat['ss'] = testimonial.sentiment.subjectivity
        urlDat['gf'] = textstat.gunning_fog(corpus)
        # Explanation of metrics: https://github.com/shivam5992/textstat
        standard_ = textstat.text_standard(corpus)
        # text_standard() returns e.g. "9th and 10th grade"; take the
        # leading one- or two-digit number.
        try:
            urlDat['standard'] = float(standard_[0:2])
        except ValueError:
            urlDat['standard'] = float(standard_[0:1])
        # Special sauce: good writing is readable, objective and concise,
        # so the penalty balances grade level against sentiment extremes.
        urlDat['penalty'] = urlDat['standard'] + abs(urlDat['sp']) + abs(urlDat['ss'])
    return urlDat
print(textstat.flesch_reading_ease(test_data)) print("The SMOG Index") print("Texts of fewer than 30 sentences are statistically invalid, " "because the SMOG formula was normed on 30-sentence samples.") print("textstat requires atleast 3 sentences for a result.") print(textstat.smog_index(test_data)) print("The Flesch-Kincaid Grade") print(textstat.flesch_kincaid_grade(test_data)) print("The Coleman-Liau Index") print(textstat.coleman_liau_index(test_data)) print("Automated Readability Index (ARI)") print(textstat.automated_readability_index(test_data)) print("Dale-Chall Readability Score") print(textstat.dale_chall_readability_score(test_data)) print("Linsear Write Formula") print(textstat.linsear_write_formula(test_data)) print("The Fog Scale (Gunning FOG Formula)") print(textstat.gunning_fog(test_data)) print( "---------------------------------Summary----------------------------------" ) print("Readability Consensus based upon all the above tests") print( "Based upon all the above tests, " "returns the estimated school grade level required to understand the text." ) print(textstat.text_standard(test_data)) print( "--------------------------------------------------------------------------" )
'../output_text/trump_out.txt', '../output_text/shakespeare_out.txt', '../output_text/drseuss_out.txt' ] # input_file_names = ['../data_parsed/trump.txt', input_file_names = [ '../data_parsed/shakespeare.txt', '../data_parsed/drseuss.txt' ] for i in range(0, len(input_file_names)): input_file_name = input_file_names[i] print(input_file_name) with open(input_file_name, 'r') as myfile: test_data = myfile.read().replace('\n', '') print "flesch_reading_ease: " + str( textstat.flesch_reading_ease(test_data)) print "smog_index: " + str(textstat.smog_index(test_data)) print "flesch_kincaid_grade: " + str( textstat.flesch_kincaid_grade(test_data)) print "coleman_liau_index: " + str(textstat.coleman_liau_index(test_data)) print "automated_readability_index: " + str( textstat.automated_readability_index(test_data)) print "dale_chall_readability_score: " + str( textstat.dale_chall_readability_score(test_data)) print "difficult_words: " + str(textstat.difficult_words(test_data)) print "linsear_write_formula: " + str( textstat.linsear_write_formula(test_data)) print "gunning_fog: " + str(textstat.gunning_fog(test_data)) print "text_standard: " + str(textstat.text_standard(test_data))
v = cr[k] gl = [] for s in tqdm(v): if (gl == []): gl.append(textstat.flesch_kincaid_grade(s) / len(v)) gl.append(textstat.smog_index(s) / len(v)) gl.append(textstat.automated_readability_index(s) / len(v)) gl.append(textstat.dale_chall_readability_score(s) / len(v)) gl.append(textstat.coleman_liau_index(s) / len(v)) gl.append(textstat.linsear_write_formula(s) / len(v)) gl.append(textstat.gunning_fog(s) / len(v)) else: gl[0] += textstat.flesch_kincaid_grade(s) / len(v) gl[1] += textstat.smog_index(s) / len(v) gl[2] += textstat.automated_readability_index(s) / len(v) gl[3] += textstat.dale_chall_readability_score(s) / len(v) gl[4] += textstat.coleman_liau_index(s) / len(v) gl[5] += textstat.linsear_write_formula(s) / len(v) gl[6] += textstat.gunning_fog(s) / len(v) t = "" for s in v: t += s gl.append(textstat.text_standard(t)) data.append([k] + gl) except: print "null" with open('speaker_map_all.csv', 'w') as file: writer = csv.writer(file) writer.writerows(data)
def get_grade_level(block): consensus = textstat.text_standard(block) return float(consensus[0]) + .5
testimonial = TextBlob(soup_nocode) polarity_val = testimonial.sentiment.polarity subjectivity_val = testimonial.sentiment.subjectivity #print("\nAverage sentiment::\n") #print(testimonial.sentiment) # for t in testimonial.sentences: # print("\nsentiment :::\n ") # print(t) # print(t.sentiment) #calculate readability #print("\nAverage readability::\n") if (len(soup_nohtml) > 1): readability_val = textstat.text_standard(soup_nohtml) readability_score = readability_val.split("th")[0] else: readability_score = 0 #get the number of the lower grade #print(readability_val) #print(readability_score) # update values # alterando os dados da tabela cursor.execute( """
FILES = [FILE_OR_DIR] for FILE in FILES: print 'Processing', FILE TEXT = read_file(FILE) print 'Flesh reading ease', textstat.flesch_reading_ease(TEXT) print 'Smog index', textstat.smog_index(TEXT) print 'Flesch Kincaid grade', textstat.flesch_kincaid_grade(TEXT) print 'Coleman Liau', textstat.flesch_kincaid_grade(TEXT) print 'Automated readability index', textstat.automated_readability_index(TEXT) print 'Dale Chall readability score', textstat.dale_chall_readability_score(TEXT) print 'Difficult words', textstat.difficult_words(TEXT) print 'Linsear write formula', textstat.linsear_write_formula(TEXT) print 'Gunning fog', textstat.gunning_fog(TEXT) print 'Text standard', textstat.text_standard(TEXT) print '\nWords' WORDS = get_words(TEXT) get_word_stats(WORDS) print '\nWords no Stop Words' WORDS_NO_STOP = [w for w in WORDS if w not in stop] get_word_stats(WORDS_NO_STOP) print '\nSentences' SENTENCES = get_sentences(TEXT) get_sentence_stats(SENTENCES) print WORD_SETS[FILE_OR_DIR] |= set(WORDS)
def test_text_standard(self): standard = textstat.text_standard(self.long_test) self.assertEqual("9th and 10th grade", standard)
def updateData(self):
    """Build and return ``self.data``: an 11-entry list of metric triples.

    Reads ``self.text`` (the document), ``self.splList`` (a pre-split word
    list — assumed; TODO confirm against the constructor) and ``self.sid``
    (a sentiment analyzer exposing ``polarity_scores``).  Indexes 2-10 each
    append ``[pure score, approximate grade, normalized (ratio) score]``.
    Appends to ``self.data`` without clearing it first, so calling twice
    duplicates entries.
    """
    # Full list of polarity scores
    self.polscore = self.sid.polarity_scores(self.text)

    ##### INDEX 0 IN DATA: Text Sentiment #####
    # [INDEX 0] Compounded score (0.0 - 1.0) [INDEX 1] Negative connotation rating (0.0 - 1.0),
    # [INDEX 2] Positive connotation rating (0.0 - 1.0) [INDEX 3] Neutral connotation rating (0.0 - 1.0)
    self.data.append([
        self.polscore['compound'], self.polscore['neg'], self.polscore['pos'],
        self.polscore['neu']
    ])

    ##### INDEX 1 IN DATA: Sentence Info #####
    # [INDEX 0] Sentence count          [INDEX 1] Average sentence length
    # [INDEX 2] Syllable count          [INDEX 3] Overall word count
    # [INDEX 4] Character count         [INDEX 5] Character count without spaces
    # [INDEX 6] Avg letters per word    [INDEX 7] Avg syllables per word
    self.data.append([
        textstat.sentence_count(self.text),
        textstat.avg_sentence_length(self.text),
        textstat.syllable_count(self.text),
        len(self.splList),
        textstat.char_count(self.text, False),
        textstat.char_count(self.text, True),
        textstat.avg_letter_per_word(self.text),
        textstat.avg_syllables_per_word(self.text)
    ])

    ##### INDEX 2 IN DATA: Flesch Reading Ease #####
    # [INDEX 0] Pure score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 100
    self.freRaw = textstat.flesch_reading_ease(self.text)
    # Clamp into [0, 100]; the raw formula can stray outside the scale.
    self.freStat = min(max(self.freRaw, 0), 100)
    self.data.append([
        round(self.freStat, 3),
        self.freGrade(self.freStat),
        # Reading-ease is inverted (high = easy), so the ratio flips it.
        round(abs(self.freStat - 100), 2)
    ])

    ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
    # [INDEX 0] Pure score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
    self.fkgStat = self.adjustScore(self.fkgRaw)
    self.data.append([
        round(self.fkgStat, 3),
        self.grade(self.fkgStat),
        round(self.fkgStat / 0.18, 2)
    ])

    ##### INDEX 4 IN DATA: Gunning FOG Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.fogRaw = textstat.gunning_fog(self.text)
    self.fogStat = self.adjustScore(self.fogRaw)
    self.data.append([
        round(self.fogStat, 3),
        self.grade(self.fogStat),
        round(self.fogStat / 0.18, 2)
    ])

    ##### INDEX 5 IN DATA: SMOG Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.smogRaw = textstat.smog_index(self.text)
    self.smogStat = self.adjustScore(self.smogRaw)
    self.data.append([
        round(self.smogStat, 3),
        self.grade(self.smogStat),
        round(self.smogStat / 0.18, 2)
    ])

    ##### INDEX 6 IN DATA: Automated Readability Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 14
    self.ariRaw = textstat.automated_readability_index(self.text)
    self.ariStat = min(max(self.ariRaw, 0), 14)
    self.data.append([
        round(self.ariStat, 3),
        # ARI grades are bucketed by whole number, hence the ceil.
        self.ariGrade(ceil(self.ariStat)),
        round(self.ariStat / 0.14, 2)
    ])  #13

    ##### INDEX 7 IN DATA: Coleman-Liau Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.cliRaw = textstat.coleman_liau_index(self.text)
    self.cliStat = self.adjustScore(self.cliRaw)
    self.data.append([
        round(self.cliStat, 3),
        self.grade(self.cliStat),
        round(self.cliStat / 0.18, 2)
    ])

    ##### INDEX 8 IN DATA: Linsear Write Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.lwiRaw = textstat.linsear_write_formula(self.text)
    self.lwiStat = self.adjustScore(self.lwiRaw)
    self.data.append([
        round(self.lwiStat, 3),
        self.grade(self.lwiStat),
        round(self.lwiStat / 0.18, 2)
    ])

    ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 10
    self.dcrRaw = textstat.dale_chall_readability_score(self.text)
    self.dcrStat = min(max(self.dcrRaw, 0), 10)
    self.data.append([
        round(self.dcrStat, 3),
        self.daleChallGrade(self.dcrStat),
        round(self.dcrStat / 0.1, 2)
    ])

    ##### INDEX 10 IN DATA: Overall Score #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 20
    # text_standard is called twice: once for the numeric score (second
    # positional arg True -> float output) and once for the grade string.
    self.txtRaw = textstat.text_standard(self.text, True)
    self.txtStd = min(max(self.txtRaw, 0), 20)
    self.txtInfo = textstat.text_standard(self.text)
    self.data.append([
        round(self.txtStd, 3),
        self.txtGrade(self.txtStd, self.txtInfo),
        round(self.txtStd / 0.2, 2)
    ])
    return self.data
def reading_level_comp(string): try: level = textstat.text_standard(string) return level except: return "Unclear"