def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using several
    readability scores, and also calculates other factors such as the ratio
    of typos to words.
    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the readability
            scores and the typo and difficult-word ratios
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading": textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid": textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau": textstat.coleman_liau_index(article_contents),
        "typos_to_words": len(tool.check(article_contents))
        / textstat.lexicon_count(article_contents),
        "percent_difficult_words": textstat.difficult_words(article_contents)
        / textstat.lexicon_count(article_contents),
    }
    return language_analysis_dict
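# A minimal usage sketch for get_text_features (not part of the original
# snippet): it assumes `textstat` and `language_check` are installed and
# imported, and the sample article text below is invented purely for
# illustration.
sample_article = (
    "The city council approved the new transit budget on Tuesday. "
    "Officials say the plan will expand bus service to three districts."
)
for name, value in get_text_features(sample_article).items():
    print("{}: {:.3f}".format(name, value))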
def _get_base_textstats(no_code_text): """ Find basic text statistics :param no_code_text: Text we are analyzing :return: list: List of results """ results = [] group_by = 'Basic Text Statistics' num_chars = len(no_code_text) num_lower = sum(1 for c in no_code_text if c.islower()) num_upper = sum(1 for c in no_code_text if c.isupper()) num_letters = sum(1 for c in no_code_text if c.isalpha()) num_numbers = sum(1 for c in no_code_text if c.isdigit()) num_alphanum = sum(1 for c in no_code_text if c.isalnum()) num_otherchars = num_chars - num_alphanum results.append(TextFeature('Number of characters', num_chars, group_by)) results.append(TextFeature('Number of letters', num_letters, group_by)) results.append(TextFeature('Number of numbers', num_numbers, group_by)) results.append(TextFeature('Number of other characters', num_otherchars, group_by)) character_counts = Counter(no_code_text.lower()) for c in sorted(character_counts.items()): try: results.append(TextFeature('Character count for "{}"'.format(c[0].encode('unicode_escape')), c[1], group_by)) except AttributeError: results.append(TextFeature('Character count for "{}"'.format(c[0]), c[1], group_by)) results.append(TextFeature('Number of syllables', textstat.syllable_count(no_code_text), group_by)) results.append(TextFeature('Lexicon Count (without punctuation)', textstat.lexicon_count(no_code_text, True), group_by)) results.append(TextFeature('Lexicon Count (with punctuation)', textstat.lexicon_count(no_code_text, False), group_by)) results.append(TextFeature('Number of lower case characters', num_lower, group_by)) results.append(TextFeature('Number of upper case characters', num_upper, group_by)) return results
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            # difficult_words matches the key name (the original called
            # dale_chall_readability_score here)
            song["num_difficult_words"] = textstat.difficult_words(song["text_raw"])
        except Exception as e:
            logging.error(
                "Something bad happened in the current song! Skipping it...\n{}".format(song))
            logging.exception(e)
    return lyrics
def main(): csv_file2 = open(sys.argv[2], 'w', encoding="utf8") writer = csv.writer(csv_file2, delimiter=',') doc_id = 1 writer.writerow(["ID", "URL", "text", "impact-score", "readability", "grade-level", "smog-index", "total-words", "total-sentences"]) with open(sys.argv[1], 'r', encoding="utf8", errors='ignore') as csv_file1: reader = csv.reader(csv_file1) # Skip the first line with headers next(reader) for row in reader: impact = str(row[0]) url = str(row[1]) text = str(row[2]) read_ease = textstat.flesch_reading_ease(text) grade = textstat.flesch_kincaid_grade(text) smog = textstat.smog_index(text) words = textstat.lexicon_count(text) sentences = textstat.sentence_count(text) # Uncomment this if we want summary and key words # summary = summarize(text, ratio=0.3) # key_words = keywords(text, ratio=0.3) writer.writerow([doc_id]+[url]+[text]+[impact]+[read_ease]+[grade]+[smog]+[words]+[sentences]) doc_id = doc_id+1 csv_file1.close() csv_file2.close() print('Summary statistics complete!')
def _calculate_scores(self, docs): docs_scores = [] for doc in docs: scores = {} scores['chars'] = ts.char_count(doc) scores['words'] = ts.lexicon_count(doc) scores['sents'] = ts.sentence_count(doc) #scores['syllables'] = ts.syllable_count(doc) scores['avg_sent_length'] = ts.avg_sentence_length(doc) scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc) scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc) scores['flesch'] = ts.flesch_reading_ease(doc) #scores['smog'] = ts.smog_index(doc) #scores['coleman_liau'] = ts.coleman_liau_index(doc) scores['automated_readability'] = ts.automated_readability_index( doc) #scores['linsear'] = ts.linsear_write_formula(doc) #scores['difficult_words'] = ts.difficult_words(doc) scores['dale_chall'] = ts.dale_chall_readability_score(doc) #scores['gunning_fog'] = ts.gunning_fog(doc) scores['lix'] = ts.lix(doc) docs_scores.append(scores) return docs_scores
def main():
    """ Evaluate and print readability scores """
    if len(sys.argv) > 1:
        inf = open(sys.argv[1], 'r')
    else:
        sys.stderr.write('Error: specify input file.\n')
        sys.exit()
    text = inf.read()
    inf.close()

    lexcount = textstat.lexicon_count(text)
    sys.stdout.write('Lexicon count: {0:d}\n'.format(lexcount))

    # reading time in minutes
    # assumes 180 WPM plus some offset
    tread = (lexcount + 250) / 180.
    sys.stdout.write('Estimated reading time: {0:1.1f} minutes.\n'.format(tread))

    ease = textstat.flesch_reading_ease(text)
    grade = textstat.flesch_kincaid_grade(text)
    sys.stdout.write('Flesch reading ease score: {0:1.1f}\n'.format(ease))
    sys.stdout.write('Flesch-Kincaid grade: {0:1.1f}\n'.format(grade))
def split_pages(text, page_words=WORDS_PAGE):
    paragraphs = text.split("\n\n")
    pages = []
    working = ''
    for para in paragraphs:
        # keep the paragraph break so adjacent paragraphs are not glued
        # together into a single word
        working = working + para + "\n\n"
        if ts.lexicon_count(working) >= page_words:
            pages.append(working)
            working = ''
    if not ts.lexicon_count(working) == 0:
        pages.append(working)
    return pages
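# Illustrative call to split_pages, assuming `ts` is the imported textstat
# module; the sample text and the 5-word page size are invented for the example.
sample_chapter = "First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here."
pages = split_pages(sample_chapter, page_words=5)
print(len(pages), [ts.lexicon_count(p) for p in pages])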
def do_text_stats(self, text):
    ### Syllable Count
    syllable_count = textstat.syllable_count(text)
    ### Lexicon Count
    lexicon_count = textstat.lexicon_count(text, True)
    ### Sentence Count
    sentence_count = textstat.sentence_count(text)
    ### The Flesch Reading Ease formula
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(text)
    except TypeError as e:
        flesch_reading_ease = None
    #* 90-100 : Very Easy
    #* 80-89 : Easy
    #* 70-79 : Fairly Easy
    #* 60-69 : Standard
    #* 50-59 : Fairly Difficult
    #* 30-49 : Difficult
    #* 0-29 : Very Confusing
    ### The Flesch-Kincaid Grade Level
    try:
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    except TypeError as e:
        flesch_kincaid_grade = None
    ### The Fog Scale (Gunning FOG Formula)
    gunning_fog = textstat.gunning_fog(text)
    ### The SMOG Index
    smog_index = textstat.smog_index(text)
    ### Automated Readability Index
    automated_readability_index = textstat.automated_readability_index(text)
    ### The Coleman-Liau Index
    try:
        coleman_liau_index = textstat.coleman_liau_index(text)
    except TypeError as e:
        coleman_liau_index = None
    ### Linsear Write Formula
    linsear_write_formula = textstat.linsear_write_formula(text)
    ### Dale-Chall Readability Score
    dale_chall_readability_score = textstat.dale_chall_readability_score(text)
    ### Readability Consensus based upon all the above tests
    try:
        text_standard = textstat.text_standard(text)
    except TypeError as e:
        text_standard = None
    return {
        "syllable_count": syllable_count,
        "lexicon_count": lexicon_count,
        "sentence_count": sentence_count,
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "gunning_fog": gunning_fog,
        "smog_index": smog_index,
        "automated_readability_index": automated_readability_index,
        "coleman_liau_index": coleman_liau_index,
        "linsear_write_formula": linsear_write_formula,
        "dale_chall_readability_score": dale_chall_readability_score,
        "text_standard": text_standard
    }
def get_special_metrics(text): blob = TextBlob(text) main = { 'statistics': { 'syllables': textstat.syllable_count(text), 'words': textstat.lexicon_count(text), 'characters': textstat.char_count(text), 'polysyllables': textstat.polysyllabcount(text), 'average letter per word': textstat.avg_letter_per_word(text), 'average sentence length': textstat.avg_sentence_length(text), 'average sentence per word': textstat.avg_sentence_per_word(text), 'sentences': textstat.sentence_count(text) }, 'difficulty': { 'flesch reading ease': textstat.flesch_reading_ease(text), 'smog index': textstat.smog_index(text), 'flesch kincaid grade': textstat.flesch_kincaid_grade(text), 'coleman liau index': textstat.coleman_liau_index(text), #'automated readability index': textstat.automated_readability_index(text), #'dale chall readability score': textstat.dale_chall_readability_score(text), #'difficult words': textstat.difficult_words(text), #'linsear write formula': textstat.linsear_write_formula(text), 'gunning fog': textstat.gunning_fog(text) }, 'sentiments': { 'polarity': blob.sentiment.polarity, 'subjectivity': blob.sentiment.subjectivity } } return main
def text_analytics(text): if textstat.sentence_count(text) != 0: lexicon = textstat.lexicon_count(text) #word count sent = textstat.sentence_count(text) #sentence count syll = textstat.syllable_count(text) #syllable count flesch = textstat.flesch_reading_ease(text) #flesch score smog = textstat.smog_index(text) #SMOG index fog = textstat.gunning_fog(text) #FOG index dale = textstat.dale_chall_readability_score(text) #grade level ari = textstat.automated_readability_index(text) #grade level cl = textstat.coleman_liau_index(text) #grade level flesch1 = lexicon*flesch flesch2 = sent*flesch flesch3 = syll*flesch smog1 = lexicon*smog smog2 = sent*smog smog3 = syll*smog fog1 = lexicon*fog fog2 = sent*fog fog3 = syll*fog dale1 = lexicon*dale dale2 = sent*dale dale3=syll*dale ari1 = lexicon*ari ari2 = sent*ari ari3 = syll*ari cl1 = lexicon*cl cl2 = sent*cl cl3 = syll*cl x=[lexicon,sent,syll,flesch,smog,fog,dale,ari,cl,flesch1,flesch2,flesch3,smog1, smog2,smog3,fog1,fog2,fog3,dale1,dale2,dale3,ari1,ari2,ari3,cl1,cl2,cl3] return(x)
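# Quick sketch of the feature vector text_analytics produces (assumes textstat
# is imported as above): 9 base readability measures plus 18 interaction terms.
features = text_analytics("Readable text is easy to parse. Short sentences help a lot.")
print(len(features))  # 27 features whenever the text contains at least one sentence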
def analyse_plain_text(test_data): text_stats = TextStats() # Do some simple analysis. from textblob import TextBlob zen = TextBlob(test_data) text_stats.word_count = len(zen.words) text_stats.sentence_count = len(zen.sentences) text_stats.polarity = zen.sentiment.polarity text_stats.subjectivity = zen.sentiment.subjectivity # Easy to read, this? from textstat.textstat import textstat text_stats.flesch_reading_ease = textstat.flesch_reading_ease(test_data) # Words per sentence count. from textstat.textstat import textstat text_stats.word_per_sentence_count = ( textstat.lexicon_count(test_data, False) / textstat.sentence_count(test_data)) # Convert all to lower. test_data = test_data.lower() # Tokenise. from nltk.tokenize import word_tokenize words = word_tokenize(test_data) # Tokenise stemmed text. from nltk.stem import PorterStemmer ps = PorterStemmer() test_data_stemmed = '' for w in words: test_data_stemmed = test_data_stemmed + ' ' + ps.stem(w) stemmed_words = word_tokenize(test_data_stemmed) # Remove non-words. nonPunct = re.compile('.*[A-Za-z0-9].*') # must contain a letter or digit filtered = [w for w in stemmed_words if nonPunct.match(w)] # Remove stopwords: from nltk.corpus import stopwords stopwords = set(stopwords.words('english')) extra_stopwords = set([ 'that', '\'s', 'wa', 'thi', 'like', 'n\'t', 'would', 'ha', 'us', 'get' ]) filtered = [ w for w in filtered if w not in stopwords and w not in extra_stopwords ] # How many unique words? from collections import Counter counts = Counter(filtered) text_stats.unique_word_count = len(counts) # Words sorted by most common. text_stats.counts = counts return text_stats
def composition(text, file):
    char_count = textstat.char_count(text)
    syll_count = textstat.syllable_count(text)
    lex_count = textstat.lexicon_count(text)
    sent_count = textstat.sentence_count(text)
    file.write(
        '\nChar count : %d\nSyllable count : %d \nLexicon count : %d \nSentence count : %d'
        % (char_count, syll_count, lex_count, sent_count))
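# Usage sketch for composition (assumes textstat is imported); the report path
# and sample text are illustrative only.
with open("composition_stats.txt", "w") as report:
    composition("One sentence here. And another one follows.", report)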
def get_statistics(self, f, content):
    content = content.lower()
    reading_level = textstat.flesch_kincaid_grade(content)
    word_count = textstat.lexicon_count(content)
    # wrap in list() so the concatenation below works under Python 3
    keyword_frequency = list(map(lambda x: x[1], self.get_keyword_frequency(content)))
    sentiment = DocumentStatistics.get_sentiment(content)
    return [f, reading_level, word_count] + keyword_frequency + sentiment
def textstat_analysis(profile_text): fre = textstat.flesch_reading_ease(profile_text) smog = textstat.smog_index(profile_text) fkg = textstat.flesch_kincaid_grade(profile_text) coleman = textstat.coleman_liau_index(profile_text) ari = textstat.automated_readability_index(profile_text) dale = textstat.dale_chall_readability_score(profile_text) dw = textstat.difficult_words(profile_text) lwf = textstat.linsear_write_formula(profile_text) gf = textstat.gunning_fog(profile_text) rc = textstat.readability_consensus(profile_text) word_count = textstat.lexicon_count(profile_text) return (fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count)
def get_stats(sentence): syllables = textstat.syllable_count(sentence) words = textstat.lexicon_count(sentence, True) sentence_count = textstat.sentence_count(sentence) if sentence_count > 0: text_standard = textstat.text_standard(sentence) else: text_standard = EMPTY_TEXT_STANDARD text_standard = fix_grammar_errors(text_standard) return combine(syllables, words, sentence_count, text_standard)
def __load_text(self): tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r', encoding = 'utf8', errors = 'ignore') as f: data = f.read() self.flesch_reading_ease = textstat.flesch_reading_ease(data) self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data) sentences = tokenizer.tokenize(data) self.n_sentences = textstat.sentence_count(data) self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ') if w not in stopwords.words('english')]) print 'Parse ', len(sentences), ' sentences, average sentence length ', self.avg_sentence_length, ', average word length ', self.avg_word_length self.sentences = sentences self.tokens = [] [self.tokens.extend(text_tokenize(sentence)) for sentence in sentences]
def main() : for arg in sys.argv[1:]: with open(arg) as f: text = f.read() with open(arg + '.readability.snip','w') as f: f.write ("syllable_count : %s\n" % textstat.syllable_count(text)) f.write ("lexicon_count : %s\n" % textstat.lexicon_count(text)) f.write ("sentence_count : %s\n" % textstat.sentence_count(text)) f.write ("difficult_words : %s\n" % textstat.difficult_words(text)) f.write ("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text)) f.write ("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text)) f.write ("smog_index : %s\n" % textstat.smog_index(text)) f.write ("automated_readability_index : %s\n" % textstat.automated_readability_index(text)) f.write ("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text)) f.write ("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text)) f.write ("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
def analyse_json(json_text): # consider moving this to be a feature of Transcript in the other module df_witnesses = pd.DataFrame(columns=['html_file_location', 'witness_name', 'syllable_count','lexicon_count', 'sentence_count', 'syllables_per_word', 'gunning_fog', 'smog_index', 'text_standard'], index=[]) trscrpt = json.loads(json_text) if 'witnesses' in trscrpt: witnesses = trscrpt['witnesses'] for s in trscrpt['all_sections']: if 'speaker' in s and 'person' in s['speaker'] and \ s['speaker']['person']['speaker_type']=='witness': witness = witnesses[s['speaker']['person']['name']] witness.setdefault('all_text', []).append(s['spoken_text']) for i, p in enumerate(witnesses): if 'all_text' in witnesses[p]: witness_text = '\n\n'.join(witnesses[p]['all_text']) if len(witness_text) > 0: stats_data = {'html_file_location': trscrpt['html_file_location'], 'witness_name': p, 'syllable_count': textstat.syllable_count(witness_text), 'lexicon_count': textstat.lexicon_count(witness_text), 'sentence_count': textstat.sentence_count(witness_text), 'syllables_per_word': textstat.avg_syllables_per_word(witness_text), 'gunning_fog': textstat.gunning_fog(witness_text), 'smog_index': textstat.smog_index(witness_text), 'text_standard': textstat.text_standard(witness_text)} df_witnesses.loc['witness_%i' % i] = stats_data else: df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location'] df_witnesses.loc['witness_%i' % i, 'witness_name'] = p else: df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location'] df_witnesses.loc['witness_%i' % i, 'witness_name'] = p return df_witnesses
def count_partsofspeech(article_contents: str) -> dict:
    """
    Returns the parts-of-speech breakdown of a given string, with each tag's
    count expressed as a fraction of the article's word count.
    @param article_contents, a string containing a news article
    @return pos_dict, which contains the parts of speech breakdown of an article
    """
    pos_dict = {}
    text = nltk.word_tokenize(article_contents)
    for word in nltk.pos_tag(text):
        if word[1] in pos_dict:
            pos_dict[word[1]] += 1
        else:
            pos_dict[word[1]] = 1
    for item in pos_dict:
        pos_dict[item] = pos_dict[item] / textstat.lexicon_count(article_contents)
    return pos_dict
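# Illustrative call to count_partsofspeech; assumes the nltk 'punkt' tokenizer
# and 'averaged_perceptron_tagger' data have already been downloaded, e.g. via
# nltk.download('punkt') and nltk.download('averaged_perceptron_tagger').
print(count_partsofspeech("The quick brown fox jumps over the lazy dog."))
# prints a dict mapping POS tags to their share of the word count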
def analyseText(): values = request.get_json() required = [ 'inputText' ] if not all(k in values for k in required): return 'Missing values', 400 text = values['inputText'] result = { 'syllable_count': textstat.syllable_count(text), 'lexicon_count': textstat.lexicon_count(text), 'sentence_count': textstat.sentence_count(text), 'flesch_reading_ease': textstat.flesch_reading_ease(text), 'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text), 'gunning_fog': textstat.gunning_fog(text), 'smog_index': textstat.smog_index(text), 'automated_readability_index': textstat.automated_readability_index(text), 'coleman_liau_index': textstat.coleman_liau_index(text), 'linsear_write_formula': textstat.linsear_write_formula(text), 'dale_chall_readability_score': textstat.dale_chall_readability_score(text) }; return jsonify(result), 200
def stats(self, text): test_data = text stats = {} stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data) stats['smog'] = textstat.smog_index(test_data) stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data) stats['coleman Liau'] = textstat.coleman_liau_index(test_data) stats['automated'] = textstat.automated_readability_index(test_data) stats['dale chall'] = textstat.dale_chall_readability_score(test_data) stats['difficult'] = textstat.difficult_words(test_data) stats['linsear'] = textstat.linsear_write_formula(test_data) stats['gunning_fog'] = textstat.gunning_fog(test_data) stats['standard'] = textstat.text_standard(test_data) stats['charcount'] = textstat.char_count(test_data) stats['lexicon count'] = textstat.lexicon_count(test_data) stats['syllable count'] = textstat.syllable_count(test_data) stats['sentence count'] = textstat.sentence_count(test_data) stats['avg sentence length'] = textstat.avg_sentence_length(test_data) stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word( test_data) stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data) stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word( test_data) return stats
def parse_HtmlResponse(self, response): item = SeedItem() item['url'] = response.url item['response_code'] = response.status item['response_type'] = 'HTML' soup = BeautifulSoup(response.body, "lxml") if soup.html.has_attr('lang'): lang = soup.html['lang'] item['declared_language'] = lang stripped_text = soup.get_text() item['num_words'] = str(textstat.lexicon_count(stripped_text)) item['fk_grade'] = textstat.flesch_kincaid_grade(stripped_text) Detector(stripped_text, True) language = Detector(stripped_text) if language.reliable: lang = language.language.code #print "detected language: " + language.language.code + " confidence:" + str(language.language.confidence) item['detected_language'] = lang num_links = 0 for href in response.css('a::attr(href)').extract(): url = response.urljoin(href) if self.language_train.is_url_predicted_in_accepted_lang(url): #print "link is predicted to be acceptable, keeping: " + url yield scrapy.Request(url=response.urljoin(href), callback=self.parse) else: print "Skipping url because of predicted language:" + url num_links += 1 item['num_links'] = num_links yield item
def text_proc(corpus, urlDat={}, WORD_LIM=30, verbose=False): if type(corpus) is type(str()) and corpus not in str( "Redirecting"): # and not str("privacy policy") in corpus: if str("some error has occurred while processing your request" ) in corpus: return {} if str("We apologize for the inconvenience...") in corpus: return {} # if np.mean([len(w) for w in corpus]) > 35: # return {} corpus = corpus.replace("/", " ") # remove characters that nltk can't read corpus = corpus.lower() corpus = corpus.replace(u"\xa0", u" ") corpus = corpus.replace(u"\\", u" ") corpus, this_is_science = extract_science_block(corpus) if "semantic" in urlDat.keys(): if urlDat["semantic"]: this_is_science = True urlDat["science"] = this_is_science # print(corpus) # print(this_is_science, "this_is_science") urlDat["big_words"] = [word for word in corpus if len(word) > 16] ignoreSingleSentences = 1 corpus = cleanup_pretagger_all(corpus) if verbose: st.text("pretagger all") st.text(type(corpus)) tokens = word_tokenize(corpus) if verbose: st.text("token input") st.text(tokens) tokens = [t for t in tokens if t not in not_want_list] # if np.mean([len(t) for t in tokens]) > 50: # return {} # tokens = [t for t in tokens if len(t) < 50] # if verbose: # st.text("token input") # st.text(tokens) wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl( tokens, ignoreSingleSentences=1) if len(tokens) < WORD_LIM: return {} if len(tokens) >= WORD_LIM: remainingText = " ".join(remainingText) remainingText = remainingText.lower() urlDat["standard"] = textstat.text_standard(remainingText, float_output=True) # st.markdown(urlDat["standard"]) if wc > 0 and sc > 0: if "semantic" in urlDat.keys() or urlDat["standard"] > 95: # else: # urlDat["hard_snippet"] = None urlDat["fre_unbiased"] = freeAlongtheText(corpus, chunk_length=512) fre = FRE(wc, sc, sylCount) if "semantic" in urlDat.keys(): if urlDat["semantic"]: ndc = NDC( remainingText, wc, sc ) # calc NDC Index and Perctage Diff Words #calc NDC index # if not "fre_unbiased" in urlDat.keys() and urlDat["standard"]>100: meanv, total, hard_snippet = complexityAlongtheText( corpus, chunk_length=256) urlDat["standard_unbiased"] = meanv # urlDat["standard"] = total # if this_is_science: if "semantic" in urlDat.keys(): urlDat["hard_snippet"] = hard_snippet # urlDat["fre"] = fre # textstat.text_standard(corpus, float_output=True) # urlDat["standard"] = ndc[0] # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python # print(urlDat["standard"]) """ if "fre_unbiased" in urlDat.keys(): if ( urlDat["fre_unbiased"] < urlDat["standard"] and urlDat["fre_unbiased"] > 0 ): urlDat["standard"] = urlDat["fre_unbiased"] if urlDat["standard"] == 0 and urlDat["fre_unbiased"] > 0: urlDat["standard"] = urlDat["fre_unbiased"] """ # if ( # urlDat["standard_unbiased"] < urlDat["standard"] # and urlDat["standard_unbiased"] > 0 # ): # urlDat["standard"] = urlDat["standard_unbiased"] # if fre<urlDat["standard"] and fre>0: # urlDat["standard"] = fre # if urlDat["standard"] > 60 and ndc[0]>0 and ndc[0]<60: # urlDat["standard"] = ndc[0] # urlDat["concensus"] = np.mean( # [ # np.mean(fre), # np.mean(urlDat["standard_unbiased"]), # ] # ) tokens = [w.lower() for w in tokens if w.isalpha()] tokens = [w.lower() for w in tokens] # make everything lower case urlDat["wcount"] = textstat.lexicon_count(str(tokens)) word_lim = bool(urlDat["wcount"] > WORD_LIM) # print(urlDat["tokens"]) if len(tokens): if "semantic" in urlDat.keys(): urlDat["tokens"] = tokens lexicon = 
textstat.lexicon_count(corpus, True) urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens)) urlDat["unique_words"] = len(set(tokens)) # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a', will dominate. # big deltas mean redudancy/sparse information/information/density testimonial = TextBlob(corpus) urlDat["sp"] = testimonial.sentiment.polarity urlDat["ss"] = testimonial.sentiment.subjectivity urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity) urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity) urlDat["gf"] = textstat.gunning_fog(corpus) if "standard" in urlDat.keys(): if urlDat["standard"] == 0: if verbose: st.text("gets here") # return {} return urlDat
#main script if __name__ == '__main__': print "TextStat Comparison Script" print "--------------------------" #read in text from the command line #This needs to be fixed to deal/escape special characters textToCheck = raw_input("Please enter the text you would like to analyse: ") #read in text from a file- but what format? print "\n\n" print "Results" print "==============================================" print "==============================================\n" print "Syllable Count: " + str(textstat.syllable_count(textToCheck)) print "Lexicon Count: " + str(textstat.lexicon_count(textToCheck)) #TRUE is default and removes punctuation before counting print "Sentence Count: " + str(textstat.sentence_count(textToCheck)) print "Flesch Reading Ease formula: " + str(textstat.flesch_reading_ease(textToCheck)) print "Flesch-Kincaid Grade Level: " + str(textstat.flesch_kincaid_grade(textToCheck)) print "Fog Scale (Gunning FOG Formula): " + str(textstat.gunning_fog(textToCheck)) print "SMOG Index: " + str(textstat.smog_index(textToCheck)) print "Automated Readability Index: " + str(textstat.automated_readability_index(textToCheck)) print "Coleman-Liau Index: " + str(textstat.coleman_liau_index(textToCheck)) print "Linsear Write Formula: " + str(textstat.linsear_write_formula(textToCheck)) print "Dale-Chall Readability Score: " + str(textstat.dale_chall_readability_score(textToCheck)) print "--------------------------------------------------------------" print "Readability Consensus based upon all the above tests: " + str(textstat.text_standard(textToCheck)) print "\n\n"
with open(data_file, "r", encoding='UTF-8') as file: for test_data in file: test_data = test_data.replace("\n", "") print(test_data) print( "-------------------------Text Statistic-----------------------------------" ) print("Returns the number of syllables present in the given text.") # print(textstat.syllable_count(test_data, lang='en_US')) num_syllables = textstat.syllable_count(test_data, lang='en_US') print(num_syllables) print( "Calculates the number of words present in the text - punctuation removed" ) # print(textstat.lexicon_count(test_data, removepunct=True)) num_words = textstat.lexicon_count(test_data, removepunct=True) print(num_words) print("Returns the number of sentences present in the given text.") # print(textstat.sentence_count(test_data)) num_sentences = textstat.sentence_count(test_data) print(num_sentences) print("difficult words") # print(textstat.difficult_words(test_data)) num_difficult_words = textstat.difficult_words(test_data) print(num_difficult_words) print( "-------------------------Difficulty------------------------------" ) print("The Flesch Reading Ease Score") # print(textstat.flesch_reading_ease(test_data))
def getReadTimeNewUser(content): avg_murica = 200 #wpm return textstat.lexicon_count(content) / (avg_murica / 60)
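# Sanity-check sketch for getReadTimeNewUser (assumes textstat is imported and
# Python 3 division): at the assumed 200 words per minute, a 400-word text
# should come out to roughly 120 seconds.
print(getReadTimeNewUser("word " * 400))  # ~120.0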
# translation table that strips digits; used by the translate() call below
remove_digits = str.maketrans('', '', digits)
df['text'] = df['text'].apply(lambda x: x.translate(remove_digits))

# Print pro/con text of one review.
#df.loc[6,'pros']
#df.loc[6,'cons']
#df.loc[7,'text']

# Create features for grade level, reading ease, word counts, sentence count, and paragraphs.
# Source: https://pypi.python.org/pypi/textstat/
# Note: \r = paragraph break. \n = white space.
df['read_ease_grade'] = df['text'].apply(
    lambda x: textstat.flesch_kincaid_grade(x))
df['sentence_count'] = df['text'].apply(lambda x: textstat.sentence_count(x))
df['word_count'] = df['text'].apply(lambda x: textstat.lexicon_count(x))
df['word_count_squared'] = (df['word_count'])**2
df['paragraph'] = df['pros'].apply(lambda x: x.count('\r')) + df['cons'].apply(
    lambda x: x.count('\r'))
df['text_ratio'] = (df['textLengthPro'] - df['textLengthCon']) / (
    (df['textLengthPro'] + df['textLengthCon']))

################################################################
#### Stop words, tokenize, stemming.
################################################################

#### Tokenize text.
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['text'].apply(lambda x: tokenizer.tokenize(x))

#### Stem text.
# comma_count try: comma_count = count_comma(AB) except: warning_message = 1 # num_syllables try: num_syllables = textstat.syllable_count(AB) except: warning_message = 1 # word_count try: word_count = textstat.lexicon_count(AB) except: warning_message = 1 # avg_word_len try: avg_word_len = avg_word_length(AB) except: warning_message = 1 # flesch_score try: flesch_score = textstat.flesch_reading_ease(AB) except: warning_message = 1
# Build Dataset try: cur = { "title": title, "artist": artist, "year": year, "pos": pos, "lyrics": lyrics, "tags": get_tags(artist), "sentiment": sent_analyzer.polarity_scores(lyrics_repl), "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl), "flesch_index": ts.flesch_reading_ease(lyrics_repl), "fog_index": ts.gunning_fog(lyrics_repl), "difficult_words": ts.difficult_words(lyrics_repl), "num_syllables": ts.syllable_count(lyrics_repl), "num_words": ts.lexicon_count(lyrics_repl, True), "num_lines": ts.sentence_count(lyrics_repl), "num_dupes": count_dupes(lyrics) } # print cur dataset.append(cur) except Exception, e: print e except Exception, e: print "Exception occurred for " + artist + ' - ' + title print e outfile = "years/" + str(year) + '.txt' dir = os.path.dirname(outfile) if not os.path.exists(dir):
def get_word_count(string): return textstat.lexicon_count(string, False)
from textstat.textstat import textstat import pandas as pd key = 'adbbd909ff7241929e6a6c6a5e938f3f' archive = ArchiveAPI(key) data = [] for year in range(1950, 2016): for month in range(1, 13): contents = archive.query(year, month) date = str(year) + '-' + str(month) print date headlines = [] total = 0.0 count = 0.0 length = 0.0 for articles in contents['response']['docs']: #print articles count = count + 1 length = length + textstat.lexicon_count(str(articles['headline'])) total = total + textstat.flesch_reading_ease( str(articles['headline'])) data.append((date, total / count, length / count)) print count print data labels = ['date', 'flesch_reading_ease', 'average_length'] df = pd.DataFrame.from_records(data, columns=labels) df.to_csv('headlines.csv') print df #print data['1950-12'] #if articles['news_desk'] == 'National Desk'or articles['news_desk'] == None:
if __name__ == '__main__':
    # prompt user for file and open it
    user_input = input("Enter file name to open: ")
    input_string = open(user_input).read()
    user_input = input("Enter file name to write to: ")

    # declare/initialize lists
    copy_string = input_string.split()
    words_with_synonyms = []
    the_synonyms = []

    # get number of syllables, words, sentences, and the Flesch reading ease
    # score for the file
    num_syllables = textstat.syllable_count(input_string)
    num_words = textstat.lexicon_count(input_string)
    num_sentences = textstat.sentence_count(input_string)
    # Flesch reading ease: 206.835 - 1.015 * (words / sentences)
    #                              - 84.6 * (syllables / words)
    fk_score = 206.835 - float(1.015 * (num_words / num_sentences)) - float(
        84.6 * (num_syllables / num_words))

    # print number of syllables, words, and sentences in the file
    print("\nNumber of syllables: ", num_syllables)
    print("Number of words: ", num_words)
    print("Number of sentences: ", num_sentences)

    output = synonym_replacement(input_string, copy_string)
    #output = remove_adjective(output, copy_string)
    initial_grade = check_reading_level(input_string)
    new_grade = check_reading_level(output)
    new_num_syllables = textstat.syllable_count(output)
#!/bin/python import sys, string, os from textstat.textstat import textstat inputfile = '' test_data = "" script_name = sys.argv[0] inputfile = sys.argv[1] with open(inputfile) as myfile: test_data="".join(line.rstrip() for line in myfile) var1 = str(textstat.flesch_reading_ease(test_data)) var2 = str(textstat.smog_index(test_data)) var3 = str(textstat.flesch_kincaid_grade(test_data)) var4 = str(textstat.coleman_liau_index(test_data)) var5 = str(textstat.automated_readability_index(test_data)) var6 = str(textstat.dale_chall_readability_score(test_data)) var7 = str(textstat.difficult_words(test_data)) var8 = str(textstat.linsear_write_formula(test_data)) var9 = str(textstat.gunning_fog(test_data)) var10 = str(textstat.readability_consensus(test_data)) var11 = str(textstat.syllable_count(test_data)) var12 = str(textstat.lexicon_count(test_data, 1)) var13 = str(textstat.sentence_count(test_data)) print(var1 + ',' + var2 + ',' + var3 + ',' + var4 + ',' + var5 + ',' + var6 + ',' + var7 + ',' + var8 + ',' + var9 + ',' + var10 + ',' + var11 + ',' + var12 + ',' + var13)
def calculate_number_of_lexicons(review): if len(review) > 0: return math.sqrt(math.sqrt(textstat.lexicon_count(review))) else: return 0
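# The nested sqrt is the fourth root of the word count, a strong dampening of
# review length; a quick illustration (assumes `math` and textstat are imported):
print(calculate_number_of_lexicons("word " * 16))  # 16 ** 0.25 == 2.0
print(calculate_number_of_lexicons(""))            # empty reviews score 0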
response = opener.open(qqll) rt = response raw_html = response.read() g = goose.Goose() a = g.extract(raw_html=raw_html) htext = a.cleaned_text opinion = TextBlob(htext) pol = opinion.sentiment.polarity sub = opinion.sentiment.subjectivity rt = requests.get(qqll).elapsed.total_seconds() kw = str(keywords(htext, lemmatize=True)) kw = kw.replace('\r', ' ').replace('\n', ' ') keyw = ' '.join(kw.split()[:3]) sbody = htext.replace(',', '') fkg = textstat.flesch_kincaid_grade(htext) wc = textstat.lexicon_count(htext) sc = textstat.sentence_count(htext) fre = textstat.flesch_reading_ease(htext) sinsite = [ 'response time', 'subjective', 'polarity', 'fgrade', 'fscore', 'words.counts', 'sentence.count', 'keywords', 'title', 'link', 'text' ] wr.writerow(sinsite) insite = [rt, sub, pol, fkg, fre, wc, sc, keyw, a.title, qqll] wr.writerow(insite) rec = re.compile(r"https?://(www\.)?") zz = rec.sub('', qqll).strip().strip('/') with open('rowTwittersite.csv', 'w') as tsout: wr = csv.writer(tsout, quoting=csv.QUOTE_ALL)
def lexicon_count_diff(q1, q2): return textstat.lexicon_count(q1) - textstat.lexicon_count(q2)
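# Minimal example of lexicon_count_diff used as a pairwise feature (e.g. for
# question-pair similarity); the two strings are invented for illustration.
q1 = "How do I learn Python quickly?"
q2 = "What is Python?"
print(lexicon_count_diff(q1, q2))  # 6 - 3 = 3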
def __init__(self, path): """ Create document instance for analysis. Opens and reads document to string raw_text. Textract interprets the document format and opens to plain text string (docx, pdf, odt, txt) Args: path (str): path to file to open, anaylze, close Public attributes: -user: (str) optional string to set username. -path: (str) relative path to document. -abs_path: (str) the absolute path to the document. -file_name: (str) the file name with extension of document (base name). -mime: tbd -guessed_type: makes best guess of mimetype of document. -file_type: returns index[0] from guessed_type. -raw_text: (str) plain text extracted from .txt, .odt, .pdf, .docx, and .doc. -ptext: (str) raw text after a series of regex expressions to eliminate special characters. -text_no_feed: (str) ptext with most new line characters eliminated /n/n stays intact. -sentence_tokens: list of all sentences in a comma separated list derived by nltk. -sentence_count: (int) count of sentences found in list. -passive_sentences: list of passive sentences identified by the passive module. -passive_sentence_count: count of the passive_sentences list. -percent_passive: (float) ratio of passive sentences to all sentences in percent form. -be_verb_analysis: (int) sum number of occurrences of each to be verb (am, is, are, was, were, be, being been). -be_verb_count: tbd -be_verb_analysis: tbd -weak_sentences_all: (int) sum of be verb analysis. -weak_sentences_set: (set) set of all sentences identified as having to be verbs. -weak_sentences_count: (int) count of items in weak_sentences_set. -weak_verbs_to_sentences: (float) proportion of sentences with to be to all sentences in percent (this might not be sound). -word_tokens: list of discreet words in text that breaks contractions up (default nltk tokenizer). -word_tokens_no_punct: list of all words in text including contractions but otherwise no punctuation. -no_punct: (str) full text string without sentence punctuation. -word_tokens_no_punct: uses white-space tokenizer to create a list of all words. -readability_flesch_re: (int) Flesch Reading Ease Score (numeric score) made by textstat module. -readability_smog_index: (int) grade level as determined by the SMOG algorithum made by textstat module. -readability_flesch_kincaid_grade: (int) Flesch-Kincaid grade level of reader made by textstat module. -readability_coleman_liau_index: (int) grade level of reader as made by textstat module. -readability_ari: (int) grade leader of reader determined by automated readability index algorithum implemented by textstat. -readability_linser_write: FIX SPELLING grade level as determined by Linsear Write algorithum implemented by textstat. -readability_dale_chall: (int) grade level based on Dale-Chall readability as determined by textstat. -readability_standard: composite grade level based on readability algorithums. -flesch_re_key: list for interpreting Flesch RE Score. -word_count: word count of document based on white space tokener, this word count should be used. -page_length: (float) page length in decimal format given 250 words per page. -paper_count: (int) number of printed pages given 250 words per page. -parts_of_speech: words with parts of speech tags. -pos_counts: values in word, tag couple grouped in a list (Counter). -pos_total: (int) sum of pos_counts values -pos_freq: (dict) word, ratio of whole -doc_pages: (float) page length based on 250 words per page (warning, this is the second time this attribute is defined). 
-freq_words: word frequency count not standardized based on the correct word tokener (not ratio, just count). modal_dist: count of auxillary verbs based on word_tokens_no_punct. sentence_count (int): Count the sentence tokens passive_sentences (list): List of all sentences identified as passive passive_sentence_count (int): count of items in passive_sentences be_verb_count (int): count "to be" verbs in text word_tokens_no_punct (list): words separated, stripped of punctuation, made lower case flesch_re_key (str): reading ease score to description freq_words (list or dict): frequency distribution of all words modal_dist (list): frequency distribution of aux verbs """ self.user = "" self.path = path self.abs_path = os.path.abspath(self.path) if os.path.isfile(self.path): self.time_stamp = self.timestamp() self.file_name = os.path.basename(path) self.mime = MimeTypes() self.guessed_type = self.mime.guess_type(self.path) self.file_type = self.guessed_type[0] self.raw_text = textract.process(self.path, encoding="ascii") self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text) self.ptext = re.sub(u"\u2014", "--", self.ptext) self.ptext = re.sub(",", ",", self.ptext) self.ptext = re.sub("—", "--", self.ptext) self.ptext = re.sub("…", "...", self.ptext) self.text_no_feed = self.clean_new_lines(self.ptext) self.sentence_tokens = self.sentence_tokenize(self.text_no_feed) self.sentence_count = len(self.sentence_tokens) self.passive_sentences = passive(self.text_no_feed) self.passive_sentence_count = len(self.passive_sentences) self.percent_passive = (100 * (float(self.passive_sentence_count) / float(self.sentence_count))) self.percent_passive_round = round(self.percent_passive, 2) self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens) self.be_verb_count = self.be_verb_analysis[0] self.weak_sentences_all = self.be_verb_analysis[1] self.weak_sentences_set = set(self.weak_sentences_all) self.weak_sentences_count = len(self.weak_sentences_set) self.weak_verbs_to_sentences = 100 * float( self.weak_sentences_count) / float(self.sentence_count) self.weak_verbs_to_sentences_round = round( self.weak_verbs_to_sentences, 2) self.word_tokens = self.word_tokenize(self.text_no_feed) self.word_tokens_no_punct = \ self.word_tokenize_no_punct(self.text_no_feed) self.no_punct = self.strip_punctuation(self.text_no_feed) # use this! 
It make lower and strips symbols self.word_tokens_no_punct = self.ws_tokenize(self.no_punct) self.readability_flesch_re = \ textstat.flesch_reading_ease(self.text_no_feed) self.readability_smog_index = \ textstat.smog_index(self.text_no_feed) self.readability_flesch_kincaid_grade = \ textstat.flesch_kincaid_grade(self.text_no_feed) self.readability_coleman_liau_index = \ textstat.coleman_liau_index(self.text_no_feed) self.readability_ari = \ textstat.automated_readability_index(self.text_no_feed) self.readability_linser_write = \ textstat.linsear_write_formula(self.text_no_feed) self.readability_dale_chall = \ textstat.dale_chall_readability_score(self.text_no_feed) self.readability_standard = \ textstat.text_standard(self.text_no_feed) self.flesch_re_desc_str = self.flesch_re_desc( int(textstat.flesch_reading_ease(self.text_no_feed))) self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed) self.lexicon_count = textstat.lexicon_count(self.text_no_feed) self.avg_syllables_per_word = textstat.avg_syllables_per_word( self.text_no_feed) self.avg_sentence_per_word = textstat.avg_sentence_per_word( self.text_no_feed) self.avg_sentence_length = textstat.avg_sentence_length( self.text_no_feed) self.avg_letter_per_word = textstat.avg_letter_per_word( self.text_no_feed) self.difficult_words = textstat.difficult_words(self.text_no_feed) self.rand_passive = self.select_random(self.passive_sentence_count, self.passive_sentences) self.rand_weak_sentence = self.select_random( len(self.weak_sentences), self.weak_sentences) if self.word_tokens_no_punct: self.word_count = len(self.word_tokens_no_punct) self.page_length = float(self.word_count) / float(250) self.paper_count = int(math.ceil(self.page_length)) self.parts_of_speech = pos_tag(self.word_tokens_no_punct) self.pos_counts = Counter( tag for word, tag in self.parts_of_speech) self.pos_total = sum(self.pos_counts.values()) self.pos_freq = dict( (word, float(count) / self.pos_total) for word, count in self.pos_counts.items()) self.doc_pages = float(float(self.word_count) / float(250)) self.freq_words = \ self.word_frequency(self.word_tokens_no_punct) self.modal_dist = self.modal_count(self.word_tokens_no_punct) # self.ws_tokens = self.ws_tokenize(self.text_no_cr) self.pos_count_dict = self.pos_counts.items() # Model - use for any pos self.modals = self.pos_isolate('MD', self.pos_count_dict) self.preposition_count = self.pos_isolate('IN', self.pos_count_dict) self.adjective_count = self.pos_isolate_fuzzy( 'JJ', self.pos_count_dict) self.adverb_count = self.pos_isolate_fuzzy('RB', self.pos_count_dict) self.proper_nouns = self.pos_isolate_fuzzy('NNP', self.pos_count_dict) self.cc_count = self.pos_isolate('CC', self.pos_count_dict) self.commas = self.char_count(",") self.comma_sentences = self.list_sentences(",") self.comma_example = self.select_random(len(self.comma_sentences), self.comma_sentences) self.semicolons = self.char_count(";") self.semicolon_sentences = self.list_sentences(";") self.semicolon_example = self.select_random( len(self.semicolon_sentences), self.semicolon_sentences) self.lint_suggestions = lint(self.raw_text)