def text_analytics(text):
    if textstat.sentence_count(text) != 0:
        lexicon = textstat.lexicon_count(text)  # word count
        sent = textstat.sentence_count(text)  # sentence count
        syll = textstat.syllable_count(text)  # syllable count
        flesch = textstat.flesch_reading_ease(text)  # Flesch score
        smog = textstat.smog_index(text)  # SMOG index
        fog = textstat.gunning_fog(text)  # FOG index
        dale = textstat.dale_chall_readability_score(text)  # grade level
        ari = textstat.automated_readability_index(text)  # grade level
        cl = textstat.coleman_liau_index(text)  # grade level
        # pairwise products of each readability score with each raw count
        flesch1 = lexicon * flesch
        flesch2 = sent * flesch
        flesch3 = syll * flesch
        smog1 = lexicon * smog
        smog2 = sent * smog
        smog3 = syll * smog
        fog1 = lexicon * fog
        fog2 = sent * fog
        fog3 = syll * fog
        dale1 = lexicon * dale
        dale2 = sent * dale
        dale3 = syll * dale
        ari1 = lexicon * ari
        ari2 = sent * ari
        ari3 = syll * ari
        cl1 = lexicon * cl
        cl2 = sent * cl
        cl3 = syll * cl
        x = [lexicon, sent, syll, flesch, smog, fog, dale, ari, cl,
             flesch1, flesch2, flesch3, smog1, smog2, smog3,
             fog1, fog2, fog3, dale1, dale2, dale3,
             ari1, ari2, ari3, cl1, cl2, cl3]
        return x
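# Hedged usage sketch for text_analytics() above (assumption: the old-style
# `from textstat.textstat import textstat` import used elsewhere in these
# snippets). The function returns a 27-element feature vector: 9 base
# statistics plus the 18 products of each readability score with each raw
# count; it implicitly returns None for text with no sentences.
from textstat.textstat import textstat

sample = "The quick brown fox jumps over the lazy dog. It was a sunny day."
features = text_analytics(sample)
if features is not None:  # None when the text has no sentences
    print(len(features))  # 27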
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            # Note: despite the key name, this stores the Dale-Chall readability
            # score, not a count of difficult words (that would be
            # textstat.difficult_words).
            song["num_difficult_words"] = textstat.dale_chall_readability_score(song["text_raw"])
        except Exception as e:
            logging.error("Something bad happened in the current song! Skipping it...\n{}".format(song))
            logging.exception(e)
    return lyrics
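# Hedged usage sketch for calculate_statistics(): it expects a list of dicts
# carrying raw lyrics under "text_raw" (the key the function reads); the other
# keys shown here are illustrative, not part of the original schema.
songs = [{"title": "Example", "text_raw": "La la la. These are two short lines."}]
annotated = calculate_statistics(songs)
print(annotated[0]["flesch_score"], annotated[0]["num_sentences"])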
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(TextFeature('Number of sentences', textstat.sentence_count(no_code_text), group_by))
    results.append(TextFeature('Number of sentences (again)', len(tb.sentences), group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity, group_by))
    results.append(TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(TextFeature('Number of important phrases', len(tb.noun_phrases), group_by))
    results.append(TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
def _calculate_scores(self, docs):
    docs_scores = []
    for doc in docs:
        scores = {}
        scores['chars'] = ts.char_count(doc)
        scores['words'] = ts.lexicon_count(doc)
        scores['sents'] = ts.sentence_count(doc)
        #scores['syllables'] = ts.syllable_count(doc)
        scores['avg_sent_length'] = ts.avg_sentence_length(doc)
        scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc)
        scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc)
        scores['flesch'] = ts.flesch_reading_ease(doc)
        #scores['smog'] = ts.smog_index(doc)
        #scores['coleman_liau'] = ts.coleman_liau_index(doc)
        scores['automated_readability'] = ts.automated_readability_index(doc)
        #scores['linsear'] = ts.linsear_write_formula(doc)
        #scores['difficult_words'] = ts.difficult_words(doc)
        scores['dale_chall'] = ts.dale_chall_readability_score(doc)
        #scores['gunning_fog'] = ts.gunning_fog(doc)
        scores['lix'] = ts.lix(doc)
        docs_scores.append(scores)
    return docs_scores
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            #"automated readability index": textstat.automated_readability_index(text),
            #"dale chall readability score": textstat.dale_chall_readability_score(text),
            #"difficult words": textstat.difficult_words(text),
            #"linsear write formula": textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {
            "polarity": blob.sentiment.polarity,
            "subjectivity": blob.sentiment.subjectivity,
        },
    }
    return main
def do_text_stats(self, text):
    ### Syllable Count
    syllable_count = textstat.syllable_count(text)

    ### Lexicon Count
    lexicon_count = textstat.lexicon_count(text, True)

    ### Sentence Count
    sentence_count = textstat.sentence_count(text)

    ### The Flesch Reading Ease formula
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(text)
    except TypeError:
        flesch_reading_ease = None
    #* 90-100 : Very Easy
    #* 80-89  : Easy
    #* 70-79  : Fairly Easy
    #* 60-69  : Standard
    #* 50-59  : Fairly Difficult
    #* 30-49  : Difficult
    #* 0-29   : Very Confusing

    ### The Flesch-Kincaid Grade Level
    try:
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    except TypeError:
        flesch_kincaid_grade = None

    ### The Fog Scale (Gunning FOG Formula)
    gunning_fog = textstat.gunning_fog(text)

    ### The SMOG Index
    smog_index = textstat.smog_index(text)

    ### Automated Readability Index
    automated_readability_index = textstat.automated_readability_index(text)

    ### The Coleman-Liau Index
    try:
        coleman_liau_index = textstat.coleman_liau_index(text)
    except TypeError:
        coleman_liau_index = None

    ### Linsear Write Formula
    linsear_write_formula = textstat.linsear_write_formula(text)

    ### Dale-Chall Readability Score
    dale_chall_readability_score = textstat.dale_chall_readability_score(text)

    ### Readability Consensus based upon all the above tests
    try:
        text_standard = textstat.text_standard(text)
    except TypeError:
        text_standard = None

    return {
        "syllable_count": syllable_count,
        "lexicon_count": lexicon_count,
        "sentence_count": sentence_count,
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "gunning_fog": gunning_fog,
        "smog_index": smog_index,
        "automated_readability_index": automated_readability_index,
        "coleman_liau_index": coleman_liau_index,
        "linsear_write_formula": linsear_write_formula,
        "dale_chall_readability_score": dale_chall_readability_score,
        "text_standard": text_standard
    }
def feature_apply(feature_extractor, feature_vector, attribute, number_of_file):
    """
    Extract features from each document
    :param feature_extractor: function that extracts features
    :param feature_vector: contains a list of features
    :param attribute: indicates whether the process extracts gender or age features
    :param number_of_file: number of documents to be processed
    :return: vector that contains the extracted features
    """
    corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/en'
    #corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/meTets'
    newcorpus = XMLCorpusReader(corpus_root, '.*')
    i = 0
    feature_set = []
    doc_list = newcorpus.fileids()
    print(len(doc_list))
    for doc in doc_list[:number_of_file]:
        i += 1
        if i % 50 == 0:
            print(i)
        doc = newcorpus.xml(doc)
        number_of_conversation = int(doc[0].attrib["count"])
        #print(doc[0].attrib["count"])
        txt = " ".join([doc[0][j].text for j in range(number_of_conversation)
                        if doc[0][j].text is not None])
        #print(txt)
        if textstat.sentence_count(txt) != 0:
            feature_set.append((feature_extractor(txt, feature_vector),
                                doc.attrib[attribute]))
    return feature_set
def main():
    csv_file2 = open(sys.argv[2], 'w', encoding="utf8")
    writer = csv.writer(csv_file2, delimiter=',')
    doc_id = 1
    writer.writerow(["ID", "URL", "text", "impact-score", "readability",
                     "grade-level", "smog-index", "total-words", "total-sentences"])
    with open(sys.argv[1], 'r', encoding="utf8", errors='ignore') as csv_file1:
        reader = csv.reader(csv_file1)
        # Skip the first line with headers
        next(reader)
        for row in reader:
            impact = str(row[0])
            url = str(row[1])
            text = str(row[2])
            read_ease = textstat.flesch_reading_ease(text)
            grade = textstat.flesch_kincaid_grade(text)
            smog = textstat.smog_index(text)
            words = textstat.lexicon_count(text)
            sentences = textstat.sentence_count(text)
            # Uncomment this if we want summary and key words
            # summary = summarize(text, ratio=0.3)
            # key_words = keywords(text, ratio=0.3)
            writer.writerow([doc_id] + [url] + [text] + [impact] + [read_ease]
                            + [grade] + [smog] + [words] + [sentences])
            doc_id = doc_id + 1
    csv_file2.close()
    print('Summary statistics complete!')
def analyse_plain_text(test_data):
    text_stats = TextStats()

    # Do some simple analysis.
    from textblob import TextBlob
    zen = TextBlob(test_data)
    text_stats.word_count = len(zen.words)
    text_stats.sentence_count = len(zen.sentences)
    text_stats.polarity = zen.sentiment.polarity
    text_stats.subjectivity = zen.sentiment.subjectivity

    # Easy to read, this?
    from textstat.textstat import textstat
    text_stats.flesch_reading_ease = textstat.flesch_reading_ease(test_data)

    # Words per sentence count.
    text_stats.word_per_sentence_count = (
        textstat.lexicon_count(test_data, False) /
        textstat.sentence_count(test_data))

    # Convert all to lower.
    test_data = test_data.lower()

    # Tokenise.
    from nltk.tokenize import word_tokenize
    words = word_tokenize(test_data)

    # Tokenise stemmed text.
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    test_data_stemmed = ''
    for w in words:
        test_data_stemmed = test_data_stemmed + ' ' + ps.stem(w)
    stemmed_words = word_tokenize(test_data_stemmed)

    # Remove non-words.
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in stemmed_words if nonPunct.match(w)]

    # Remove stopwords.
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    extra_stopwords = set([
        'that', '\'s', 'wa', 'thi', 'like', 'n\'t', 'would', 'ha', 'us', 'get'
    ])
    filtered = [w for w in filtered
                if w not in stopwords and w not in extra_stopwords]

    # How many unique words?
    from collections import Counter
    counts = Counter(filtered)
    text_stats.unique_word_count = len(counts)

    # Words sorted by most common.
    text_stats.counts = counts
    return text_stats
def flesch_kincaid_score(text):
    sylCount = textstat.syllable_count(text)
    wordCount = len(text.split())
    sentenceCount = textstat.sentence_count(text)
    print("Syl count - %s, word count - %s, sentenceCount - %s"
          % (sylCount, wordCount, sentenceCount))
    # Standard Flesch-Kincaid grade formula; float division matters here
    # (under Python 2 these divisions would silently floor the ratios).
    return (0.39 * (wordCount / sentenceCount)
            + 11.8 * (sylCount / wordCount) - 15.59)
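# Quick check (assumption: textstat installed): the hand-rolled formula above
# implements the standard Flesch-Kincaid grade,
# 0.39*(words/sentences) + 11.8*(syllables/words) - 15.59, so it should track
# textstat.flesch_kincaid_grade() up to tokenization differences.
from textstat.textstat import textstat

text = "This is a short sentence. Here is another one for the test."
print(flesch_kincaid_score(text))
print(textstat.flesch_kincaid_grade(text))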
def composition(text, file):
    char_count = textstat.char_count(text)
    syll_count = textstat.syllable_count(text)
    lex_count = textstat.lexicon_count(text)
    sent_count = textstat.sentence_count(text)
    file.write('\nChar count : %d\nSyllable count : %d\nLexicon count : %d\nSentence count : %d'
               % (char_count, syll_count, lex_count, sent_count))
def get_stats(sentence):
    syllables = textstat.syllable_count(sentence)
    words = textstat.lexicon_count(sentence, True)
    sentence_count = textstat.sentence_count(sentence)
    if sentence_count > 0:
        text_standard = textstat.text_standard(sentence)
    else:
        text_standard = EMPTY_TEXT_STANDARD
    text_standard = fix_grammar_errors(text_standard)
    return combine(syllables, words, sentence_count, text_standard)
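# The helpers referenced above (EMPTY_TEXT_STANDARD, fix_grammar_errors,
# combine) are not shown in this snippet. A minimal sketch of what they might
# look like; the names come from the snippet but the bodies are assumptions,
# not the original implementations:
import re

EMPTY_TEXT_STANDARD = "0th grade"

def fix_grammar_errors(text_standard):
    # normalize e.g. "1th" -> "1st", "2th" -> "2nd" in textstat's grade strings
    fixes = {r'\b1th\b': '1st', r'\b2th\b': '2nd', r'\b3th\b': '3rd'}
    for pat, rep in fixes.items():
        text_standard = re.sub(pat, rep, text_standard)
    return text_standard

def combine(syllables, words, sentence_count, text_standard):
    return {"syllables": syllables, "words": words,
            "sentences": sentence_count, "text_standard": text_standard}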
def displayResults(path):
    print("stats")
    text = loadText(path)
    raw_tokens = raw_tokenize(text)
    print("number of words %s" % count_words(text))
    print("number of sentences %s" % textstat.sentence_count(text))
    print("unique words: %s" % len(set(raw_tokenize(text))))
    print("Difficulty %s / 100" % (100 - textstat.flesch_reading_ease(text)))
    print("Average sentiment %s (negative: 0, neutral: 5, positive: 10)"
          % calculateSentiment(raw_tokens))
    print()
    print("topic distribution")
    displayTopicsDistributionWithinTheText(path, 300, pie=False)
    print("difficulty over the text")
    complexityAlongtheText(path, 300)
def __load_text(self):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r',
                     encoding='utf8', errors='ignore') as f:
        data = f.read()
    self.flesch_reading_ease = textstat.flesch_reading_ease(data)
    self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
    sentences = tokenizer.tokenize(data)
    self.n_sentences = textstat.sentence_count(data)
    self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences
    self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ')
                                    if w not in stopwords.words('english')])
    print('Parsed', len(sentences), 'sentences, average sentence length',
          self.avg_sentence_length, ', average word length', self.avg_word_length)
    self.sentences = sentences
    self.tokens = []
    for sentence in sentences:
        self.tokens.extend(text_tokenize(sentence))
def averageSentenceNumber(dataset, data):
    count = 0
    finalfeatureset = []
    for author in data:
        sentencecount = 0
        n = author['numemails']
        for item in author['mailset']:
            sentencecount += textstat.sentence_count(item['text'])
        featureset = dataset[count]['featureSet']
        featureset.append({'averageSentenceCount': float(sentencecount) / float(n)})
        count += 1
        finalfeatureset.append({'author': author['author'], 'featureSet': featureset})
    return finalfeatureset
def analyze_one(self, email):
    """
    Analyzes a single email and stores results.
    """
    sents = tstat.sentence_count(email)
    self.sent_count.append(sents if sents > 0 else 1)
    if email and len(email) > 0:
        self.flesch_kincaid_grade.append(tstat.flesch_kincaid_grade(email))
        self.automated_readability_index.append(tstat.automated_readability_index(email))
        self.coleman_liau_index.append(tstat.coleman_liau_index(email))
        self.linsear_write_formula.append(tstat.linsear_write_formula(email))
        self.dale_chall_readability_score.append(tstat.dale_chall_readability_score(email))
def gettingFeatures(text):
    text = text.lower()
    # words / syllables / sentences count
    wordCount = len(text.split())
    syllables = textstat.syllable_count(text)
    sentences = textstat.sentence_count(text)
    try:
        # ReadabilityScore (Flesch Reading Ease)
        readabilityScore = 206.835 - 1.015 * (wordCount / sentences) - 84.6 * (syllables / wordCount)
        # ReadabilityGrade (Flesch-Kincaid)
        ReadabilityGrade = 0.39 * (wordCount / sentences) + 11.8 * (syllables / wordCount) - 15.59
    except ZeroDivisionError:
        readabilityScore = 0
        ReadabilityGrade = 0
    print(readabilityScore, ReadabilityGrade)

    # Direction count
    # direction = {"here", "there", "over there", "beyond", "nearly", "opposite",
    #              "under", "above", "to the left", "to the right", "in the distance"}
    DiractionCount = (text.count("here") + text.count("there") + text.count("over there")
                      + text.count("beyond") + text.count("nearly") + text.count("opposite")
                      + text.count("under") + text.count("to the left")
                      + text.count("to the right") + text.count("in the distance"))

    # Exemplify count
    # exemplify = {"chiefly", "especially", "for instance", "in particular", "markedly",
    #              "namely", "particularly", "including", "specifically", "such as"}
    Exemplify = (text.count("chiefly") + text.count("especially") + text.count("for instance")
                 + text.count("in particular") + text.count("markedly") + text.count("namely")
                 + text.count("particularly") + text.count("including")
                 + text.count("specifically") + text.count("such as"))

    try:
        # words per sentence (average)
        parts = [len(l.split()) for l in re.split(r'[?!.]', text) if l.strip()]
        WPS = sum(parts) / len(parts)
    except ZeroDivisionError:
        WPS = 0

    #print(wordCount, readabilityScore, ReadabilityGrade, DiractionCount, WPS, Exemplify)
    return numpy.array([wordCount, readabilityScore, ReadabilityGrade,
                        DiractionCount, WPS, Exemplify])
def main():
    for arg in sys.argv[1:]:
        with open(arg) as f:
            text = f.read()
        with open(arg + '.readability.snip', 'w') as f:
            f.write("syllable_count : %s\n" % textstat.syllable_count(text))
            f.write("lexicon_count : %s\n" % textstat.lexicon_count(text))
            f.write("sentence_count : %s\n" % textstat.sentence_count(text))
            f.write("difficult_words : %s\n" % textstat.difficult_words(text))
            f.write("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
            f.write("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
            f.write("smog_index : %s\n" % textstat.smog_index(text))
            f.write("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
            f.write("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
            f.write("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
            f.write("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
def process_comment(comment):
    login = comment["user"]["login"]
    body = comment["body"]
    yield Feature("comment_count", login, 1)
    if "RFR" in body and "not RFR" not in body:
        yield Feature('RFR', login, 1)
    if "RFM" in body and "not RFM" not in body:
        yield Feature("RFM", login, 1)
    if "LGTM" in body:
        yield Feature("LGTM", login, 1)
    if "PTAL" in body:
        yield Feature("PTAL", login, 1)
    if r"```" in body:
        yield Feature("code_block", login, 1)
    if r"@" in body:
        yield Feature("mention", login, 1)
    if "![" in body:
        yield Feature("image", login, 1)
    if " [ ]" in body or " [x]" in body:
        yield Feature("checklist", login, 1)
    for field in [":thumbsup:", "+1", ":ship:", ":shipit:", ":rocket:"]:
        if field in body:
            yield Feature(field, login, 1)
    txt = _clean_body(body)
    if not txt:
        return
    # yield Feature("avg_sentences_per_comment", login, textstat.sentence_count(txt))
    yield Feature("sentences", login, textstat.sentence_count(txt))
    if 'https://' in txt or 'http://' in txt:
        yield Feature('with_link', login, 1)
    issues = re.findall("#[0-9]{4,5}", txt)
    if issues:
        yield Feature("issue_crosslink", login, len(issues))
    # raw string so \b is a word boundary rather than a backspace character
    issues = re.findall(r"\b(CS|BITLY|DATA|DEVOPS)-[0-9]{3,4}\b", txt)
    if issues:
        yield Feature("jira_crosslink", login, len(issues))
    if comment.get('issue_url'):
        issue_number = comment['issue_url'].split('/')[-1]
        if cached_issue_assignee(issue_number) == login:
            yield Feature("self_comment", login, 1)
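# Hedged usage sketch: process_comment() is a generator over GitHub-style
# comment payloads. The dict below mirrors only the fields the function reads
# ("user.login", "body", and optionally "issue_url"); the values are made up.
comment = {
    "user": {"login": "octocat"},
    "body": "LGTM :shipit:\n\nSee #12345 for context.",
}
for feature in process_comment(comment):
    print(feature)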
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
def analyse_json(json_text):
    # consider moving this to be a feature of Transcript in the other module
    df_witnesses = pd.DataFrame(columns=['html_file_location', 'witness_name',
                                         'syllable_count', 'lexicon_count',
                                         'sentence_count', 'syllables_per_word',
                                         'gunning_fog', 'smog_index',
                                         'text_standard'],
                                index=[])
    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']

        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type'] == 'witness':
                witness = witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            if 'all_text' in witnesses[p]:
                witness_text = '\n\n'.join(witnesses[p]['all_text'])
                if len(witness_text) > 0:
                    stats_data = {
                        'html_file_location': trscrpt['html_file_location'],
                        'witness_name': p,
                        'syllable_count': textstat.syllable_count(witness_text),
                        'lexicon_count': textstat.lexicon_count(witness_text),
                        'sentence_count': textstat.sentence_count(witness_text),
                        'syllables_per_word': textstat.avg_syllables_per_word(witness_text),
                        'gunning_fog': textstat.gunning_fog(witness_text),
                        'smog_index': textstat.smog_index(witness_text),
                        'text_standard': textstat.text_standard(witness_text)
                    }
                    df_witnesses.loc['witness_%i' % i] = stats_data
                else:
                    df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                    df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
    return df_witnesses
def getReadingLevel(subreddit):
    query = '''SELECT body FROM
    (SELECT body, RAND() AS r1
    FROM [fh-bigquery:reddit_comments.''' + str(year) + ''']
    WHERE subreddit == "''' + subreddit + '''"
    AND body != "[deleted]" AND body != "[removed]" AND score > 1
    ORDER BY r1 LIMIT 1000)'''
    bigquery_service = build('bigquery', 'v2', credentials=credentials)
    try:
        query_request = bigquery_service.jobs()
        query_data = {'query': query, 'timeoutMs': 20000}
        query_response = query_request.query(projectId=bigquery_pid,
                                             body=query_data).execute()
    except HttpError as err:
        print('Error: {}'.format(err.content))
        raise err
    rows = query_response['rows']
    levels_sum = 0.0
    levels_count = 0
    for i in range(len(rows)):
        text = rows[i]['f'][0]['v']
        # strip URLs
        text = re.sub(r'([A-Za-z]+:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)|([A-Za-z]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)', '', text)
        text = re.sub(r'\s\s+', ' ', text)
        if textstat.sentence_count(text) > 0:
            levels_sum += textstat.flesch_reading_ease(text)
            levels_count += 1
    average_level = 0.0
    if levels_count > 0:
        average_level = levels_sum / levels_count
    results[subreddits.index(subreddit)] = [subreddit, 100.0 - average_level]
def compute_syllables(text):
    num_sentence = textstat.sentence_count(text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    word_list = text.split()
    num_simple = 0
    num_complex = 0
    num_syllables = 0
    for i in word_list:
        try:
            syllables = nsyl(i)
            if syllables >= 3:
                num_complex += 1
            else:
                num_simple += 1
            num_syllables += syllables
        except KeyError:
            # nsyl() raises for words missing from its dictionary; skip them
            continue
    return [num_simple, num_complex, num_syllables, num_sentence]
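# nsyl() is not defined in this snippet. A common implementation (an
# assumption here, not the original) counts syllables from the CMU Pronouncing
# Dictionary via NLTK, which is consistent with the KeyError handling above;
# requires nltk.download('cmudict') once.
from nltk.corpus import cmudict

_pron = cmudict.dict()

def nsyl(word):
    # vowel phonemes carry a stress digit ('AH0', 'EH1', ...); count those
    return len([ph for ph in _pron[word.lower()][0] if ph[-1].isdigit()])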
def analyseText():
    values = request.get_json()
    required = ['inputText']
    if not all(k in values for k in required):
        return 'Missing values', 400

    text = values['inputText']
    result = {
        'syllable_count': textstat.syllable_count(text),
        'lexicon_count': textstat.lexicon_count(text),
        'sentence_count': textstat.sentence_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text)
    }
    return jsonify(result), 200
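# Hedged client-side sketch for the Flask view above. The route path is an
# assumption (the @app.route decorator is not shown in the snippet); adjust it
# to match the actual app registration.
import requests

resp = requests.post("http://localhost:5000/analyse",
                     json={"inputText": "One sentence here. And another one."})
print(resp.json()["sentence_count"])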
def test_set(corpus_dir, feature_extrator, vect_path, i):
    """
    Read and process the test set and extract features for each document
    :param corpus_dir: path of the test set
    :param feature_extrator: function that extracts features
    :param vect_path: path used to build the feature vector
    :param i: index of the class in the true_pred dictionary values;
              0 refers to gender, otherwise age
    :return: vector that contains the extracted features
    """
    vect = create_feature_vect(vect_path)
    newcorpus = XMLCorpusReader(corpus_dir, '.*')
    doc_list = newcorpus.fileids()
    test_feature_set = []
    true_pred = extract_true_pred(corpus_dir[:-2] + "truth-en.txt")
    for doc in doc_list:
        xml_name = doc
        doc = newcorpus.xml(doc)
        print(doc[0].attrib["count"])
        txt = fetch_text(doc)
        if (textstat.sentence_count(txt) != 0) and (txt != ""):
            test_feature_set.append((feature_extrator(txt, vect),
                                     true_pred[xml_name][i]))
    return test_feature_set
def stats(self, text):
    test_data = text
    stats = {}
    stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
    stats['smog'] = textstat.smog_index(test_data)
    stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
    stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
    stats['automated'] = textstat.automated_readability_index(test_data)
    stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
    stats['difficult'] = textstat.difficult_words(test_data)
    stats['linsear'] = textstat.linsear_write_formula(test_data)
    stats['gunning_fog'] = textstat.gunning_fog(test_data)
    stats['standard'] = textstat.text_standard(test_data)
    stats['charcount'] = textstat.char_count(test_data)
    stats['lexicon count'] = textstat.lexicon_count(test_data)
    stats['syllable count'] = textstat.syllable_count(test_data)
    stats['sentence count'] = textstat.sentence_count(test_data)
    stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
    stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(test_data)
    stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
    stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(test_data)
    return stats
ar_index_grades = []
ar_index_total_grade = 0

# Coleman-Liau index: goo.gl/8sE0m1
cl_index_grades = []
cl_index_total_grade = 0

# Linsear Write Formula: goo.gl/GuOZ8B
lwf_grades = []
lwf_total_grade = 0

# Dale-Chall Readability Score: goo.gl/dvmXmx
dcr_grades = []
dcr_total_grade = 0

num_tweets = 0
for tweet in cleanest_tweets:
    # skip tweets which are not just context-based text
    if textstat.sentence_count(tweet) < 1:
        continue

    flesch_kincaid_grade = textstat.flesch_kincaid_grade(tweet)
    flesch_kincaid_grades.append(flesch_kincaid_grade)
    flesch_kincaid_total_grade += flesch_kincaid_grade

    gunning_fog_grade = textstat.gunning_fog(tweet)
    gunning_fog_grades.append(gunning_fog_grade)
    gunning_fog_total_grade += gunning_fog_grade

    smog_index_grade = textstat.smog_index(tweet)
    smog_index_grades.append(smog_index_grade)
    smog_index_total_grade += smog_index_grade

    ar_index_grade = textstat.automated_readability_index(tweet)
    ar_index_grades.append(ar_index_grade)
def updateData(self):
    # Full list of polarity scores
    self.polscore = self.sid.polarity_scores(self.text)

    ##### INDEX 0 IN DATA: Text Sentiment #####
    # [INDEX 0] Compound score (0.0 - 1.0)   [INDEX 1] Negative connotation rating (0.0 - 1.0)
    # [INDEX 2] Positive connotation rating (0.0 - 1.0)   [INDEX 3] Neutral connotation rating (0.0 - 1.0)
    self.data.append([self.polscore['compound'], self.polscore['neg'],
                      self.polscore['pos'], self.polscore['neu']])

    ##### INDEX 1 IN DATA: Sentence Info #####
    # [INDEX 0] Sentence count        [INDEX 1] Average sentence length
    # [INDEX 2] Syllable count        [INDEX 3] Overall word count
    # [INDEX 4] Character count       [INDEX 5] Character count without spaces
    # [INDEX 6] Avg letters per word  [INDEX 7] Avg syllables per word
    self.data.append([textstat.sentence_count(self.text),
                      textstat.avg_sentence_length(self.text),
                      textstat.syllable_count(self.text),
                      len(self.splList),
                      textstat.char_count(self.text, False),
                      textstat.char_count(self.text, True),
                      textstat.avg_letter_per_word(self.text),
                      textstat.avg_syllables_per_word(self.text)])

    # Each readability block below stores:
    # [INDEX 0] Pure score  [INDEX 1] Approximate grade  [INDEX 2] Normalized (ratio) score

    ##### INDEX 2 IN DATA: Flesch Reading Ease #####
    # SCORE SCALE: 0 - 100
    self.freRaw = textstat.flesch_reading_ease(self.text)
    self.freStat = min(max(self.freRaw, 0), 100)
    self.data.append([round(self.freStat, 3), self.freGrade(self.freStat),
                      round(abs(self.freStat - 100), 2)])

    ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
    # SCORE SCALE: 0 - 18
    self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
    self.fkgStat = self.adjustScore(self.fkgRaw)
    self.data.append([round(self.fkgStat, 3), self.grade(self.fkgStat),
                      round(self.fkgStat / 0.18, 2)])

    ##### INDEX 4 IN DATA: Gunning FOG Index #####
    # SCORE SCALE: 0 - 18
    self.fogRaw = textstat.gunning_fog(self.text)
    self.fogStat = self.adjustScore(self.fogRaw)
    self.data.append([round(self.fogStat, 3), self.grade(self.fogStat),
                      round(self.fogStat / 0.18, 2)])

    ##### INDEX 5 IN DATA: SMOG Index #####
    # SCORE SCALE: 0 - 18
    self.smogRaw = textstat.smog_index(self.text)
    self.smogStat = self.adjustScore(self.smogRaw)
    self.data.append([round(self.smogStat, 3), self.grade(self.smogStat),
                      round(self.smogStat / 0.18, 2)])

    ##### INDEX 6 IN DATA: Automated Readability Index #####
    # SCORE SCALE: 0 - 14
    self.ariRaw = textstat.automated_readability_index(self.text)
    self.ariStat = min(max(self.ariRaw, 0), 14)
    self.data.append([round(self.ariStat, 3), self.ariGrade(ceil(self.ariStat)),
                      round(self.ariStat / 0.14, 2)])

    ##### INDEX 7 IN DATA: Coleman-Liau Index #####
    # SCORE SCALE: 0 - 18
    self.cliRaw = textstat.coleman_liau_index(self.text)
    self.cliStat = self.adjustScore(self.cliRaw)
    self.data.append([round(self.cliStat, 3), self.grade(self.cliStat),
                      round(self.cliStat / 0.18, 2)])

    ##### INDEX 8 IN DATA: Linsear Write Index #####
    # SCORE SCALE: 0 - 18
    self.lwiRaw = textstat.linsear_write_formula(self.text)
    self.lwiStat = self.adjustScore(self.lwiRaw)
    self.data.append([round(self.lwiStat, 3), self.grade(self.lwiStat),
                      round(self.lwiStat / 0.18, 2)])

    ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
    # SCORE SCALE: 0 - 10
    self.dcrRaw = textstat.dale_chall_readability_score(self.text)
    self.dcrStat = min(max(self.dcrRaw, 0), 10)
    self.data.append([round(self.dcrStat, 3), self.daleChallGrade(self.dcrStat),
                      round(self.dcrStat / 0.1, 2)])

    ##### INDEX 10 IN DATA: Overall Score #####
    # SCORE SCALE: 0 - 20
    self.txtRaw = textstat.text_standard(self.text, True)
    self.txtStd = min(max(self.txtRaw, 0), 20)
    self.txtInfo = textstat.text_standard(self.text)
    self.data.append([round(self.txtStd, 3),
                      self.txtGrade(self.txtStd, self.txtInfo),
                      round(self.txtStd / 0.2, 2)])

    return self.data
# fragment: the inner try/except sits inside an outer try (and loop) whose
# opening is not shown in this snippet
    try:
        cur = {
            "title": title,
            "artist": artist,
            "year": year,
            "pos": pos,
            "lyrics": lyrics,
            "tags": get_tags(artist),
            "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
            "flesch_index": ts.flesch_reading_ease(lyrics_repl),
            "fog_index": ts.gunning_fog(lyrics_repl),
            "difficult_words": ts.difficult_words(lyrics_repl),
            "num_syllables": ts.syllable_count(lyrics_repl),
            "num_words": ts.lexicon_count(lyrics_repl, True),
            "num_lines": ts.sentence_count(lyrics_repl),
            "num_dupes": count_dupes(lyrics)
        }
        # print(cur)
        dataset.append(cur)
    except Exception as e:
        print(e)
except Exception as e:
    print("Exception occurred for " + artist + ' - ' + title)
    print(e)

outfile = "years/" + str(year) + '.txt'
dir = os.path.dirname(outfile)
if not os.path.exists(dir):
    os.makedirs(dir)
# main script
if __name__ == '__main__':
    print("TextStat Comparison Script")
    print("--------------------------")

    # read in text from the command line
    # This needs to be fixed to deal with/escape special characters
    textToCheck = input("Please enter the text you would like to analyse: ")
    # read in text from a file - but what format?

    print("\n\n")
    print("Results")
    print("==============================================")
    print("==============================================\n")
    print("Syllable Count: " + str(textstat.syllable_count(textToCheck)))
    # True is the default and removes punctuation before counting
    print("Lexicon Count: " + str(textstat.lexicon_count(textToCheck)))
    print("Sentence Count: " + str(textstat.sentence_count(textToCheck)))
    print("Flesch Reading Ease formula: " + str(textstat.flesch_reading_ease(textToCheck)))
    print("Flesch-Kincaid Grade Level: " + str(textstat.flesch_kincaid_grade(textToCheck)))
    print("Fog Scale (Gunning FOG Formula): " + str(textstat.gunning_fog(textToCheck)))
    print("SMOG Index: " + str(textstat.smog_index(textToCheck)))
    print("Automated Readability Index: " + str(textstat.automated_readability_index(textToCheck)))
    print("Coleman-Liau Index: " + str(textstat.coleman_liau_index(textToCheck)))
    print("Linsear Write Formula: " + str(textstat.linsear_write_formula(textToCheck)))
    print("Dale-Chall Readability Score: " + str(textstat.dale_chall_readability_score(textToCheck)))
    print("--------------------------------------------------------------")
    print("Readability Consensus based upon all the above tests: " + str(textstat.text_standard(textToCheck)))
    print("\n\n")
def get_textstats(text):
    return (textstat.sentence_count(text),
            textstat.automated_readability_index(text),
            textstat.flesch_reading_ease(text))
print("-------------------------Text Statistic-----------------------------------")
print("Returns the number of syllables present in the given text.")
# print(textstat.syllable_count(test_data, lang='en_US'))
num_syllables = textstat.syllable_count(test_data, lang='en_US')
print(num_syllables)

print("Calculates the number of words present in the text - punctuation removed")
# print(textstat.lexicon_count(test_data, removepunct=True))
num_words = textstat.lexicon_count(test_data, removepunct=True)
print(num_words)

print("Returns the number of sentences present in the given text.")
# print(textstat.sentence_count(test_data))
num_sentences = textstat.sentence_count(test_data)
print(num_sentences)

print("difficult words")
# print(textstat.difficult_words(test_data))
num_difficult_words = textstat.difficult_words(test_data)
print(num_difficult_words)

print("-------------------------Difficulty------------------------------")
print("The Flesch Reading Ease Score")
# print(textstat.flesch_reading_ease(test_data))
difficulty_score = textstat.flesch_reading_ease(test_data)
print(difficulty_score)
if 0 <= difficulty_score < 30:
def gettingFeatures(text):
    text = text.lower()
    # words / syllables / sentences count
    wordCount = len(text.split())
    syllables = textstat.syllable_count(text)
    sentences = textstat.sentence_count(text)
    try:
        # ReadabilityScore (Flesch Reading Ease)
        readabilityScore = 206.835 - 1.015 * (wordCount / sentences) - 84.6 * (syllables / wordCount)
        # ReadabilityGrade (Flesch-Kincaid)
        ReadabilityGrade = 0.39 * (wordCount / sentences) + 11.8 * (syllables / wordCount) - 15.59
    except ZeroDivisionError:
        readabilityScore = 0
        ReadabilityGrade = 0

    # Direction count
    # direction = {"here", "there", "over there", "beyond", "nearly", "opposite",
    #              "under", "above", "to the left", "to the right", "in the distance"}
    DiractionCount = (text.count("here") + text.count("there") + text.count("over there")
                      + text.count("beyond") + text.count("nearly") + text.count("opposite")
                      + text.count("under") + text.count("to the left")
                      + text.count("to the right") + text.count("in the distance"))

    # Exemplify count
    # exemplify = {"chiefly", "especially", "for instance", "in particular", "markedly",
    #              "namely", "particularly", "including", "specifically", "such as"}
    Exemplify = (text.count("chiefly") + text.count("especially") + text.count("for instance")
                 + text.count("in particular") + text.count("markedly") + text.count("namely")
                 + text.count("particularly") + text.count("including")
                 + text.count("specifically") + text.count("such as"))

    # Analytical thinking, authenticity, emotional tone: require LIWC analysis

    try:
        # words per sentence (average)
        parts = [len(l.split()) for l in re.split(r'[?!.]', text) if l.strip()]
        WPS = sum(parts) / len(parts)
    except ZeroDivisionError:
        WPS = 0

    # Six-letter words
    Sixltr = 0
    words = text.split()
    letter_count_per_word = {w: len(w) for w in words}
    for x in letter_count_per_word.values():
        if x >= 6:
            Sixltr = Sixltr + 1

    # Function words (placeholder)
    function = 0

    # Pronouns
    text_tokens = word_tokenize(text)
    result = nltk.pos_tag(text_tokens)
    pronoun = len([(x, y) for x, y in result if y == "PRP" or y == "PRP$"])

    # Personal pronouns
    ppron = len([(x, y) for x, y in result if y == "PRP"])

    # I
    i = text.count("i")

    # You
    you = text.count("you")

    # Impersonal pronouns "one" / "it"
    ipron = text.count("one") + text.count("it")

    # Prepositions
    prep = len([(x, y) for x, y in result if y == "IN"])

    # Auxiliary verbs do/be/have
    auxverb = (text.count("do") + text.count("does") + text.count("don't")
               + text.count("doesn't") + text.count("has") + text.count("have")
               + text.count("hasn't") + text.count("haven't") + text.count("am")
               + text.count("are") + text.count("is") + text.count("'m")
               + text.count("'re") + text.count("'s"))

    # Negations
    negate = text.count("not")

    # Interrogatives, cognitive processes, cause relationships, discrepancies,
    # tentative language, differentiation, perceptual processes: LIWC analysis

    # Count numbers (cardinal-number tags)
    number = len([(x, y) for x, y in result if y == "CD"])

    # Verbs, past focus: VBD, VBN
    focuspast = len([(x, y) for x, y in result if y == "VBN" or y == "VBD"])

    # Verbs, present focus: VB, VBP, VBZ, VBG
    focuspresent = len([(x, y) for x, y in result
                        if y == "VB" or y == "VBP" or y == "VBZ" or y == "VBG"])

    # Netspeak, assent, non-fluencies: LIWC analysis

    # Count all punctuation
    AllPunc = 0
    punctuation = "!#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~"
    cd = {c: val for c, val in ct.Counter(text).items() if c in punctuation}
    for x in cd.values():
        AllPunc = AllPunc + x

    # number of commas
    Comma = text.count(",")

    # number of question marks
    QMark = text.count("?")

    #return numpy.array([wordCount, readabilityScore, ReadabilityGrade, DiractionCount, Analytic, Authentic, Tone, WPS, Sixltr, function, pronoun, ppron, i, you, ipron, prep, auxverb, negate, interrog, number, cogproc, cause, discrep, tentat, differ, percept, focuspast, focuspresent, netspeak, assent, nonflu, AllPunc, Comma, QMark, Exemplify])
    return [wordCount, readabilityScore, ReadabilityGrade, DiractionCount, WPS,
            Sixltr, pronoun, ppron, i, you, ipron, prep, auxverb, negate, number,
            focuspast, focuspresent, AllPunc, Comma, QMark, Exemplify]
rt = response
raw_html = response.read()
g = goose.Goose()
a = g.extract(raw_html=raw_html)
htext = a.cleaned_text
opinion = TextBlob(htext)
pol = opinion.sentiment.polarity
sub = opinion.sentiment.subjectivity
rt = requests.get(qqll).elapsed.total_seconds()
kw = str(keywords(htext, lemmatize=True))
kw = kw.replace('\r', ' ').replace('\n', ' ')
keyw = ' '.join(kw.split()[:3])
sbody = htext.replace(',', '')
fkg = textstat.flesch_kincaid_grade(htext)
wc = textstat.lexicon_count(htext)
sc = textstat.sentence_count(htext)
fre = textstat.flesch_reading_ease(htext)
sinsite = ['response time', 'subjective', 'polarity', 'fgrade', 'fscore',
           'words.counts', 'sentence.count', 'keywords', 'title', 'link', 'text']
wr.writerow(sinsite)
insite = [rt, sub, pol, fkg, fre, wc, sc, keyw, a.title, qqll]
wr.writerow(insite)
rec = re.compile(r"https?://(www\.)?")
zz = rec.sub('', qqll).strip().strip('/')
with open('rowTwittersite.csv', 'w') as tsout:
    wr = csv.writer(tsout, quoting=csv.QUOTE_ALL)
    tnslist = [
from textstat.textstat import textstat
import re
import time
import csv

start_time = time.time()

def load_file(file_path):
    comments = []
    with open(file_path, 'r') as file_reader:
        reader = csv.reader(file_reader, delimiter=',', quotechar='"')
        next(reader)  # skip the header row
        for row in reader:
            # strip URLs
            text = re.sub(r'([A-Za-z]+:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)|([A-Za-z]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)', '', row[0])
            text = re.sub(r'\s\s+', ' ', text)
            comments.append(text)
    return comments

docs = ["AskReddit2008.csv", "AskReddit2009.csv", "AskReddit2010.csv",
        "AskReddit2011.csv", "AskReddit2012.csv", "AskReddit2013.csv",
        "AskReddit2014.csv"]

for doc_path in docs:
    documents = load_file(doc_path)
    levels = [textstat.flesch_reading_ease(comment) for comment in documents
              if textstat.sentence_count(comment) != 0]
    print("reading level for " + doc_path)
    print(sum(levels) / len(levels))

elapsed_time = time.time() - start_time
print("elapsed time in seconds: " + str(elapsed_time))
if __name__ == '__main__':
    # prompt user for file and open it
    user_input = input("Enter file name to open: ")
    input_string = open(user_input).read()
    user_input = input("Enter file name to write to: ")

    # declare/initialize lists
    copy_string = input_string.split()
    words_with_synonyms = []
    the_synonyms = []

    # get number of syllables, words, sentences, and FK score for the file
    num_syllables = textstat.syllable_count(input_string)
    num_words = textstat.lexicon_count(input_string)
    num_sentences = textstat.sentence_count(input_string)
    # Flesch Reading Ease formula (the published constant is 1.015)
    fk_score = 206.835 - float(1.015 * (num_words / num_sentences)) - float(
        84.6 * (num_syllables / num_words))

    # print number of syllables, words, and sentences in the file
    print("\nNumber of syllables: ", num_syllables)
    print("Number of words: ", num_words)
    print("Number of sentences: ", num_sentences)

    output = synonym_replacement(input_string, copy_string)
    #output = remove_adjective(output, copy_string)
    initial_grade = check_reading_level(input_string)
    new_grade = check_reading_level(output)
    new_num_syllables = textstat.syllable_count(output)
    new_num_words = textstat.lexicon_count(output)
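# Sanity check (assumption: textstat available): the constant 1.015 above
# matches the published Flesch Reading Ease formula,
# 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words), so the manual
# score should sit close to textstat.flesch_reading_ease() up to counting
# differences.
from textstat.textstat import textstat

sample = "Simple words help. Short sentences help too."
words = textstat.lexicon_count(sample)
sents = textstat.sentence_count(sample)
sylls = textstat.syllable_count(sample)
manual = 206.835 - 1.015 * (words / sents) - 84.6 * (sylls / words)
print(manual, textstat.flesch_reading_ease(sample))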
#!/bin/python
import sys, string, os
from textstat.textstat import textstat

inputfile = ''
test_data = ""
script_name = sys.argv[0]
inputfile = sys.argv[1]
with open(inputfile) as myfile:
    test_data = "".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
# readability_consensus() is the old name; newer textstat releases call this text_standard()
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(var1 + ',' + var2 + ',' + var3 + ',' + var4 + ',' + var5 + ',' + var6 +
      ',' + var7 + ',' + var8 + ',' + var9 + ',' + var10 + ',' + var11 + ',' +
      var12 + ',' + var13)
# test_data = "georges a hotel in saint john state rica. save with expedia's price guarantee."
# test_data = "suwanee oceanside princess. no best costs. great booking book a hotel in ibiza. " \
#             "great booking book a hotel in ibiza."
# test_data = "refund hotel in ewr of george area area nearby. 3-star belfast hotel in"
test_data = "great rates. book at western blue casino hotel, bangkok. no reservation costs. great"

print("-------------------------Text Statistic-----------------------------------")
print("Returns the number of syllables present in the given text.")
print(textstat.syllable_count(test_data, lang='en_US'))

print("Calculates the number of words present in the text - punctuation removed")
print(textstat.lexicon_count(test_data, removepunct=True))

print("Returns the number of sentences present in the given text.")
print(textstat.sentence_count(test_data))

print("difficult words")
print(textstat.difficult_words(test_data))

print("-------------------------Readability Formula------------------------------")
print("The Flesch Reading Ease Score")
print(textstat.flesch_reading_ease(test_data))

print("The SMOG Index")
print("Texts of fewer than 30 sentences are statistically invalid, "
      "because the SMOG formula was normed on 30-sentence samples.")
print("textstat requires at least 3 sentences for a result.")
print(textstat.smog_index(test_data))

print("The Flesch-Kincaid Grade")
print(textstat.flesch_kincaid_grade(test_data))

print("The Coleman-Liau Index")
def calculate_number_of_sentences(review):
    review = str(review)
    if len(review) > 0:
        # fourth root of the sentence count (sqrt applied twice), which
        # compresses large counts into a narrow range
        return math.sqrt(math.sqrt(textstat.sentence_count(review)))
    else:
        return 0
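# A quick illustration of the double-sqrt damping used above: the fourth root
# maps widely spread sentence counts onto a small scale.
import math

for n in (1, 16, 81, 256):
    print(n, math.sqrt(math.sqrt(n)))  # 1.0, 2.0, 3.0, 4.0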
remove_digits = str.maketrans('', '', digits)
df['text'] = df['text'].apply(lambda x: x.translate(remove_digits))

# Print pro/con text of one review.
#df.loc[6,'pros']
#df.loc[6,'cons']
#df.loc[7,'text']

# Create features for grade level, reading ease, word counts, sentence count, and paragraphs.
# Source: https://pypi.python.org/pypi/textstat/
# Note: \r = paragraph break. \n = white space.
df['read_ease_grade'] = df['text'].apply(lambda x: textstat.flesch_kincaid_grade(x))
df['sentence_count'] = df['text'].apply(lambda x: textstat.sentence_count(x))
df['word_count'] = df['text'].apply(lambda x: textstat.lexicon_count(x))
df['word_count_squared'] = (df['word_count'])**2
df['paragraph'] = (df['pros'].apply(lambda x: x.count('\r'))
                   + df['cons'].apply(lambda x: x.count('\r')))
df['text_ratio'] = (df['textLengthPro'] - df['textLengthCon']) / (
    df['textLengthPro'] + df['textLengthCon'])

################################################################
#### Stop words, tokenize, stemming.
################################################################

#### Tokenize text.
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['text'].apply(lambda x: tokenizer.tokenize(x))
def passes_sentencify(axes, fig, config_value):
    """ lint rule for sentencify """
    return textstat.sentence_count(axes.get_title()) >= 1
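# Hedged usage sketch for the lint rule above: a matplotlib Axes passes when
# its title reads as at least one sentence. `fig` and `config_value` are
# unused by the rule body, so placeholders suffice; assumes the textstat
# import from the rule's own module.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.set_title("This chart shows quarterly revenue.")
print(passes_sentencify(ax, fig, None))  # True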