def transform(self, input_df: pd.DataFrame) -> coo_matrix:
    """
    It computes and returns the linguistic features from the input DF.
    The DF must include the following attributes in its columns: Q_TEXT, Q_ID
    :param input_df:
    :return:
    """
    if Q_TEXT not in input_df.columns:
        raise ValueError("Q_TEXT should be in input_df.columns")
    if Q_ID not in input_df.columns:
        raise ValueError("Q_ID should be in input_df.columns")
    correct_ans_text_dict = gen_correct_answers_dict(input_df)
    wrong_ans_text_dict = gen_wrong_answers_dict(input_df)
    df = pd.DataFrame()
    df['lexicon_count_question'] = input_df.apply(
        lambda r: textstat.lexicon_count(r[Q_TEXT]), axis=1)
    df['lexicon_count_correct_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.lexicon_count(x)
            for x in correct_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['lexicon_count_wrong_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.lexicon_count(x)
            for x in wrong_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['sentence_count_question'] = input_df.apply(
        lambda r: textstat.sentence_count(r[Q_TEXT]), axis=1)
    df['sentence_count_correct_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.sentence_count(x)
            for x in correct_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['sentence_count_wrong_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.sentence_count(x)
            for x in wrong_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['avg_word_len_question'] = input_df.apply(
        lambda r: np.mean([len(x) for x in r[Q_TEXT].split(' ')]), axis=1)
    df['ratio_len_question_correct_choices'] = df.apply(
        lambda r: (1 + r['lexicon_count_question']) /
                  (1 + r['lexicon_count_correct_choices']), axis=1)
    df['ratio_len_question_wrong_choices'] = df.apply(
        lambda r: (1 + r['lexicon_count_question']) /
                  (1 + r['lexicon_count_wrong_choices']), axis=1)
    return coo_matrix(df.values)
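# Usage sketch for transform(): Q_TEXT / Q_ID are the module's column-name
# constants and `extractor` stands for an instance of the (unnamed) class that
# defines the method; both are assumptions, not names confirmed by the source.
import pandas as pd

sample = pd.DataFrame({
    Q_ID: ["q1", "q2"],
    Q_TEXT: ["What is the capital of France?",
             "Which planet is closest to the Sun? Explain briefly."],
})
features = extractor.transform(sample)
print(features.toarray())  # 2 x 9: nine linguistic features per question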
def check_difficulty(self):
    text = self.textoutput
    # 0-30 = college
    # 50-60 = high school
    # 60+ = middle school/elementary school
    try:
        grade_level = textstat.text_standard(text)
        reading_ease = textstat.flesch_reading_ease(text)  # requires chart
        sentence_count = textstat.sentence_count(text)
        difficult_words = self.get_difficult_words(text)
        replacement_words = self.get_replacement_words(difficult_words)
        output = "Grade Level of Input Text: " + grade_level + "\n"
        # output = output + "Ease of Reading*: " + str(reading_ease) + "\n"
        output = output + "Sentence Count: " + str(sentence_count) + "\n"
        output = output + "Difficult Words Found: " + str(len(difficult_words)) + "\n"
        output = output + "Possible Replacements: " + "\n"
        for dw in replacement_words:
            output = output + dw + " -> "
            for word in replacement_words[dw]:
                output = output + word + ", "
            output = output + "\n"
        self.difficultyReport = output
    except Exception:
        self.difficultyReport = "Error determining Difficulties"
def statistics(self, text):
    self.asl = textstat.avg_sentence_length(text)
    self.avg_sentence_per_word = textstat.avg_sentence_per_word(text)
    self.avg_syllables_per_word = textstat.avg_syllables_per_word(text)
    self.difficult_words = textstat.difficult_words(text)
    self.lexicon_count = textstat.lexicon_count(text)
    self.polysyllable_count = textstat.polysyllabcount(text)
    self.sentence_count = textstat.sentence_count(text)
def avg_sentence_len() -> List:
    """Return the average sentence length (words per sentence) for each policy."""
    count = []
    for text in policies['Policy']:
        w_count = len(text.split())
        count.append(int(w_count / textstat.sentence_count(text)))
    return count
def do_datas():
    # logging.info('do_datas')
    ########### Save text statistics
    ##### 1. nw 2. nvocab 3. nsyllable 4. nsentence 5. tone 6. readability
    ## 1. nw
    nw.append(len(words))
    ## 2. nvocab
    nvocab.append(len(vocab))
    ## 3. syllable
    n = textstat.syllable_count(contents)
    nsyllable.append(n)
    ## 4. sentence
    n = textstat.sentence_count(contents)
    nsentence.append(n)
    ## 5. tone
    ### LM dictionary
    n_neg_lm.append(count_occurrence(words, lm_neg))
    n_pos_lm.append(count_occurrence(words, lm_pos))
    n_uctt_lm.append(count_occurrence(words, lm_uctt))
    n_lit_lm.append(count_occurrence(words, lm_lit))
    n_cstr_lm.append(count_occurrence(words, lm_cstr))
    n_modal1_lm.append(count_occurrence(words, lm_modal1))
    n_modal2_lm.append(count_occurrence(words, lm_modal2))
    n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation(words, lm_pos, gt_negation))
    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence(words, gi_neg))
    n_pos_gi.append(count_occurrence(words, gi_pos))
    n_negation_gi.append(count_negation(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence(words, hr_neg))
    n_pos_hr.append(count_occurrence(words, hr_pos))
    n_negation_hr.append(count_negation(words, gi_pos, gt_negation))
    ## 6. readability
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)
    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    # RIX
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)
    f = textstat.gunning_fog(contents)
    fog.append(f)
    f = textstat.automated_readability_index(contents)
    ari.append(f)
    f = textstat.smog_index(contents)
    smog.append(f)
def display_scores(name: str, text: str) -> None:
    nameLabel = Label(root, text=name + ": ", font=12, pady=10)
    nameLabel.pack()

    fleschEaseScore = textstat.flesch_reading_ease(text)
    fESAll = fleschIndex.fleschscore()
    fESAll.sort()
    print(fESAll)
    percentile = 100
    for i in range(len(fESAll)):
        if fleschEaseScore < fESAll[i]:
            percentile = int(100 * (i / len(fESAll)))
            break
    fESContent = "Flesch Reading Ease Score: {},\n which is in the {}th percentile.".format(
        fleschEaseScore, percentile)
    fESLbl = Label(root, text=fESContent, font=12, pady=10)
    fESLbl.pack()

    flKinScore = textstat.flesch_kincaid_grade(text)
    fKSAll = fleschIndex.fleschkincaid()
    fKSAll.sort(reverse=True)
    print(fKSAll)
    percentile = 100
    for i in range(len(fKSAll)):
        if flKinScore > fKSAll[i]:
            percentile = int(100 * (i / len(fKSAll)))
            break
    fKSContent = "Flesch-Kincaid Grade: {},\n which is in the {}th percentile.".format(
        flKinScore, percentile)
    fKSLbl = Label(root, text=fKSContent, font=12, pady=10)
    fKSLbl.pack()

    avgSentenceLen = int(len(text.split()) / textstat.sentence_count(text))
    aSLAll = fleschIndex.avg_sentence_len()
    aSLAll.sort(reverse=True)
    print(aSLAll)
    percentile = 100
    for i in range(len(aSLAll)):
        if avgSentenceLen > aSLAll[i]:
            percentile = int(100 * (i / len(aSLAll)))
            break
    aSLContent = "Average Sentence Length is: {},\n which is in the {}th percentile.".format(
        avgSentenceLen, percentile)
    aSLLbl = Label(root, text=aSLContent, font=12, pady=10)
    aSLLbl.pack()

    predictedScore = predict_score(text)
    predictedScoreContent = "Regression Model Predicted Score is: {}/5.".format(
        "{0:.2f}".format(predictedScore))
    pSLbl = Label(root, text=predictedScoreContent, font=15, pady=15, fg="blue")
    pSLbl.pack()
def lisibilty(text):
    f_lis = [
        textstat.syllable_count(str(text), lang='en_arabic'),
        textstat.lexicon_count(str(text), removepunct=True),
        textstat.sentence_count(str(text)),
        textstat.flesch_reading_ease(str(text)),
        textstat.flesch_kincaid_grade(str(text)),
        textstat.gunning_fog(str(text)),
        textstat.smog_index(str(text)),
        textstat.automated_readability_index(str(text)),
        textstat.coleman_liau_index(str(text)),
        textstat.linsear_write_formula(str(text)),
        textstat.dale_chall_readability_score(str(text))
    ]
    return f_lis
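# Usage sketch for lisibilty(): the returned list packs eleven statistics in
# the order defined above (three counts, then eight readability formulas).
# Assumes the function and its textstat import are in scope.
features = lisibilty("This is a short sample. It has two sentences.")
print(len(features))  # 11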
def get_desc_data(string):
    '''
    Input: book description string
    Output: returns desc_semantic, word_cnt, description_len,
            number_unique_words, average_word_len, syl_count,
            lex_count, sent_count, flesch
    '''
    # Data before text processing
    desc_semantic = get_semantic(string)
    syl_count = syllable_count(string)
    lex_count = lexicon_count(string)
    sent_count = sentence_count(string)
    flesch = flesch_reading_ease(string)
    # Data after text processing
    string = text_preprocess(string)
    word_cnt = word_count(string)
    description_len = desc_len(string)
    number_unique_words = num_unique_words(string)
    average_word_len = avg_word_len(string)
    return desc_semantic, word_cnt, description_len, number_unique_words, \
        average_word_len, syl_count, lex_count, sent_count, flesch
def process_datum(datum):
    # Remove tags
    soup = BeautifulSoup(datum["Content"], features="html.parser")
    clean_soup = BeautifulSoup(datum["Content"], features="html.parser")
    for elm in clean_soup(["code"]):
        elm.extract()
    body_text = clean_soup.get_text()
    pos_tags = pos_tag(word_tokenize(body_text))
    pos_counts = Counter([tag for word, tag in pos_tags])
    # preterm_counts =
    result = {}
    result['TEXT'] = body_text
    result['CT1'] = lexicon_count(body_text)
    result['CT2'] = sentence_count(body_text)
    for tag in POS_TAGS:
        result['CT3.' + tag] = pos_counts[tag]
    # for preterm in PRETERMINALS:
    #     results['CT4.' + preterm] =
    result['CN1'] = len(soup.find_all("code", href=True)) + \
        len(soup.find_all("img", href=True)) + \
        len(soup.findAll("span", {"class": "math-container"}))
    result['CN2'] = len(soup.find_all("a", href=True))
    result['U1.SUM'] = datum['U1.SUM']
    result['U1.1'] = datum['U1.1']
    result['U1.2'] = datum['U1.2']
    result['U2'] = datum['U2']
    result['Y1'] = datum['Y1']
    result['Y2'] = datum['Y2']
    result['T'] = datum['T']
    result['S'] = datum['S']
    result['D'] = datum['D']
    return result
def convert_txt_file(window, inputFilename, inputDir, outputDir, openOutputFiles,
                     excludeStopWords=True, lemmatizeWords=True):
    filesToOpen = []
    outputFilename = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.txt', 'corpus', 'lemma_stw')
    filesToOpen.append(outputFilename)
    inputDocs = IO_files_util.getFileList(inputFilename, inputDir, fileType='.txt')
    Ndocs = str(len(inputDocs))
    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running txt conversion (lemmatization & stopwords) at', True)
    with open(outputFilename, 'w', encoding='utf-8', errors='ignore',
              newline='') as outfile:
        # print("Number of corpus text documents: ", Ndocs)
        # currentLine.append([Ndocs])
        index = 0
        for doc in inputDocs:
            head, tail = os.path.split(doc)
            index = index + 1
            # currentLine.append([index])
            print("Processing file " + str(index) + "/" + str(Ndocs) + " " + tail)
            fullText = open(doc, "r", encoding="utf-8", errors="ignore").read()
            Nsentences = str(textstat.sentence_count(fullText))
            # print('TOTAL number of sentences: ', Nsentences)
            Nwords = str(textstat.lexicon_count(fullText, removepunct=True))
            # print('TOTAL number of words: ', Nwords)
            Nsyllables = textstat.syllable_count(fullText, lang='en_US')
            # print('TOTAL number of Syllables: ', Nsyllables)
            # words = fullText.split()
            words = nltk.word_tokenize(fullText)
            if excludeStopWords:
                words = excludeStopWords_list(words)
            if lemmatizeWords:
                lemmatizer = WordNetLemmatizer()
                text_vocab = set(
                    lemmatizer.lemmatize(w.lower())
                    for w in fullText.split(" ") if w.isalpha())
                words = set(
                    lemmatizing(w.lower())
                    for w in words if w.isalpha())  # fullText.split(" ") if w.isalpha())
file_name_xl = './' + file[:-4] + ' - Clause Analysis.xls'
print('working on:', file)

# Read in CSV
df = pd.read_csv('./Data/' + file, header=0)
text_data = np.array(df.iloc[:, 3])
pod_id = np.array(df.iloc[:, 0])
speaker_id = np.array(df.iloc[:, 2])

sentence_complexity = []
counted_clauses = []
instance_ids = []
for i, instance in enumerate(text_data):
    # calculate sentence complexity
    no_clauses = count_clauses(instance)
    counted_clauses.append(no_clauses)
    sentence_complexity.append(no_clauses / textstat.sentence_count(instance))
    inst_id = str(pod_id[i]) + '_' + str(speaker_id[i])
    instance_ids.append(inst_id)

# make dictionary of results
clause_analysis = {
    'instance': instance_ids,
    'number of clauses': counted_clauses,
    'sentence complexity': sentence_complexity
}

# export to excel sheet
new_df = pd.DataFrame(clause_analysis,
                      columns=['instance', 'number of clauses', 'sentence complexity'])
new_df.to_excel(file_name_xl, index=False, sheet_name='Clause')
def download(request):
    global tweetsList
    response = HttpResponse(content_type='application/x-download')
    response['Content-Disposition'] = 'attachment; filename="tweets.csv"'
    # set headers of csv
    fieldnames = ['datetime', 'last updated', 'original username',
                  'original screen name', 'original user location',
                  'original user verified', 'retweet', 'retweeter username',
                  'retweeter screen name', 'retweeter location',
                  'retweeter verified', 'text', 'comment',
                  # 'hashtags', 'urls', '#retweets', '#favorites', '#retweets of retweet',
                  'hashtags', 'urls', '#retweets', '#favorites',
                  '#favorites of retweet',
                  'original syllable count', 'original lexicon count',
                  'original sentence count', 'original flesch reading ease score',
                  'original flesch-kincaid grade level', 'original fog scale',
                  'original smog index', 'original automated readability index',
                  'original coleman-liau index', 'original linsear write level',
                  'original dale-chall readability score', 'original difficult words',
                  'original readability consensus', 'original neg sentiment',
                  'original neu sentiment', 'original pos sentiment',
                  'original overall sentiment',
                  'comment syllable count', 'comment lexicon count',
                  'comment sentence count', 'comment flesch reading ease score',
                  'comment flesch-kincaid grade level', 'comment fog scale',
                  'comment smog index', 'comment automated readability index',
                  'comment coleman-liau index', 'comment linsear write level',
                  'comment dale-chall readability score', 'comment difficult words',
                  'comment readability consensus', 'comment neg sentiment',
                  'comment neu sentiment', 'comment pos sentiment',
                  'comment overall sentiment',
                  'combined syllable count', 'combined lexicon count',
                  'combined sentence count', 'combined flesch reading ease score',
                  'combined flesch-kincaid grade level', 'combined fog scale',
                  'combined smog index', 'combined automated readability index',
                  'combined coleman-liau index', 'combined linsear write level',
                  'combined dale-chall readability score', 'combined difficult words',
                  'combined readability consensus', 'combined neg sentiment',
                  'combined neu sentiment', 'combined pos sentiment',
                  'combined overall sentiment',
                  'twitter users query', 'twitter excluded users query',
                  'twitter hashtags query', 'twitter keywords query',
                  'twitter from date query', 'twitter to date query']
    writer = csv.writer(response, delimiter=',', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(fieldnames)
    for tweet in tweetsList:
        # combine hashtags of tweet into string separated by commas
        hashtagString = ""
        tweetHashtags = HashtagLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetHashtags)):
            if i == 0:
                hashtagString += tweetHashtags[i].hashtag.hashtagText
            else:
                hashtagString += ", " + tweetHashtags[i].hashtag.hashtagText
        # combine urls of tweet into string separated by commas
        urlString = ""
        tweetUrls = UrlLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetUrls)):
            if i == 0:
                urlString += tweetUrls[i].url.urlText
            else:
                urlString += ", " + tweetUrls[i].url.urlText
        # display yes or no in verified column for original user
        if tweet.originalUser.isVerified:
            originalVerifiedString = "yes"
        else:
            originalVerifiedString = "no"
        # if not a retweet, new user fields should be empty
        newUsername = None
        newScreenName = None
        newLocation = None
        newVerifiedString = None
        # if retweet: display yes or no in verified column for new user
        if tweet.newUser:
            if tweet.newUser.isVerified:
                newVerifiedString = "yes"
            else:
                newVerifiedString = "no"
            # set retweet fields
            newUsername = tweet.newUser.username
            newScreenName = tweet.newUser.screenName
            newLocation = tweet.newUser.location
        # display yes or no in retweet column
        if tweet.isRetweet:
            isRetweetString = "yes"
        else:
            isRetweetString = "no"
        # get sentiment scores of original text
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict_original = sid_obj.polarity_scores(tweet.originalText)
        # combine comment text and original text and get sentiment scores for the combination
        commentText = ""
        if tweet.commentText:
            commentText = tweet.commentText
        sentiment_dict_combined = sid_obj.polarity_scores(tweet.originalText + commentText)
        # initialize all comment word processing to empty strings in case there is no comment text
        cSyllableCount = ""
        cLexiconCount = ""
        cSentenceCount = ""
        cFleschReadingEase = ""
        cFleschKincaidGrade = ""
        cGunningFog = ""
        cSmogIndex = ""
        cAutomatedReadabilityIndex = ""
        cColemanLiauIndex = ""
        cLinsearWriteFormula = ""
        cDaleChallReadabilityScore = ""
        cDifficultWords = ""
        cTextStandard = ""
        # if there is comment text, get language processing stats for comment text
        if tweet.commentText is not None:
            cSyllableCount = textstat.syllable_count(tweet.commentText, lang='en_US')
            cLexiconCount = textstat.lexicon_count(tweet.commentText, removepunct=True)
            cSentenceCount = textstat.sentence_count(tweet.commentText)
            cFleschReadingEase = textstat.flesch_reading_ease(tweet.commentText)
            cFleschKincaidGrade = textstat.flesch_kincaid_grade(tweet.commentText)
            cGunningFog = textstat.gunning_fog(tweet.commentText)
            cSmogIndex = textstat.smog_index(tweet.commentText)
            cAutomatedReadabilityIndex = textstat.automated_readability_index(tweet.commentText)
            cColemanLiauIndex = textstat.coleman_liau_index(tweet.commentText)
            cLinsearWriteFormula = textstat.linsear_write_formula(tweet.commentText)
            cDaleChallReadabilityScore = textstat.dale_chall_readability_score(tweet.commentText)
            cDifficultWords = textstat.difficult_words(tweet.commentText)
            cTextStandard = textstat.text_standard(tweet.commentText, float_output=False)
        # get sentiment scores for comment text
        cNegSent = ""
        cNeuSent = ""
        cPosSent = ""
        cCompoundSent = ""
        if tweet.commentText:
            sentiment_dict_comment = sid_obj.polarity_scores(tweet.commentText)
            cNegSent = sentiment_dict_comment['neg']
            cNeuSent = sentiment_dict_comment['neu']
            cPosSent = sentiment_dict_comment['pos']
            cCompoundSent = sentiment_dict_comment['compound']
        # write all information about the tweet, and its language processing stats to row in csv
        writer.writerow(
            [tweet.createdAt, tweet.lastUpdated, tweet.originalUser.username,
             tweet.originalUser.screenName, tweet.originalUser.location,
             originalVerifiedString, isRetweetString, newUsername, newScreenName,
             newLocation, newVerifiedString, tweet.originalText, tweet.commentText,
             hashtagString, urlString, tweet.numRetweetsOriginal,
             # tweet.numFavoritesOriginal, tweet.numRetweetsNew, tweet.numFavoritesNew,
             tweet.numFavoritesOriginal, tweet.numFavoritesNew,
             textstat.syllable_count(tweet.originalText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText, removepunct=True),
             textstat.sentence_count(tweet.originalText),
             textstat.flesch_reading_ease(tweet.originalText),
             textstat.flesch_kincaid_grade(tweet.originalText),
             textstat.gunning_fog(tweet.originalText),
             textstat.smog_index(tweet.originalText),
             textstat.automated_readability_index(tweet.originalText),
             textstat.coleman_liau_index(tweet.originalText),
             textstat.linsear_write_formula(tweet.originalText),
             textstat.dale_chall_readability_score(tweet.originalText),
             textstat.difficult_words(tweet.originalText),
             textstat.text_standard(tweet.originalText, float_output=False),
             sentiment_dict_original['neg'],
             sentiment_dict_original['neu'], sentiment_dict_original['pos'],
             sentiment_dict_original['compound'],
             cSyllableCount, cLexiconCount, cSentenceCount, cFleschReadingEase,
             cFleschKincaidGrade, cGunningFog, cSmogIndex,
             cAutomatedReadabilityIndex, cColemanLiauIndex, cLinsearWriteFormula,
             cDaleChallReadabilityScore, cDifficultWords, cTextStandard,
             cNegSent, cNeuSent, cPosSent, cCompoundSent,
             textstat.syllable_count(tweet.originalText + commentText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText + commentText, removepunct=True),
             textstat.sentence_count(tweet.originalText + commentText),
             textstat.flesch_reading_ease(tweet.originalText + commentText),
             textstat.flesch_kincaid_grade(tweet.originalText + commentText),
             textstat.gunning_fog(tweet.originalText + commentText),
             textstat.smog_index(tweet.originalText + commentText),
             textstat.automated_readability_index(tweet.originalText + commentText),
             textstat.coleman_liau_index(tweet.originalText + commentText),
             textstat.linsear_write_formula(tweet.originalText + commentText),
             textstat.dale_chall_readability_score(tweet.originalText + commentText),
             textstat.difficult_words(tweet.originalText + commentText),
             textstat.text_standard(tweet.originalText + commentText, float_output=False),
             sentiment_dict_combined['neg'], sentiment_dict_combined['neu'],
             sentiment_dict_combined['pos'], sentiment_dict_combined['compound'],
             tweet.twitterQueryUsers, tweet.twitterQueryNotUsers,
             tweet.twitterQueryHashtags, tweet.twitterQueryKeywords,
             tweet.twitterQueryFromDate, tweet.twitterQueryToDate]
        )
    return response
def sentence_count(cleanedtext):
    # Initially used TextBlob's splitter, but it tends to overcount:
    # blob = TextBlob(cleanedtext)
    # No_Of_Sentences = len(blob.sentences)
    return textstat.sentence_count(cleanedtext)
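# A quick way to see the overcounting mentioned above; a minimal sketch, and
# the exact numbers depend on textblob/textstat versions, so no fixed counts
# are asserted here.
from textblob import TextBlob
import textstat

tricky = "Dr. Smith arrived at 9 a.m. today. He left early."
# TextBlob's Punkt-based splitter may break on the abbreviations,
# reporting more sentences than the text really has.
print(len(TextBlob(tricky).sentences))
# textstat's regex heuristic is more conservative on input like this.
print(textstat.sentence_count(tricky))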
def process_data(contents):
    ############ Word Tokenization
    ## Raw tokens: including punctuations, numbers etc.
    tokens = word_tokenize(contents)
    ## Convert all words into small cases
    ## Keep tokens that purely consist of alphabetic characters only
    ## Delete single-character words except for 'I'
    words = [
        w.lower() for w in tokens if w.isalpha() and len(w) > 1 or w == 'i'
    ]
    print(
        f'contents length:{len(contents)},tokens:{len(tokens)},words:{len(words)}'
    )
    ########### Delete words with length smaller than 1% and larger than 99% of the document
    # wordlen99 = np.quantile([len(w) for w in words], 0.99)
    # wordlen1 = np.quantile([len(w) for w in words], 0.01)
    # words = [w for w in words if len(w) < wordlen99 and len(w) > wordlen1]
    vocab = sorted(set(words))
    ########### Save text statistics
    ##### 1. nw 2. nvocab 3. nsyllable 4. nsentence 5. tone 6. readability
    ## 1. nw
    nw.append(len(words))
    ## 2. nvocab
    nvocab.append(len(vocab))
    ## 3. syllable
    nsyllable.append(textstat.syllable_count(contents))
    ## 4. sentence
    nsentence.append(textstat.sentence_count(contents))
    ## 5. tone
    ### LM dictionary
    n_neg_lm.append(count_occurrence3(words, lm_neg))
    n_pos_lm.append(count_occurrence3(words, lm_pos))
    n_uctt_lm.append(count_occurrence3(words, lm_uctt))
    n_lit_lm.append(count_occurrence3(words, lm_lit))
    # n_cstr_lm.append(count_occurrence(words, lm_cstr))
    # n_modal1_lm.append(count_occurrence(words, lm_modal1))
    # n_modal2_lm.append(count_occurrence(words, lm_modal2))
    # n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation3(words, lm_pos, gt_negation))
    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence3(words, gi_neg))
    n_pos_gi.append(count_occurrence3(words, gi_pos))
    n_negation_gi.append(count_negation3(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence3(words, hr_neg))
    n_pos_hr.append(count_occurrence3(words, hr_pos))
    n_negation_hr.append(count_negation3(words, gi_pos, gt_negation))
    ## 6. readability
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)
    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    # RIX
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)
    fog.append(textstat.gunning_fog(contents))
    ari.append(textstat.automated_readability_index(contents))
    smog.append(textstat.smog_index(contents))
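# do_datas() and process_data() both clamp readability scores the same way:
# cap Flesch reading ease at 100 and map impossible negative values to NaN so
# they drop out of later aggregation. A factored-out sketch (clamp_score is my
# name, not from the source):
import textstat

def clamp_score(value, lower=0.0, upper=None):
    # Cap at `upper` when given; values below `lower` become NaN.
    if upper is not None and value > upper:
        return upper
    if value < lower:
        return float("nan")
    return value

contents = "Readability formulas can return out-of-range values on odd input."
fre_i = clamp_score(textstat.flesch_reading_ease(contents), upper=100)
fkg_i = clamp_score(textstat.flesch_kincaid_grade(contents))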
def test_sentence_count():
    count = textstat.sentence_count(long_test)
    assert count == 16
def compute_corpus_statistics(window, inputFilename, inputDir, outputDir,
                              openOutputFiles, createExcelCharts,
                              excludeStopWords=True, lemmatizeWords=True):
    filesToOpen = []
    outputFilenameCSV = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'corpus', 'stats')
    filesToOpen.append(outputFilenameCSV)
    inputDocs = IO_files_util.getFileList(inputFilename, inputDir, fileType='.txt')
    # read_line(inputFilename, inputDir, outputDir)
    # return
    Ndocs = str(len(inputDocs))
    fieldnames = [
        'Number of documents in corpus', 'Document ID', 'Document',
        'Number of Sentences in Document', 'Number of Words in Document',
        'Number of Syllables in Document',
        'Word1', 'Frequency1', 'Word2', 'Frequency2', 'Word3', 'Frequency3',
        'Word4', 'Frequency4', 'Word5', 'Frequency5', 'Word6', 'Frequency6',
        'Word7', 'Frequency7', 'Word8', 'Frequency8', 'Word9', 'Frequency9',
        'Word10', 'Frequency10', 'Word11', 'Frequency11', 'Word12', 'Frequency12',
        'Word13', 'Frequency13', 'Word14', 'Frequency14', 'Word15', 'Frequency15',
        'Word16', 'Frequency16', 'Word17', 'Frequency17', 'Word18', 'Frequency18',
        'Word19', 'Frequency19', 'Word20', 'Frequency20'
    ]
    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running corpus statistics at', True)
    with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore',
              newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # print("Number of corpus text documents: ", Ndocs)
        # currentLine.append([Ndocs])
        index = 0
        for doc in inputDocs:
            head, tail = os.path.split(doc)
            index = index + 1
            # currentLine.append([index])
            print("Processing file " + str(index) + "/" + str(Ndocs) + " " + tail)
            # currentLine.append([doc])
            fullText = open(doc, "r", encoding="utf-8", errors="ignore").read()
            Nsentences = str(textstat.sentence_count(fullText))
            # print('TOTAL number of sentences: ', Nsentences)
            Nwords = str(textstat.lexicon_count(fullText, removepunct=True))
            # print('TOTAL number of words: ', Nwords)
            Nsyllables = textstat.syllable_count(fullText, lang='en_US')
            # print('TOTAL number of Syllables: ', Nsyllables)
            # words = fullText.split()
            words = nltk.word_tokenize(fullText)
            if excludeStopWords:
                words = excludeStopWords_list(words)
            if lemmatizeWords:
                lemmatizer = WordNetLemmatizer()
                text_vocab = set(
                    lemmatizer.lemmatize(w.lower())
                    for w in fullText.split(" ") if w.isalpha())
                words = set(
                    lemmatizing(w.lower())
                    for w in words if w.isalpha())  # fullText.split(" ") if w.isalpha())
            word_counts = Counter(words)
            # 20 most frequent words
            # print("\n\nTOP 20 most frequent words ----------------------------")
            # for item in word_counts.most_common(20):
            #     print(item)
            # currentLine = [[Ndocs, index, doc, Nsentences, Nwords, Nsyllables]]
            currentLine = [[
                Ndocs, index,
                IO_csv_util.dressFilenameForCSVHyperlink(doc),
                Nsentences, Nwords, Nsyllables
            ]]
            for item in word_counts.most_common(20):
                currentLine[0].append(item[0])  # word
                currentLine[0].append(item[1])  # frequency
            writer = csv.writer(csvfile)
            writer.writerows(currentLine)
    csvfile.close()
    # compute statistics about doc length grouped by Document
    list = ['Document ID']
    tempOutputfile = statistics_csv_util.compute_field_statistics_groupBy(
        window, outputFilenameCSV, outputDir, list, openOutputFiles,
        createExcelCharts, 4)  # ,4)  # 'number of words in doc'
    if tempOutputfile != None:
        filesToOpen.extend(tempOutputfile)
    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis end',
        'Finished running corpus statistics at', True)
    if createExcelCharts == True:
        columns_to_be_plotted = [[1, 3], [1, 4]]
        hover_label = ['Document', 'Document']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, inputFilename, outputDir,
            outputFileLabel='corpus_stats',
            chart_type_list=["bar"],
            # chart_title='Corpus statistics\nCorpus directory: ' + inputDir,
            chart_title='Corpus Statistics: Frequency of Sentences & Words by Document',
            column_xAxis_label_var='Document',
            hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)
    # TODO
    # we should create 10 classes of values by distance to the median of
    # each value in the Number of Words in Document Col. E
    # 0-10 11-20 21-30, ... 91-100
    # and plot them as column charts.
    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
    return filesToOpen
def avg_sentence_length(text):
    words = word_count(text)
    sentences = textstat.sentence_count(text)
    average_sentence_length = float(words / sentences)
    return average_sentence_length
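# A self-contained variant of the helper above, substituting
# textstat.lexicon_count for the snippet's word_count helper (an assumption;
# word_count is not shown) and guarding the division for robustness:
import textstat

def avg_sentence_length_safe(text: str) -> float:
    # Average words per sentence; 0.0 for text with no countable sentences.
    sentences = textstat.sentence_count(text)
    if sentences == 0:  # defensive; textstat normally reports at least 1
        return 0.0
    words = textstat.lexicon_count(text, removepunct=True)
    return words / sentences

print(avg_sentence_length_safe("One sentence here. And then a second one."))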
instance_ids = []
ttr = []
mtld = []
number_of_words = []
readability = []
unique_words = []
avg_sentence_length = []
avg_word_length = []
for i, instance in enumerate(text_data):
    # lexical richness instance for entry before lemmatization and stopword removal
    lex_with_stopwords = LexicalRichness(instance)
    number_of_words.append(lex_with_stopwords.words)
    # mean sentence length
    mean_sentence_len = int(lex_with_stopwords.words / textstat.sentence_count(instance))
    avg_sentence_length.append(mean_sentence_len)
    # mean word length
    num_chars = sum([len(w) for w in tokenizer.tokenize(instance)])
    mean_word_len = round(num_chars / lex_with_stopwords.words, 1)
    avg_word_length.append(mean_word_len)
    # readability
    readability.append(textstat.flesch_reading_ease(instance))
    # remove stopwords & lemmatize
    lemmatizer = WordNetLemmatizer()
    instance_no_stopwords = remove_stopwords(instance)
    new_instance = ' '.join([
        lemmatizer.lemmatize(w)
def compute_syntactic(df, col):
    for ind, row in df.iterrows():
        df.loc[ind, 'syllable_count'] = textstat.syllable_count(str(row[col]))
        # df.loc[ind, 'lexicon_count'] = textstat.lexicon_count(str(row[col]), removepunct=True)
        df.loc[ind, 'sentence_count'] = textstat.sentence_count(str(row[col]))
    return df
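# A toy check of compute_syntactic (the 'review' column name is illustrative):
import pandas as pd
import textstat

sample_df = pd.DataFrame({"review": ["Great phone. Battery lasts for ages.", "Terrible."]})
sample_df = compute_syntactic(sample_df, "review")
print(sample_df[["syllable_count", "sentence_count"]])
# For larger frames, df[col].apply(textstat.sentence_count) avoids the
# row-by-row .loc writes, though the loop above is fine at small scale.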
def get_redability_assessments(data_text: str) -> Optional[dict]:
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)
    uniqueWordCount = compute_unique_word_count(f_dist.most_common())
    paragraphCount = max(len(data_text.split('\n')), len(data_text.split('\r\n')))
    counts = Counter(tag for word, tag in pos_tags)

    # Readability Grade Levels
    readability_grade_levels = dict(fleschKincaid=0, gunningFog=0, colemanLiau=0,
                                    smog=0, ari=0, forecastGradeLevel=0,
                                    powersSumnerKearlGrade=0, rix=0,
                                    raygorReadability=0, fryReadability=0, flesch=0)
    readability_grade_levels.update(fleschKincaid=textstat.flesch_kincaid_grade(data_text))
    readability_grade_levels.update(gunningFog=textstat.gunning_fog(data_text))
    readability_grade_levels.update(colemanLiau=textstat.coleman_liau_index(data_text))
    readability_grade_levels.update(smog=textstat.smog_index(data_text))
    readability_grade_levels.update(ari=textstat.automated_readability_index(data_text))
    readability_grade_levels.update(rix=textstat.rix(data_text))
    # need to check
    readability_grade_levels.update(
        forecastGradeLevel=round(20 - (textstat.avg_syllables_per_word(data_text) / 10), 2))
    readability_grade_levels.update(
        powersSumnerKearlGrade=round(textstat.avg_sentence_length(data_text) +
                                     textstat.avg_syllables_per_word(data_text) + 2.7971, 2))
    readability_grade_levels.update(raygorReadability=count_raygor_readability(divided_text))
    readability_grade_levels.update(fryReadability=count_fry_readability(divided_text))
    # need to check
    readability_grade_levels.update(flesch=textstat.flesch_reading_ease(data_text))

    # Readability Scores
    readability_scores = dict(readableRating="", fleschReadingEase=0, cefrLevel='',
                              ieltsLevel='', spacheScore=0, newDaleChallScore=0,
                              lixReadability=0, lensearWrite=0)
    readability_scores.update(readableRating=count_average_grade_levels(readability_grade_levels))
    readability_scores.update(fleschReadingEase=textstat.flesch_reading_ease(data_text))
    readability_scores.update(cefrLevel=count_cefr_levels(readability_grade_levels))
    readability_scores.update(ieltsLevel=count_ielts_levels(readability_grade_levels))
    readability_scores.update(spacheScore=round(textstat.spache_readability(data_text), 2))
    readability_scores.update(newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text))
    readability_scores.update(lixReadability=textstat.lix(data_text))
    readability_scores.update(lensearWrite=textstat.linsear_write_formula(data_text))

    # Text Statistics
    text_statistics = dict(characterCount=0, syllableCount=0, wordCount=0,
                           uniqueWordCount=0, sentenceCount=0, paragraphCount=0)
    text_statistics.update(characterCount=textstat.char_count(data_text))
    text_statistics.update(syllableCount=textstat.syllable_count(data_text))
    text_statistics.update(wordCount=textstat.lexicon_count(data_text))
    text_statistics.update(uniqueWordCount=uniqueWordCount)
    text_statistics.update(sentenceCount=textstat.sentence_count(data_text))
    text_statistics.update(paragraphCount=paragraphCount)

    # Timings
    timings_statistics = dict(readingTime=0, speakingTime=0)
    timings_statistics.update(readingTime=reading_time(textstat.lexicon_count(data_text)))
    timings_statistics.update(speakingTime=speaking_time(textstat.lexicon_count(data_text)))

    # Text Composition
    text_composition = dict(adjectives=0, adverbs=0, conjunctions=0, determiners=0,
                            interjections=0, nouns=0, verbs=0, properNouns=0,
                            prepositions=0, pronouns=0,
                            qualifiers=0, unrecognised=0, nonWords=0)
    text_composition.update(adjectives=counts.get('JJ', 0) + counts.get('JJR', 0) + counts.get('JJS', 0))
    text_composition.update(adverbs=counts.get('RB', 0) + counts.get('RBR', 0) + counts.get('RBS', 0))
    text_composition.update(conjunctions=counts.get('CC', 0))
    text_composition.update(determiners=counts.get('DT', 0) + counts.get('PDT', 0) + counts.get('WDT', 0))
    text_composition.update(interjections=counts.get('UH', 0))
    text_composition.update(nouns=counts.get('NN', 0) + counts.get('NNS', 0))
    text_composition.update(
        verbs=counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0) +
              counts.get('VBN', 0) + counts.get('VBP', 0) + counts.get('VBZ', 0))
    text_composition.update(properNouns=counts.get('NNP', 0) + counts.get('NNPS', 0))
    text_composition.update(prepositions=counts.get('IN', 0))
    text_composition.update(
        pronouns=counts.get('PRP', 0) + counts.get('PRP$', 0) + counts.get('WP', 0) + counts.get('WP$', 0))
    text_composition.update(qualifiers=counts.get('RB', 0))
    text_composition.update(unrecognised=counts.get(None, 0))
    text_composition.update(nonWords=counts.get('.', 0) + counts.get(',', 0) + counts.get(':', 0))

    # Readability Issues
    text_readability_issues = dict(sentences30SyllablesCount=0, sentences20SyllablesCount=0,
                                   sentences30Syllables=[], sentences20Syllables=[],
                                   words4SyllablesCount=0, words12LettersCount=0,
                                   words4Syllables=[], words12Letters=[])
    sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count = \
        count_sentences_syllables(divided_text)
    sentences_30_syllables = find_limit_offcet(
        data_text, sentences_30_syllables,
        "sentences_30_syllables", "sentences_30_syllables",
        "This sentence has more than 30 syllables. Consider rewriting it to be "
        "shorter or splitting it into smaller sentences.",
        "Readability Issues")
    sentences_20_syllables = find_limit_offcet(
        data_text, sentences_20_syllables,
        "sentences_20_syllables", "sentences_20_syllables",
        "This sentence has more than 20 syllables. Consider rewriting it to be "
        "shorter or splitting it into smaller sentences.",
        "Readability Issues")
    text_readability_issues.update(sentences30SyllablesCount=sentences_30_count,
                                   sentences20SyllablesCount=sentences_20_count)
    words_12_letters, words_12_count, words_4_syllables, words_4_count = \
        words_sentence_syllables(divided_text)
    words_12_letters = find_limit_offcet(data_text, words_12_letters,
                                         "words_12_letters", "words_12_letters",
                                         "This word is more than 12 letters",
                                         "Readability Issues")
    words_4_syllables = find_limit_offcet(data_text, words_4_syllables,
                                          "words_4_syllables", "words_4_syllables",
                                          "This word is more than 4 syllables",
                                          "Readability Issues")
    text_readability_issues.update(words4SyllablesCount=words_4_count,
                                   words12LettersCount=words_12_count)

    # Writing Style Issues
    text_style_issues = dict(passiveVoiceCount=0, passiveVoices=[],
                             adverbsCount=0, adverbs=[],
                             clicheCount=0, cliches=[])
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(data_text, passive_voises_return,
                                              "passive_voises", "passive_voises",
                                              "Too many uses of the passive voice",
                                              "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(data_text, adverbs_return,
                                       "adverbs",  # writing_style_issues
                                       "adverbs",
                                       "Too many adverbs",
                                       "Writing Style Issues")
    text_style_issues.update(passiveVoiceCount=len(passive_voises_return),
                             adverbsCount=len(adverbs_return))

    # Text Density Issues
    text_density_issues = dict(charactersPerWord=0, syllablesPerWord=0,
                               wordsPerSentence=0, wordsPerParagraph=0,
                               sentencesPerParagraph=0)
    text_density_issues.update(
        charactersPerWord=textstat.avg_character_per_word(data_text),
        syllablesPerWord=textstat.avg_syllables_per_word(data_text),
        wordsPerSentence=round(textstat.lexicon_count(data_text) / len(divided_text), 2),
        wordsPerParagraph=round(textstat.lexicon_count(data_text) / paragraphCount, 2),
        sentencesPerParagraph=round(len(divided_text) / paragraphCount, 2))

    # Language Issues
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    matches_limit_offcet = sentences_20_syllables + sentences_30_syllables + \
        words_4_syllables + words_12_letters + \
        passive_voises_return + adverbs_return

    return dict(readabilityGradeLevels=readability_grade_levels,
                readabilityScores=readability_scores,
                textStatistics=text_statistics,
                timings=timings_statistics,
                textComposition=text_composition,
                textReadabilityIssues=text_readability_issues,
                textStyleIssues=text_style_issues,
                textDensityIssues=text_density_issues,
                textLanguageIssues=text_language_issues,
                matches=matches_limit_offcet)
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

df_clean_train['subjectivity'] = df_clean_train['Sentence'].apply(sub)
df_clean_train['polar'] = df_clean_train['Sentence'].apply(pol)
df_clean_train['Polarity'] = df_train['Polarity']
df_clean_train['review_len'] = df_clean_train['Sentence'].astype(str).apply(len)
df_clean_train['word_count'] = df_clean_train['Sentence'].apply(lambda x: len(str(x).split()))
df_clean_train['syllable_count'] = df_clean_train['Sentence'].apply(lambda x: textstat.syllable_count(x))
df_clean_train['lexicon_count'] = df_clean_train['Sentence'].apply(lambda x: textstat.lexicon_count(x))
df_clean_train['sentence_count'] = df_clean_train['Sentence'].apply(lambda x: textstat.sentence_count(x))
df_clean_train['flesch_reading_ease'] = df_clean_train['Sentence'].apply(lambda x: textstat.flesch_reading_ease(x))
df_clean_train['flesch_kincaid_grade'] = df_clean_train['Sentence'].apply(lambda x: textstat.flesch_kincaid_grade(x))
df_clean_train['gunning_fog'] = df_clean_train['Sentence'].apply(lambda x: textstat.gunning_fog(x))
df_clean_train.head()

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=.02, max_df=.5, ngram_range=[1, 3],
                                   max_features=500, stop_words='english')
dtm_tfidf = tfidf_vectorizer.fit_transform(df_clean_train['Sentence'])
bow_df_tfidf = pd.DataFrame(dtm_tfidf.toarray(),
                            columns=tfidf_vectorizer.get_feature_names(),
                            index=df_clean_train.index)
bow_df_tfidf.shape
df_bow_tfidf = pd.concat([df_clean_train, bow_df_tfidf], axis=1)
def main(dir: str):
    checker = language_tool_python.LanguageTool('en-US')
    emails = {}
    totalWords = ''
    filenames = [
        filename for filename in os.listdir(dir) if filename.endswith('.eml')
    ]
    for filename in filenames:
        print()
        print('[INFO] Processing {}...'.format(filename))
        with open(os.path.join(dir, filename), 'r', encoding='latin1') as file:
            try:
                mail = mailparser.parse_from_file_obj(file)
            except Exception as e:
                print('[WARNING] Error while parsing: {}'.format(e))
                continue
        # filter duplicates based on subject
        # if mail.subject in emails:
        #     print('[WARNING] This email seems to be a duplicate of "{}"! Skipping...'
        #           .format(emails[mail.subject]['filename']))
        #     continue
        # don't process if auth results missing
        # if 'Authentication-Results' not in mail.headers:
        #     print('[WARNING] This email is missing an authentication results header! Skipping...')
        #     continue
        attachments = ''
        for attachment in mail.attachments:
            attachment['filename'] = re.sub(r'<|>', '', attachment['filename'])
        try:
            mail.write_attachments(dir)
            for attachment in mail.attachments:
                if re.search('image', attachment['mail_content_type']):
                    if re.search('gif', attachment['mail_content_type']):
                        images, _, _ = gif2numpy.convert(
                            dir + '\\' + attachment['filename'])
                        img = images[0]
                    else:
                        img = cv2.imread(dir + '\\' + attachment['filename'])
                    img = cv2.resize(img, None, fx=1.2, fy=1.2,
                                     interpolation=cv2.INTER_CUBIC)
                    text = pytesseract.image_to_string(img)
                    attachments += text
                elif re.search('pdf', attachment['mail_content_type']):
                    encoding = chardet.detect(
                        pdf_to_text(dir + '\\' + attachment['filename']))['encoding']
                    attachments += pdf_to_text(
                        dir + '\\' + attachment['filename']).decode(encoding)
                # elif re.search('text', attachment['mail_content_type']):
                #     # print(chardet.detect((attachment['payload']).encode()))
                #     # encoding = chardet.detect(base64.b64decode(attachment['payload']).encode())['encoding']
                #     # attachments += base64.b64decode(attachment['payload']).decode(encoding)
                #     # print(codecs.encode(base64.b64decode(attachment['payload']), encoding=attachment['content_transfer_encoding']))
                #     attachments += attachment['payload']
                else:
                    attachments += attachment['payload']
                os.remove(dir + '\\' + attachment['filename'])
        except Exception as e:
            print('[WARNING] Error while parsing attachments: {}'.format(e))
            [
                os.remove(dir + '\\' + attachment['filename'])
                for attachment in mail.attachments
            ]
        body = mail.subject + ' ' + \
            remove_noise(BeautifulSoup(mail.body, 'lxml').get_text(separator=' ', strip=True) +
                         BeautifulSoup(attachments, 'lxml').get_text())
        blob = TextBlob(body)
        totalWords = totalWords + " " + body.lower()
        grammarErrors = checker.check(body)
        if 'Authentication-Results' in mail.headers:
            spf = re.findall(r'spf=(\S*)', mail.headers['Authentication-Results'])
            dkim = re.findall(r'dkim=(\S*)', mail.headers['Authentication-Results'])
            dmarc = re.findall(r'dmarc=(\S*)', mail.headers['Authentication-Results'])
        else:
            spf = dkim = dmarc = ''
        emails[filename] = {
            'filename': filename,
            # 'hops': mail.received[-1]['hop'],
            # 'totalDelay': sum([hop['delay'] / 60 for hop in mail.received]),
            'spf': spf[0] if len(spf) else None,
            'dkim': dkim[0] if len(dkim) else None,
            'dmarc': dmarc[0] if len(dmarc) else None,
            'subject': mail.subject,
            'from': mail.from_[0][1],
            'to': [tup[1] for tup in mail.to],
            'replyTo': [tup[1] for tup in mail.reply_to],
            'attachments': [x['filename'] for x in mail.attachments],
            'grammarErrors': len(grammarErrors),
            'counts': {
                'characterCount': len(body),
                'wordCount': textstat.lexicon_count(body),
                'sentenceCount': textstat.sentence_count(body)
            },
            'readability': {
                'flesch_kincaid': textstat.flesch_kincaid_grade(body),
                'gunning_fog': textstat.gunning_fog(body),
                'smog_index': textstat.smog_index(body),
                'automated_readability_index': textstat.automated_readability_index(body),
                'coleman_liau_index': textstat.coleman_liau_index(body),
                'linsear_write': textstat.linsear_write_formula(body),
            },
            'sentiment': {
                'polarity': blob.sentiment.polarity,
                'subjectivity': blob.sentiment.subjectivity
            }
        }
        if save_body:
            emails[filename]['body'] = body

    ## quit if nothing found ##
    # if not emails:
    #     print('[WARNING] No files were found in "{}"!'.format(dir))
    #     return

    ## writing all words to file ##
    with open(os.path.join(dir, 'words.txt'), 'w', encoding='utf-8') as file:
        file.write(totalWords.lower())

    ## output json ##
    with open(os.path.join(dir, 'analysis.json'), 'w') as jsonFile:
        json.dump(emails, jsonFile, indent=2)

    ## build and output csv ##
    # generate and output headers using first email
    column_headers = list(flatten_json(emails[list(emails.keys())[0]]).keys())
    csvFile = open(os.path.join(dir, 'analysis.csv'), 'w', encoding='utf-8')
    csvFile.write(',{}\n'.format(','.join(column_headers)))
    # generate and output one line per email
    for email in emails.keys():
        # flatten json to 1 layer deep
        flattened_email = flatten_json(emails[email])
        # generate the values for this row
        csv_values = [
            '"' + str(flattened_email[column_header]) + '"'
            for column_header in column_headers
        ]
        # add email name and join w/ commas, then write out
        csvFile.write('{},{}\n'.format('"' + email + '"', ','.join(csv_values)))
    csvFile.close()
    # print out stats
    print('{}/{} processed. The remaining failed for some reason.'.format(
        len(emails), len(filenames)))
def sentence_count(text):
    return textstat.sentence_count(text)
signal.signal(signal.SIGINT, handler)

d = "/home/adulau/dess/12/01"
ld = os.listdir(d)
stats = {}
stats['hits'] = 0
stats['miss'] = 0

for f in ld:
    currentfile = os.path.join(d, f)
    with gzip.open(currentfile) as paste:
        content = paste.read().decode('utf-8')
        lexicon = textstat.lexicon_count(content, removepunct=True)
        syllabe = textstat.syllable_count(content, lang='en_US')
        sentence = textstat.sentence_count(content)
        # consensus = textstat.text_standard(content, float_output=False)
        # print("sentence={}, syllabe={}, lexicon={}, flesch_reading_score={},{}".format(
        #     sentence, syllabe, lexicon, textstat.flesch_reading_ease(content), currentfile))
        analysis = {}
        analysis['sentence'] = sentence
        analysis['syllabe'] = syllabe
        analysis['lexicon'] = lexicon
        analysis['flesch_reading_ease'] = textstat.flesch_reading_ease(content)
        analysis['filename'] = currentfile
        analysis['length'] = len(content)
        analysis['extract'] = content[:100]
        # rank = (analysis['flesch_reading_ease'] + analysis['flesch_reading_ease'] +
        #         analysis['lexicon']) * analysis['sentence']
        rank = analysis['flesch_reading_ease']
        if analysis['flesch_reading_ease'] >= 0 and analysis['flesch_reading_ease'] <= 900:
def test_sentence_count(self):
    count = textstat.sentence_count(self.long_test)
    self.assertEqual(16, count)
###############################################################################
# Readability scores: Greta Thunberg
import textstat
import numpy as np

# drop empty text fields
temp = greta.copy()
temp['text'].replace('', np.nan, inplace=True)
temp['text'].replace(' ', np.nan, inplace=True)
temp.dropna(subset=['text'], inplace=True)

temp['syl_count'] = temp.text.apply(lambda x: textstat.syllable_count(x))
temp['word_count'] = temp.text.apply(
    lambda x: textstat.lexicon_count(x, removepunct=True))
temp['sent_count'] = temp.text.apply(lambda x: textstat.sentence_count(x))
temp['score_fre'] = temp.text.apply(lambda x: textstat.flesch_reading_ease(x))
temp['score_are'] = temp.text.apply(
    lambda x: textstat.automated_readability_index(x))
temp['char_count'] = temp.text.apply(lambda x: len(x))

sns.distplot(temp.word_count, hist=True, kde=False, norm_hist=True,
             color='darkblue', hist_kws={'edgecolor': 'black'})

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
fig.subplots_adjust(hspace=.5)
sns.despine()
def test_sentence_count():
    textstat.set_lang("en_US")
    count = textstat.sentence_count(long_test)
    assert count == 16
]
for token in final_tokens:
    synset = wsd.lesk(sentence, token)
    if synset is not None:
        SynSets.append(synset)
        synsDic[synset] = token

SynSets = set(SynSets)
SynSets = sorted(SynSets)
with open("synsets.txt", "a") as file:
    file.write("\n---------------------\n")
    for synset in SynSets:
        file.write("{} -> {}\n".format(str(synset.__str__()), synsDic[synset]))
file.close()

sentencesCount = textstat.sentence_count(text)
print("Sentences: ", sentencesCount)
prepro = preprocess(text)
# for i in prepro:
#     print(i, "\n")
print("Tokens inc:", len(prepro))
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(prepro)
print("Tokens ex:", len(cs))
iob_tagged = tree2conlltags(cs)
ne_tree = ne_chunk(pos_tag(word_tokenize(text)))
nlp = en_core_web_sm.load()
def sentence_count(corpus):
    return np.array([textstat.sentence_count(doc) for doc in corpus]).reshape(-1, 1)
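# The reshape(-1, 1) turns the counts into a single feature column. A usage
# sketch wiring the function into scikit-learn (FunctionTransformer is my
# choice here, not something the snippet itself uses):
import numpy as np
import textstat
from sklearn.preprocessing import FunctionTransformer

def sentence_count(corpus):
    return np.array([textstat.sentence_count(doc) for doc in corpus]).reshape(-1, 1)

sentence_counter = FunctionTransformer(sentence_count, validate=False)
X = sentence_counter.transform(["One sentence.", "Two sentences. Right here."])
print(X.shape)  # (2, 1): one sentence-count column per document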
# print(downloaded)
if url != 0 and downloaded:
    from readability.readability import Document
    from html2text import html2text

    readable_article = Document(downloaded).summary()
    raw = html2text(readable_article)
    print(raw)

    # Lexicon Count - number of words present in the text
    lexicon_count = textstat.lexicon_count(raw, removepunct=True)
    worksheet.update("C" + row, lexicon_count)

    # Sentence Count
    sentence_count = textstat.sentence_count(raw)
    worksheet.update("D" + row, sentence_count)

    # The Flesch Reading Ease formula
    # 90-100 - Very Easy | 80-89 - Easy | 70-79 - Fairly Easy | 60-69 - Standard
    # 50-59 - Fairly Difficult | 30-49 - Difficult | 0-29 - Very Confusing
    flesch_reading_ease = textstat.flesch_reading_ease(raw)
    worksheet.update("E" + row, flesch_reading_ease)

    # Flesch-Kincaid Grade Level
    # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
    flesch_kincaid_grade = textstat.flesch_kincaid_grade(raw)
    worksheet.update("F" + row, flesch_kincaid_grade)

    # The Fog Scale (Gunning FOG Formula)
    # https://en.wikipedia.org/wiki/Gunning_fog_index