def count_fry_readability(data_list):
    sentence_numbers = 0
    syllables_numbers = 0
    # count sentences and syllables over a ~150-word passage from the start
    words_count = 0
    for sentence in data_list:
        sentence_numbers += 1
        syllables_numbers += textstat.syllable_count(sentence)
        words_count += textstat.lexicon_count(sentence)
        if words_count >= 150:
            break
    # count sentences and syllables over a ~150-word passage from the end
    words_count = 0
    for sentence in reversed(data_list):
        sentence_numbers += 1
        syllables_numbers += textstat.syllable_count(sentence)
        words_count += textstat.lexicon_count(sentence)
        if words_count >= 150:
            break
    # average per ~100 words (roughly 300 words sampled in total)
    avg_sentence_numbers = round(sentence_numbers / 3)
    avg_syllables_numbers = round(syllables_numbers / 3)
    return get_value_from_fry_graph(avg_sentence_numbers, avg_syllables_numbers)
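# Hypothetical usage sketch (an assumption, not part of the original source):
# count_fry_readability expects a list of sentences, so a raw document would first
# be split, e.g. with nltk.sent_tokenize; get_value_from_fry_graph and textstat are
# assumed to be available from the surrounding module.
import nltk

sample_text = open("sample.txt", encoding="utf-8").read()  # illustrative file name
fry_grade = count_fry_readability(nltk.sent_tokenize(sample_text))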
def test_lexicon_count():
    textstat.set_lang("en_US")
    count = textstat.lexicon_count(long_test)
    count_punc = textstat.lexicon_count(long_test, removepunct=False)
    assert count == 372
    assert count_punc == 376
def lexicon_table():
    lst_1 = [
        textstat.lexicon_count(text) for text in insincere_questions.tolist()
    ]
    lst_2 = [
        textstat.lexicon_count(text) for text in sincere_questions.tolist()
    ]
    table = build_table(lst_1, lst_2)
    py.plot(table, filename='lexicon_table')
def build_lexicon_dict(lst):
    # map word count -> fraction of texts with that word count, sorted by count
    dct = {}
    for t in lst:
        count = textstat.lexicon_count(t)
        if count in dct:
            dct[count] += 1
        else:
            dct[count] = 1
    for key in dct:
        dct[key] = dct[key] / len(lst)
    sorted_tuple = sorted(dct.items(), key=operator.itemgetter(0))
    return sorted_tuple
def transform(self, input_df: pd.DataFrame) -> coo_matrix:
    """
    Compute and return the linguistic features from the input DataFrame.

    The DataFrame must include the following attributes in its columns:
    Q_TEXT, Q_ID

    :param input_df:
    :return:
    """
    if Q_TEXT not in input_df.columns:
        raise ValueError("Q_TEXT should be in input_df.columns")
    if Q_ID not in input_df.columns:
        raise ValueError("Q_ID should be in input_df.columns")
    correct_ans_text_dict = gen_correct_answers_dict(input_df)
    wrong_ans_text_dict = gen_wrong_answers_dict(input_df)
    df = pd.DataFrame()
    df['lexicon_count_question'] = input_df.apply(
        lambda r: textstat.lexicon_count(r[Q_TEXT]), axis=1)
    df['lexicon_count_correct_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.lexicon_count(x) for x in correct_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['lexicon_count_wrong_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.lexicon_count(x) for x in wrong_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['sentence_count_question'] = input_df.apply(
        lambda r: textstat.sentence_count(r[Q_TEXT]), axis=1)
    df['sentence_count_correct_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.sentence_count(x) for x in correct_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['sentence_count_wrong_choices'] = input_df.apply(
        lambda r: np.mean([
            textstat.sentence_count(x) for x in wrong_ans_text_dict[r[Q_ID]]
        ]), axis=1)
    df['avg_word_len_question'] = input_df.apply(
        lambda r: np.mean([len(x) for x in r[Q_TEXT].split(' ')]), axis=1)
    df['ratio_len_question_correct_choices'] = df.apply(
        lambda r: (1 + r['lexicon_count_question'])
        / (1 + r['lexicon_count_correct_choices']), axis=1)
    df['ratio_len_question_wrong_choices'] = df.apply(
        lambda r: (1 + r['lexicon_count_question'])
        / (1 + r['lexicon_count_wrong_choices']), axis=1)
    return coo_matrix(df.values)
def feature_getter(text):
    try:
        text = text.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
        pass
    # strip non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # text = re.sub('\n', '. ', text)
    features = []
    tokens = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        tokens.extend(nltk.word_tokenize(sentence))
    syllable_count = textstat.syllable_count(text, lang='en_US')
    word_count = textstat.lexicon_count(text, removepunct=True)
    flesch = textstat.flesch_reading_ease(text)
    readability = textstat.automated_readability_index(text)
    features.append(len(sentences))   # num_sentences
    features.append(syllable_count)   # num_syllables
    features.append(word_count)       # num_words
    features.append(flesch)           # flesch_reading_ease
    features.append(readability)      # automated_readability_index
    return features
def textstat_stats(text):
    doc_length = len(text.split())
    flesch_ease = ts.flesch_reading_ease(text)  # Flesch Reading Ease score
    flesch_grade = ts.flesch_kincaid_grade(text)  # Flesch-Kincaid grade level
    gfog = ts.gunning_fog(text)  # FOG index, also indicates grade level
    # smog = ts.smog_index(text)  # SMOG index, also a grade level, only useful on 30+ sentences
    auto_readability = ts.automated_readability_index(text)  # approximates the grade level needed to comprehend the text
    cl_index = ts.coleman_liau_index(text)  # grade level of the text using the Coleman-Liau formula
    lw_formula = ts.linsear_write_formula(text)  # grade level using the Linsear Write formula
    dcr_score = ts.dale_chall_readability_score(text)  # uses a lookup table of the 3000 most commonly used English words
    # text_standard = ts.text_standard(text, float_output=False)  # summary of all the grade-level functions
    syll_count = ts.syllable_count(text, lang='en_US')
    syll_count_scaled = syll_count / doc_length
    lex_count = ts.lexicon_count(text, removepunct=True)
    lex_count_scaled = lex_count / doc_length
    idx = ['flesch_ease', 'flesch_grade', 'gfog',
           'auto_readability', 'cl_index', 'lw_formula',
           'dcr_score',
           # 'text_standard',
           'syll_count', 'lex_count']
    return pd.Series([flesch_ease, flesch_grade, gfog,
                      auto_readability, cl_index, lw_formula,
                      dcr_score,
                      # text_standard,
                      syll_count_scaled, lex_count_scaled],
                     index=idx)
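# Hypothetical usage sketch (an assumption, not from the original source): because
# textstat_stats returns a pd.Series, applying it over a text column expands into one
# feature column per index entry; the 'reviews' DataFrame and its 'text' column are
# illustrative only.
readability_features = reviews['text'].apply(textstat_stats)
reviews = reviews.join(readability_features)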
def calculate_stats(data_folder):
    """Calculate stats of the test.json file in a folder."""
    data_folder = Path(data_folder)
    for dataset in dataset_fields:
        print(f"loading {dataset}")
        field = dataset_fields[dataset]["text"].strip()
        sentences = []
        for item in json.load(open(data_folder / dataset / "test.json")):
            sentences.append(item[field][-1] if isinstance(item[field], list) else item[field])
        text = " ".join(sentences)
        lex_count = textstat.lexicon_count(text)
        print(lex_count)
        unique_words = count_words(text)
        print(f"all unique {len(unique_words)}")
        lower_unique_words = count_words(text, casing="lower")
        print(f"lowercase unique {len(lower_unique_words)}")
        upper_unique_words = count_words(text, casing="upper")
        print(f"uppercase unique {len(upper_unique_words)}")
        print(f"ratio {len(upper_unique_words) / len(unique_words)}")
        text_standard = textstat.text_standard(text, float_output=True)
        print(f"text_standard: {text_standard}")
        dale_chall_readability_score = textstat.dale_chall_readability_score(text)
        print(f"dale_chall_readability_score: {dale_chall_readability_score}")
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        print(f"flesch_kincaid_grade: {flesch_kincaid_grade}")
def textStatistics(text):
    """Return text statistics such as lexicon count and text standard in a tuple."""
    le_c = textstat.lexicon_count(text, removepunct=True)
    ts = textstat.text_standard(text, float_output=True)
    return le_c, ts
def statistics(self, text):
    self.asl = textstat.avg_sentence_length(text)
    self.avg_sentence_per_word = textstat.avg_sentence_per_word(text)
    self.avg_syllables_per_word = textstat.avg_syllables_per_word(text)
    self.difficult_words = textstat.difficult_words(text)
    self.lexicon_count = textstat.lexicon_count(text)
    self.polysyllable_count = textstat.polysyllabcount(text)
    self.sentence_count = textstat.sentence_count(text)
def flesch_kincaid(row):
    # Flesch Reading Ease: 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    text = row['reviewText']
    words = max(1, textstat.lexicon_count(text))
    sentences = max(1, sentence_count(row))
    syllables = textstat.syllable_count(text, lang='en_US')
    score = 206.835 - 1.015 * (float(words) / sentences) - 84.6 * (float(syllables) / words)
    return score
def get_raw_stats(book, text):
    return {
        'total_words': textstat.lexicon_count(text),
        'total_sentences': len(sent_tokenize(text)),
        'total_letters': textstat.letter_count(text),
        'total_syllables': textstat.syllable_count(text),
        # 'total_paragraphs': len(get_paragraphs(text)),
        # 'average_word_difficulty': get_average_frequency(book)
    }
def __init__(self, text):
    self.__doc = self.preprocess(text)
    self.__docWords = self.getTotalWords()
    self.__totalWords = textstat.lexicon_count(self.__doc, removepunct=True)
    self.__totalCharacters = textstat.char_count(self.__doc, ignore_spaces=True)
    self.__totalSentences = self.getSentencesCount()
    self.__totalSyllables = self.getSyllablesCount()
    # self.__totalSyllables = textstat.syllable_count(self.__doc)
    self.__polySyllableCount = self.getPolySyllableCount()
def count_raygor_readability(data_list):
    sentence_numbers = 0
    words_count_bigger_six = 0
    # count sentences and long words (more than six letters) over a ~50-word passage from the start
    words_count = 0
    for sentence in data_list:
        sentence_numbers += 1
        words_count += textstat.lexicon_count(sentence)
        words_count_bigger_six += len([1 for n in sentence.split() if len(n) > 6])
        if words_count >= 50:
            break
    # count sentences and long words over a ~50-word passage from the end
    words_count = 0
    for sentence in reversed(data_list):
        sentence_numbers += 1
        words_count += textstat.lexicon_count(sentence)
        words_count_bigger_six += len([1 for n in sentence.split() if len(n) > 6])
        if words_count >= 50:
            break
    return get_value_from_raygor_graph(sentence_numbers, words_count_bigger_six)
def feature_eng(df, text_col):
    org_cols = df.columns
    # length of original text
    df['Sent_length'] = df[text_col].apply(len)
    # run of two or more exclamation marks (!!) - binary feature
    reg = re.compile("(!)\\1{1,}")
    has_more_than2_exm = lambda x: np.where(len(reg.findall(x)) == 0, 0, 1)
    df['more_that_2_exm'] = df[text_col].apply(has_more_than2_exm)
    df['words_count'] = df[text_col].apply(lambda x: textstat.lexicon_count(x))
    num_cols = [col for col in df.columns if col not in org_cols]
    return num_cols
def sentence_fit(gen_text, orig_text):
    # Text generated from GPT2 stored in a dataframe
    df = pd.DataFrame(gen_text, columns=['generated'])
    # Remove spaces in front of punctuation
    df['generated'] = (df['generated']
                       .str.replace(r' +,', ',', regex=True)
                       .str.replace(r' +\.', '.', regex=True))
    # Assess cosine similarity between sentences
    df['similarity'] = df['generated'].apply(lambda x: text_similarity(orig_text, x))
    # Count number of syllables and words
    df['n_syll'] = df['generated'].apply(textstat.syllable_count)
    df['n_lex'] = df['generated'].apply(textstat.lexicon_count)
    # Syllable to word ratio
    df['syll_lex'] = df['n_syll'] / df['n_lex']
    # Flags to indicate whether generated text has fewer syllables, fewer words,
    # or a lower syllable-to-word ratio than the original
    df['rel_syll'] = np.where(df['n_syll'] < textstat.syllable_count(orig_text), 1, 0)
    df['rel_lex'] = np.where(df['n_lex'] < textstat.lexicon_count(orig_text), 1, 0)
    df['rel_rat'] = np.where(
        df['syll_lex'] < textstat.syllable_count(orig_text) / textstat.lexicon_count(orig_text), 1, 0)
    # Average the binary indicators of relative sentence simplicity
    df['rel_simp'] = (df['rel_syll'] + df['rel_lex'] + df['rel_rat']) / 3
    # Fit score is a weighted sum of similarity and relative sentence simplicity;
    # the highest score will be chosen
    df['fit_score'] = 0.7 * df['similarity'] + 0.3 * df['rel_simp']
    # Subset data and rename columns
    df['Original'] = orig_text
    df = df[['Original', 'generated', 'similarity', 'rel_simp', 'fit_score']]
    df.columns = ['Original', 'Generated', 'Similarity', 'Simplicity', 'Fit Score']
    return df
def word_count(class_name):
    total_word = 0
    for idx, items in enumerate(df['genre']):
        if items == class_name:
            book_id = df.at[idx, 'book_id']
            content = df.at[idx, 'content']
            no_of_words = textstat.lexicon_count(str(content), removepunct=True)
            total_word = total_word + no_of_words
            det_and_mys_dict[book_id] = no_of_words
    min_word_id = min(det_and_mys_dict.items(), key=lambda x: x[1])
    max_word_id = max(det_and_mys_dict.items(), key=lambda x: x[1])
    return (min_word_id, max_word_id, total_word)
def analyze_vocab(text):
    return {
        'num_words': textstat.lexicon_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog_index': textstat.smog_index(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
        'difficult_words': textstat.difficult_words(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'text_standard': textstat.text_standard(text, float_output=True)
    }
def lisibilty(text):
    f_lis = [
        textstat.syllable_count(str(text), lang='en_arabic'),
        textstat.lexicon_count(str(text), removepunct=True),
        textstat.sentence_count(str(text)),
        textstat.flesch_reading_ease(str(text)),
        textstat.flesch_kincaid_grade(str(text)),
        textstat.gunning_fog(str(text)),
        textstat.smog_index(str(text)),
        textstat.automated_readability_index(str(text)),
        textstat.coleman_liau_index(str(text)),
        textstat.linsear_write_formula(str(text)),
        textstat.dale_chall_readability_score(str(text))
    ]
    return f_lis
def dale_chall(row):
    text = row['reviewText']
    easywords = open("easy_words.txt").read().splitlines()
    words = tokenize.word_tokenize(text)
    words = list(map(lambda x: x.lower(), words))
    sentences = max(1, sentence_count(row))
    wordcount = max(1, textstat.lexicon_count(text))
    easywordcount = 0
    for easyword in easywords:
        easywordcount += words.count(easyword)
    diffwordsratio = float(wordcount - easywordcount) / wordcount
    # Dale-Chall formula: 0.1579 * (percent difficult words) + 0.0496 * (words per sentence),
    # with an adjustment constant added when difficult words exceed 5%
    score = 0.1579 * (diffwordsratio * 100) + 0.0496 * (float(wordcount) / sentences)
    if diffwordsratio > 0.05:
        score += 3.635
    return score
def add_features(row):
    '''Feature engineering via NLP.'''
    text = row.text
    doc = nlp(text)
    lemmas = list()
    entities = list()
    for token in doc:
        if token.text == ':':
            row['has_colon'] = 1
        if token.text == ';':
            row['has_semicolon'] = 1
        if token.text == '-':
            row['has_dash'] = 1
        if token.text.lower() == 'whom':
            row['whom'] = 1
        if token.text[-3:] == 'ing':
            row['num_ings'] += 1
        if token.text.lower() == 'had':
            row['has_had'] = 1
        pos = token.pos_
        row[pos] += 1
        if token.is_stop or not token.is_alpha:
            continue
        lemma = token.lemma_.strip().lower()
        if lemma:
            lemmas.append(lemma)
    for ent in doc.ents:
        entities.append(ent.text)
    lemmas = ' '.join(lemmas)
    blob = TextBlob(text)
    row['subjectivity'] = blob.sentiment.subjectivity
    row['polarity'] = blob.sentiment.polarity
    row['starts_conj'] = int(doc[0].pos_ == 'CONJ')
    row['ends_prep'] = int(doc[-1].pos_ == 'PREP')  # check the last token, not the first
    row['entities'] = entities
    row['lemmas'] = lemmas
    row['raw_text_length'] = len(text)
    row['num_words'] = len(doc)
    row['avg_word_len'] = row.raw_text_length / row.num_words
    row['vector_avg'] = np.mean(nlp(lemmas).vector)
    row['num_ings'] /= row['num_words']
    row['rhyme_frequency'] = rhyme_frequency(row['text'])
    row['dale_chall'] = textstat.dale_chall_readability_score(row['text'])
    row['FleischReadingEase'] = textstat.flesch_reading_ease(row['text'])
    row['lexicon'] = textstat.lexicon_count(row['text'])
    row['word_diversity'] = row.lexicon / row.num_words
    return row
def get_desc_data(string):
    '''
    Input: book description string
    Output: desc_semantic, word_cnt, description_len, number_unique_words,
            average_word_len, syl_count, lex_count, sent_count, flesch
    '''
    # Data before text processing
    desc_semantic = get_semantic(string)
    syl_count = syllable_count(string)
    lex_count = lexicon_count(string)
    sent_count = sentence_count(string)
    flesch = flesch_reading_ease(string)
    # Data after text processing
    string = text_preprocess(string)
    word_cnt = word_count(string)
    description_len = desc_len(string)
    number_unique_words = num_unique_words(string)
    average_word_len = avg_word_len(string)
    return desc_semantic, word_cnt, description_len, number_unique_words, \
        average_word_len, syl_count, lex_count, sent_count, flesch
def process_datum(datum):
    # Remove tags
    soup = BeautifulSoup(datum["Content"], features="html.parser")
    clean_soup = BeautifulSoup(datum["Content"], features="html.parser")
    for elm in clean_soup(["code"]):
        elm.extract()
    body_text = clean_soup.get_text()
    pos_tags = pos_tag(word_tokenize(body_text))
    pos_counts = Counter([tag for word, tag in pos_tags])
    # preterm_counts =
    result = {}
    result['TEXT'] = body_text
    result['CT1'] = lexicon_count(body_text)
    result['CT2'] = sentence_count(body_text)
    for tag in POS_TAGS:
        result['CT3.' + tag] = pos_counts[tag]
    # for preterm in PRETERMINALS:
    #     results['CT4.' + preterm] =
    result['CN1'] = (len(soup.find_all("code", href=True))
                     + len(soup.find_all("img", href=True))
                     + len(soup.find_all("span", {"class": "math-container"})))
    result['CN2'] = len(soup.find_all("a", href=True))
    result['U1.SUM'] = datum['U1.SUM']
    result['U1.1'] = datum['U1.1']
    result['U1.2'] = datum['U1.2']
    result['U2'] = datum['U2']
    result['Y1'] = datum['Y1']
    result['Y2'] = datum['Y2']
    result['T'] = datum['T']
    result['S'] = datum['S']
    result['D'] = datum['D']
    return result
def lexical_counts(sent):
    return textstat.lexicon_count(sent, removepunct=True)
def test_lexicon_count(self):
    count = textstat.lexicon_count(self.long_test)
    count_punc = textstat.lexicon_count(self.long_test, removepunct=False)
    self.assertEqual(372, count)
    self.assertEqual(376, count_punc)
def lexicon_count(corpus):
    # return word counts as a column vector, one row per document
    return np.array([textstat.lexicon_count(doc) for doc in corpus]).reshape(-1, 1)
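# Hypothetical usage sketch (an assumption, not from the original source): the
# reshape(-1, 1) above yields a 2-D column of word counts, which lets this helper
# act as a feature extractor in a scikit-learn pipeline via FunctionTransformer;
# the pipeline and classifier below are illustrative only.
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

length_model = make_pipeline(
    FunctionTransformer(lexicon_count, validate=False),  # corpus -> (n_docs, 1) word counts
    LogisticRegression(),
)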
import csv

from stop_words import safe_get_stop_words
from readability import Readability
import textstat

csv.field_size_limit(500 * 1024 * 1024)
file1 = open('gfi_description.csv')
csv_reader = csv.reader(file1)
header = next(csv_reader)
file2 = open('gfi_description_attributes.csv', 'w')
csv_writer = csv.writer(file2)
csv_writer.writerow([
    'id', 'num_url', 'num_image', 'num_code', 'num_comment', 'num_table',
    'count_word_title', 'count_word_body', 'readability'
])
for item in csv_reader:
    title = item[1]
    body = item[2]
    count_word_title = textstat.lexicon_count(title, removepunct=True)
    count_word_body = textstat.lexicon_count(body, removepunct=True)
    readability = textstat.coleman_liau_index(body)
    print(count_word_title, count_word_body, readability)
    csv_writer.writerow([
        item[0], item[3], item[4], item[5], item[6], item[7],
        count_word_title, count_word_body, readability
    ])
file1.close()
file2.close()
def test_lexicon_count():
    count = textstat.lexicon_count(long_test)
    count_punc = textstat.lexicon_count(long_test, removepunct=False)
    assert count == 372
    assert count_punc == 376
# In[46]:
print(average)

# In[33]:
textstat.flesch_reading_ease(text)

# In[40]:
textstat.automated_readability_index(text)

# In[35]:
textstat.lexicon_count(text, removepunct=True)

# In[22]:
textstat.syllable_count(text, lang='en_US')

# In[82]:
textstat.text_standard(text)

# In[9]:
len(set(cleaning_features(text)))

# In[7]:
###############################################################################
# Readability scores: Greta Thunberg

import textstat
import numpy as np

# drop empty text fields
temp = greta.copy()
temp['text'].replace('', np.nan, inplace=True)
temp['text'].replace(' ', np.nan, inplace=True)
temp.dropna(subset=['text'], inplace=True)

temp['syl_count'] = temp.text.apply(lambda x: textstat.syllable_count(x))
temp['word_count'] = temp.text.apply(
    lambda x: textstat.lexicon_count(x, removepunct=True))
temp['sent_count'] = temp.text.apply(lambda x: textstat.sentence_count(x))
temp['score_fre'] = temp.text.apply(lambda x: textstat.flesch_reading_ease(x))
temp['score_are'] = temp.text.apply(
    lambda x: textstat.automated_readability_index(x))
temp['char_count'] = temp.text.apply(lambda x: len(x))

sns.distplot(temp.word_count, hist=True, kde=False, norm_hist=True,
             color='darkblue', hist_kws={'edgecolor': 'black'})

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
fig.subplots_adjust(hspace=.5)
sys.exit(0)

signal.signal(signal.SIGINT, handler)

d = "/home/adulau/dess/12/01"
ld = os.listdir(d)
stats = {}
stats['hits'] = 0
stats['miss'] = 0

for f in ld:
    currentfile = os.path.join(d, f)
    with gzip.open(currentfile) as paste:
        content = paste.read().decode('utf-8')
        lexicon = textstat.lexicon_count(content, removepunct=True)
        syllabe = textstat.syllable_count(content, lang='en_US')
        sentence = textstat.sentence_count(content)
        # consensus = textstat.text_standard(content, float_output=False)
        # print("sentence={}, syllabe={}, lexicon={}, flesch_reading_score={},{}".format(sentence, syllabe, lexicon, textstat.flesch_reading_ease(content), currentfile))
        analysis = {}
        analysis['sentence'] = sentence
        analysis['syllabe'] = syllabe
        analysis['lexicon'] = lexicon
        analysis['flesch_reading_ease'] = textstat.flesch_reading_ease(content)
        analysis['filename'] = currentfile
        analysis['length'] = len(content)
        analysis['extract'] = content[:100]
        # rank = (analysis['flesch_reading_ease'] + analysis['flesch_reading_ease'] + analysis['lexicon']) * analysis['sentence']
        rank = analysis['flesch_reading_ease']