def simplicity(string):
    """Return a weighted readability score for ``string``.

    Combines the Dale-Chall, Gunning-Fog, and Coleman-Liau readability
    grades using the module-level weights ``DaleChall``, ``GunningFog``
    and ``Cole``.

    :param string: raw text to score.
    :return: weighted sum of the three readability grades (float).
    """
    tokens = nltk.word_tokenize(string)
    # Compute the measures once instead of three identical getmeasures() calls.
    grades = readability.getmeasures(tokens)['readability grades']
    DC = grades['DaleChallIndex']
    GF = grades['GunningFogIndex']
    C = grades['Coleman-Liau']
    return DaleChall * DC + GunningFog * GF + Cole * C
def __init__(self, tweet_attributes):
    """Build per-tweet features from a ``tweet_attributes`` dict.

    Expects keys: 'num_tweets', 'num_hashtags', 'num_mentions', 'num_urls',
    'num_media', 'num_symbols', 'num_polls', 'text', 'tokens'. Derives
    token/POS/sentence statistics, sentiment, readability measures, and runs
    the feature-extraction passes.
    """
    # Raw counts straight from the attribute dict.
    self.num_tweets = tweet_attributes['num_tweets']
    self.num_hashtags = tweet_attributes['num_hashtags']
    self.num_mentions = tweet_attributes['num_mentions']
    self.num_urls = tweet_attributes['num_urls']
    self.num_media = tweet_attributes['num_media']
    self.num_symbols = tweet_attributes['num_symbols']
    self.num_polls = tweet_attributes['num_polls']
    # Text- and token-derived fields.
    self.text = tweet_attributes['text']
    self.tokens = tweet_attributes['tokens']
    self.token_lengths = [len(token) for token in self.tokens]
    self.tokens_lower = [token.lower() for token in self.tokens]
    self.pos_tags = nltk.pos_tag(self.tokens)
    self.sentences = sent_tokenize(self.text)
    self.num_emojis = emoji.core.emoji_count(self.text)
    self.norm_neg_senti, self.norm_neu_senti, self.norm_pos_senti, self.norm_com_senti = self.normalized_sentiment_values()
    try:
        self.readability_measures = readability.getmeasures(self.tokens, lang='en')
    except Exception:
        # readability raises on degenerate input (e.g. no usable tokens);
        # retry with a dummy token so measures are always available.
        # (Was a bare ``except:``, which would also trap SystemExit /
        # KeyboardInterrupt.)
        self.readability_measures = readability.getmeasures(self.tokens + ['a'], lang='en')
    # Feature-extraction passes; original order preserved.
    self.make_char_wise_pass()
    self.make_pos_tag_wise_pass()
    self.make_token_wise_pass()
    self.make_sliding_window_pass()
    self.num_errors = 0
def save_other_features(data, parse_lst_path, config, path, context=True, parse=True, multi=False):
    """Compute hand-crafted lexical/contextual features for each row of
    ``data`` and write the augmented frame to ``path`` as TSV.

    :param data: DataFrame with at least 'id', 'corpus', 'sentence', 'token'
        columns ('complexity'/'class' present for labeled data).
    :param parse_lst_path: path to a pickled list of parse trees aligned with
        the rows of ``data``, or None to skip parse features.
    :param config: options dict; 'disambiguate' controls word-sense
        disambiguation for the WordNet lookups.
    :param path: output TSV path.
    :param context: if True, add sentence-context features (corpus dummies,
        POS one-hots, textstat scores, readability indices).
    :param parse: if True (and ``parse_lst_path`` given), add parse-tree
        depth features.
    :param multi: if True, split multi-word targets into head/tail, recurse
        on each half, then merge; returns None in that branch.
    :return: set of helper column names to omit downstream (None when
        ``multi`` is True).
    """
    if multi:
        # Multi-word targets: score head and tail tokens separately, then
        # combine the two result files.
        if 'complexity' in data:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token','complexity','class']])
        else:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token']])
        base, ext = os.path.splitext(path)
        path_head = base+'_head'+ext
        path_tail = base+'_tail'+ext
        omit = save_other_features(data_head, parse_lst_path, config, path_head, context=context, parse=parse)
        _ = save_other_features(data_tail, parse_lst_path, config, path_tail, context=context, parse=parse)
        multi_compute(data, path, path_head, path_tail, omit)
        return
    # based on aspect word
    data['word_len'] = data['token'].str.len().to_numpy()
    data['num_syllables'] = data['token'].apply(lambda x: syllables.estimate(str(x))).to_numpy()
    # WordNet relation counts; sentence context is only used when available.
    data['num_hyponyms'] = data.apply(lambda x: len(get_hyponyms(x['sentence'] if context else None, x['token'], disambiguate=config['disambiguate'] if context else False)), axis=1).to_numpy()
    data['num_hypernyms'] = data.apply(lambda x: len(get_hypernyms(x['sentence'] if context else None, x['token'], disambiguate=config['disambiguate'] if context else False)), axis=1).to_numpy()
    data['is_acronym'] = (data['token'].str.isupper()*1).to_numpy()
    # NOTE(review): this flags any capitalized token, not just pronouns —
    # the column name overstates what is computed; confirm intent.
    data['is_pronoun'] = (data['token'].apply(lambda x: x[0].isupper())*1).to_numpy()
    # based on context
    # ``omit`` collects helper columns that downstream consumers should drop.
    omit = set()
    if context:
        # One-hot encode the source corpus.
        corpus_dummies = pd.get_dummies(data['corpus'], prefix='corpus')
        for corpus_name in corpus_dummies:
            data[corpus_name] = corpus_dummies[corpus_name]
            omit.add(corpus_name)
        # One-hot encode the token's POS tag (Penn tagset, punctuation tags excluded).
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        tags = [tag for tag in tagdict.keys() if tag[0] not in punctuation]
        POS = data.apply(lambda x: get_POS(x['sentence'], x['token']), axis=1)
        for tag in tags:
            data['POS_'+tag] = (POS == tag) * 1
        # Apply every textstat scoring method to the sentence, except those
        # in the skip-list (non-scores, or duplicated by other features).
        funcs = ["textstat." + func[0] for func in inspect.getmembers(textstat, predicate=inspect.ismethod)]
        for elem in tqdm(funcs):
            method = eval(elem)
            if method.__name__ in ['difficult_words_list', 'set_lang', 'text_standard', 'dale_chall_readability_score_v2', 'dale_chall_readability_score', 'gunning_fog', 'spache_readability', 'avg_sentence_length', 'avg_sentence_per_word', 'sentence_count', 'difficult_words', 'is_difficult_word', 'is_easy_word', 'smog_index']:
                continue
            textstat.set_lang("en")
            data[method.__name__] = data['sentence'].apply(lambda x: method(x)).to_numpy()
            omit.add(method.__name__)
        # Sentence-level readability indices from the readability package.
        data['SMOGIndex'] = data['sentence'].apply(lambda x: readability.getmeasures(x, lang='en')['readability grades']['SMOGIndex']).to_numpy()
        data['DaleChallIndex'] = data['sentence'].apply(lambda x: readability.getmeasures(x, lang='en')['readability grades']['DaleChallIndex']).to_numpy()
        omit.add('SMOGIndex'); omit.add('DaleChallIndex')
    if parse and parse_lst_path is not None:
        # Parse-tree features: tree height, target-token depth, and how many
        # words share that depth.
        # NOTE(review): file handle from open() is never closed here.
        parse_lst = pkl.load(open(parse_lst_path, 'rb'))
        parse_tree_depths = []
        token_depths = []
        num_words_at_depths = []
        for parse_tree, token in tqdm(zip(parse_lst, data['token'])):
            parse_tree_depths.append(parse_tree.height())
            token_depths.append(token_depth(parse_tree, token))
            num_words_at_depths.append(num_words_at_depth(parse_tree, token_depths[-1]))
        data['parse_tree_depth'] = np.array(parse_tree_depths).astype(np.int64)
        omit.add('parse_tree_depth')
        data['token_depth'] = np.array(token_depths).astype(np.int64)
        data['num_words_at_depth'] = np.array(num_words_at_depths).astype(np.int64)
    data.to_csv(path, sep='\t')
    return omit
def analyze_readability_measures(text, include_word_types=False):
    """Compute lexical-diversity (MTLD) and readability statistics for ``text``.

    Returns an empty dict when the text contains no tokens; otherwise a
    pandas Series of measures (optionally including normalized word-type
    counts when ``include_word_types`` is set).
    """
    import readability
    import traceback
    from collections import OrderedDict
    tokenized = sentence_per_line_tokenize(text)
    per_sentence = [line.split() for line in tokenized.lower().split('\n')]
    flat_tokens = [tok for sent in per_sentence for tok in sent]
    if not flat_tokens:
        return {}  # nothing to analyze
    res = pd.Series()
    res['mtld'] = mtld(flat_tokens)
    try:
        measures = readability.getmeasures(tokenized)
    except Exception:
        traceback.print_exc()
    else:
        for key, value in measures.pop('sentence info').items():
            res[key] = value
        num_words = res['words']
        num_sents = res['sentences']
        if include_word_types:
            # Normalize sentence-beginning counts per sentence and word-usage
            # counts per word.
            for word_type, count in measures.pop('sentence beginnings').items():
                res[f'word_type_sent_startswith_{word_type}'] = count / num_sents
            for typ, count in measures.pop('word usage').items():
                res[f'word_type_overall_{typ}'] = count / num_words
        for key in ('wordtypes', 'long_words', 'complex_words'):
            res[key] = res[key] / num_words
    return res
def generate(text, debug=False):
    """
    Generates the following readability scores for a user's tweets:
    - Automated Readability Index (R_ARI)
    - Coleman-Liau Index (R_COL)
    - Flesch Reading Ease (R_FRE)
    - Gunning-Fog Index (R_GUN)
    - Kincaid Grade Level (R_KIN)
    - LIX (R_LIX)
    - SMOG Grade (R_SMOG)

    :param text: a String containing all tokenized sentences of a user,
        divided by newline characters (\n).
    :param debug: a Bool indicating if debugging information should be
        printed (default: False).
    :return: a Dict containing the feature names as keys and calculated
        scores as values.
    """
    # Dutch-language model; merge=True flattens all measure categories.
    merged = readability.getmeasures(text, lang='nl', merge=True)
    if debug:
        for name, score in merged.items():
            print(name, ": ", score)
    feature_keys = {
        "R_ARI": "ARI",
        "R_COL": "Coleman-Liau",
        "R_FRE": "FleschReadingEase",
        "R_GUN": "GunningFogIndex",
        "R_KIN": "Kincaid",
        "R_LIX": "LIX",
        "R_SMOG": "SMOGIndex",
    }
    return {feature: merged[key] for feature, key in feature_keys.items()}
def measure_file(srcpath, dstpath):
    """Clean ``srcpath``'s text with the module's regex substitutions, write a
    readability report plus the cleaned text to ``dstpath``, and return the
    Gunning-Fog index rounded to two decimals."""
    print(srcpath)
    with open(srcpath, 'rt', encoding='utf-8') as src:
        text = src.read()
    # Apply the module-level (pattern, replacement) substitutions in order.
    for pattern, replacement in patterns:
        text = re.sub(pattern, replacement, text)
    results = readability.getmeasures(text, lang='en')
    gunning_fog = results['readability grades']['GunningFogIndex']
    with open(dstpath, 'wt', encoding='utf-8') as report:
        # One indented section per measure category, floats rounded for display.
        for category, values in results.items():
            print('\t', category, sep='', file=report)
            for name in values:
                value = values[name]
                if isinstance(value, float):
                    value = round(value, 2)
                print('\t\t', f"{name} {value}", sep='', file=report)
        report.write(text)
    return round(gunning_fog, 2)
def get_q2_readability_score(row):
    """Return the Flesch Reading Ease score of a row's 'question2' text.

    Falls back to 0.00 when the readability computation fails (e.g. empty
    or untokenizable text).
    """
    text = row['question2']
    try:
        results = readability.getmeasures(text, lang='en')
    except Exception:
        # Was a bare ``except:``; narrowed so SystemExit/KeyboardInterrupt
        # are not swallowed.
        return 0.00
    return results['readability grades']['FleschReadingEase']
def generate_candidates(bg, rd_th, rp_th, clist, max_len=10):
    '''Recursively grow phrase candidates starting from ``bg``.

    Scores ``bg`` (readability and representativeness, stored in the
    module-level ``candidate`` dict), prunes when either score falls below
    its threshold or the phrase exceeds ``max_len`` words, and otherwise
    extends the phrase with every overlapping bigram from ``clist``.

    :param bg: current candidate phrase (space-separated words).
    :param rd_th: minimum FleschReadingEase to keep recursing.
    :param rp_th: minimum representativeness to keep recursing.
    :param clist: list of bigrams used to extend the phrase.
    :param max_len: maximum phrase length in words.
    '''
    candidate[bg] = {'rd': 0, 'rp': 0}
    # Single words keep the default 0 scores; only score multi-word phrases.
    if len(bg.split(' ')) > 1:
        candidate[bg]['rd'] = readability.getmeasures(
            bg, lang='en')['readability grades']['FleschReadingEase']
        candidate[bg]['rp'] = representativeness(bg)
    gen_list = [i for i in clist]
    # Prune: too unreadable, too unrepresentative, or too long.
    if candidate[bg]['rd'] < rd_th or candidate[bg]['rp'] < rp_th or len(
            bg.split(' ')) > max_len:
        return
    else:
        cl = []
        for i in gen_list:
            tmp = i.split(' ')
            tmp.reverse()
            # Extend only when the phrase's last word equals the bigram's
            # first word, and avoid re-appending the phrase's own tail bigram.
            val = ' '.join(bg.split(' ')[-2:])
            if bg.split(' ')[-1] == tmp[1] and val != ' '.join(tmp):
                cl.append(bg + ' ' + ' '.join(i.split(' ')[1:]))
        # NOTE(review): recursion uses fixed thresholds (0.1, 0) rather than
        # the caller's rd_th/rp_th — confirm this is intentional.
        for gram in cl:
            generate_candidates(gram, 0.1, 0, clist)
def main():
    # Render a colorized readability report for a blog post stored as JSON
    # (path given as the first CLI argument, HTML body under "content").
    post_path: str = sys.argv[1]
    print('-' * 80)
    print('post: {}'.format(post_path))
    print('-' * 80)
    with open(post_path) as f_in:
        data = json.load(f_in)
    content: str = data["content"]
    soup = BeautifulSoup(content, features="lxml")
    # Last line is at the bottom of the article before footnotes, so delete
    # all footnotes (everything after the final <hr>).
    last_line: bs4.element.Tag = soup.find_all("hr")[-1]
    for elem in last_line.find_all_next():
        elem.decompose()
    paragraphs = soup.find_all('p')
    # Delete footnote links (superscript markers inside paragraphs).
    for paragraph in paragraphs:
        for footnote_link in paragraph.find_all('sup'):
            footnote_link.decompose()
    paragraph_strings: List[str] = [
        p.get_text().replace('\n', ' ') for p in paragraphs
    ]
    # Dump the extracted paragraphs for manual inspection.
    with open("/tmp/test/2fa.txt", "w") as f_out:
        for paragraph_string in paragraph_strings:
            f_out.write("{}\n\n".format(paragraph_string))
    # Lightweight spaCy pipeline: sentence splitting only.
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    sentences: List[str] = [
        sentence for paragraph_string in paragraph_strings
        for sentence in get_sentences(paragraph_string, nlp)
    ]
    readability_results = readability.getmeasures(sentences, lang='en')
    pprint.pprint(readability_results['readability grades'])
    # Color each grade by which threshold band it falls into.
    for (key, value) in readability_results['readability grades'].items():
        if key in thresholds_lookup:
            thresholds = thresholds_lookup[key]
            # Consecutive threshold pairs form the bands; the second clause
            # handles scales whose thresholds are listed in descending order.
            for i, pair in enumerate(zip(thresholds, thresholds[1:])):
                if (value >= pair[0] and value <= pair[1]) or (pair[0] >= value and pair[1] <= value):
                    # Band 0 = good (green), band 1 = borderline (yellow),
                    # anything further = poor (red).
                    if i == 0:
                        print(colored("{}: {:.2f}".format(key, value), "green"))
                    elif i == 1:
                        print(
                            colored("{}: {:.2f}".format(key, value), "yellow"))
                    else:
                        print(colored("{}: {:.2f}".format(key, value), "red"))
def readability_stats(text):
    """Return basic sentence-info statistics for ``text``.

    :return: tuple of (words, syllables-per-word, syllables, long_words,
        complex_words) from readability's 'sentence info' category.
    """
    info = readability.getmeasures(text, lang='en')['sentence info']
    return (
        info['words'],
        info['syll_per_word'],
        info['syllables'],
        info['long_words'],
        info['complex_words'],
    )
def get_score(text):
    """Compute and print the readability median and agreement rate for ``text``.

    :return: (mediane, taux_accord) as produced by ``calcul_readability``.
    """
    print("------------READABILITY -------------------------------------")
    measures = rd.getmeasures(text, lang='en')
    mediane, taux_accord = calcul_readability(measures)
    print(mediane, "(", taux_accord,
          ")\n-------------------------------------------------------------")
    return mediane, taux_accord
def text_readability(self, text):
    """
    Calculate the readability for a given text.

    :return: readability measure (Flesch Reading Ease)
    :rtype: float
    """
    measures = readability.getmeasures(text, lang='en')
    grades = measures['readability grades']
    return grades['FleschReadingEase']
def post_text():
    """Receive a POST request from the node server, run the readability
    measures over its 'content' field, and respond with the jsonified dict."""
    content = request.json['content']
    measures = readability.getmeasures(content, lang='en')
    return jsonify(measures)
def all_readbility_measures(text):
    """Return a DataFrame with one row of readability grades per text chunk.

    :param text: iterable of text chunks to score.
    """
    records = []
    for chunk in text:
        grades = readability.getmeasures(chunk, lang="en")['readability grades']
        records.append(grades)
    return pd.DataFrame.from_records(records)
def compute_readability(text, length_normalize=False):
    """Return the Flesch-Kincaid grade of ``text`` (a bytes-like object),
    optionally divided by the number of whitespace-separated tokens."""
    decoded = text.decode('unicode-escape')
    measures = readability.getmeasures(decoded, lang=u'en', merge=True)
    kincaid = measures["Kincaid"]  # Flesch-Kincaid score
    if not length_normalize:
        return kincaid
    return kincaid / float(len(text.split()))
def get_read_measure(self):
    """Print every readability measure for the stored sentence data and
    return all values as a flat list.

    Bug fix: the original accumulated every value in ``value_list`` but then
    returned only the last ``val``, discarding the accumulator.

    :return: list of all measure values, in print order.
    """
    value_list = []
    for cat, data in readability.getmeasures(self._sentence_data, lang='en').items():
        print('%s:' % cat)
        for key, val in data.items():
            # Trim trailing zeros and the dot for compact display.
            print((' %-20s %12.2f' % (key + ':', val)).rstrip('0 ').rstrip('.'))
            value_list.append(val)
    return value_list
def extract_readability_features(text):
    """Flatten all readability measure categories for ``text`` into a Series.

    Sentence-ending punctuation is followed by a newline first, since the
    readability package expects one sentence per line.
    """
    for mark in ('.', '?', '!'):
        text = text.replace(mark, mark + '\n')
    measures = dict(readability.getmeasures(text, lang='en'))
    flat = {}
    for group in measures:
        flat.update(measures[group])
    return pd.Series(flat)
def readability_stats(dataframe, row, i, current_column, new_column, readability_group, readability_measure):
    """Compute one readability measure for a comment and store it at
    row ``i`` / column ``new_column`` of ``c.df[dataframe]``.

    The comment is re-tokenized with ``segmenter`` into the
    paragraph/sentence/token layout the readability package expects
    (tokens space-joined, sentences newline-joined, paragraphs separated
    by blank lines).
    """
    comment = row[current_column]
    paragraph_texts = []
    for paragraph in segmenter.analyze(comment):
        sentence_texts = [
            ' '.join(token.value for token in sentence)
            for sentence in paragraph
        ]
        paragraph_texts.append('\n'.join(sentence_texts))
    tokenized = '\n\n'.join(paragraph_texts)
    measures = readability.getmeasures(tokenized, lang='en')
    c.df[dataframe].at[i, new_column] = measures[readability_group][readability_measure]
def get_readability(text):
    """
    return readability
    Length of the sentence, in words;
    Flesch Reading Ease score rounded to two decimals.
    """
    word_count = len(TextBlob(text).words)
    grades = readability.getmeasures(text, lang='en')['readability grades']
    return [word_count, round(grades['FleschReadingEase'], 2)]
def extract_readability_features(text):
    """Flatten all readability measure categories for ``text`` into a Series,
    dropping the raw 'paragraphs' count.

    Sentence-ending punctuation is followed by a newline first, since the
    readability package expects one sentence per line.
    """
    prepared = text.replace(".", ".\n").replace("?", "?\n").replace("!", "!\n")
    merged = {}
    for _, group in dict(readability.getmeasures(prepared, lang="en")).items():
        merged.update(group)
    del merged["paragraphs"]
    return pd.Series(merged)
def infer_readability(text):
    """Return readability measures for ``text``, or None when the text is
    empty/None or the computation fails.
    """
    if text is None or len(text) == 0:
        return None
    try:
        measures = readability.getmeasures(text)
    except Exception:
        # Was a bare ``except:``; narrowed so SystemExit/KeyboardInterrupt
        # are not swallowed.
        return None
    return measures
def _get_score(self, preprocessed_summary, dataset, score):
    """For each summary, compute the minimum per-sentence readability score
    of the corresponding source document.

    :param preprocessed_summary: sequence whose length gives the number of
        documents to score.
    :param dataset: array-like where ``dataset[i, 0]`` is the i-th source text.
    :param score: key into readability's 'readability grades' (e.g. a
        Flesch-style grade name).
    :return: numpy column vector (n, 1) of minimum sentence scores.
    """
    flesch_score = []
    for i in range(len(preprocessed_summary)):
        temp_sentence_list = tokenize.sent_tokenize(dataset[i, 0])
        temp_score_list = []
        for j in range(len(temp_sentence_list)):
            # Skip sentences with no alphabetic characters (pure punctuation
            # or numbers), which readability cannot score.
            if(any(c.isalpha() for c in temp_sentence_list[j])):
                results = readability.getmeasures(temp_sentence_list[j], lang='en')
                temp_score_list.append(results['readability grades'][score])
        # NOTE(review): min() raises ValueError if every sentence of a
        # document was skipped above — confirm inputs always contain at
        # least one alphabetic sentence.
        flesch_score.append(min(temp_score_list))
    return np.array(flesch_score).reshape(-1,1)
def difficulty(df):
    """
    Calculates the difficulty of the song's lyrics. Intended to be used
    with an apply function.

    :param df: Dataframe row of the potential one-hit wonders
    :return: The readability score (NaN for foreign-language lyrics)
    """
    # Readability formulas are English-only; skip foreign-language songs.
    if df['foreign_language'] == 1:
        return np.nan
    lyrics = df['lyric_difficulty']
    grades = readability.getmeasures(lyrics, lang='en')['readability grades']
    return grades['FleschReadingEase']
def informality_features(text, text_id=0, complexity=False):
    """Return informality feature values for ``text``.

    For readability, sentences in ``text`` should be separated by '\n'.

    :param text: the text to score.
    :param text_id: row index into the precomputed complexity CSV (line
        order matters: the row index must match the id).
    :param complexity: if True, append the semantic-complexity scores read
        from res/complexity/output_file.csv.
    :return: list of readability grade values (plus complexity scores).
    """
    results = readability.getmeasures(text, lang='en')
    # Iterate values directly instead of indexing items()[1]; dict order is
    # stable, so feature order is unchanged.
    informality_scores = list(results['readability grades'].values())
    if complexity:
        complexity_df = pd.read_csv(cwd+"/res/complexity/output_file.csv")
        informality_scores.extend(complexity_df.iloc[text_id].values.tolist())
    return informality_scores
def master_text(text):
    # Run the full text-analysis pipeline and return
    # [extractive summary, abstractive summary, keywords, definitions,
    #  tags, readability description, sentiment graph].
    #get extractive
    ex = generate_summary(text)
    #print("Finished Extractive Summary")
    #get abstractive: the abstractive model reads its input from a .story file.
    fn = os.path.join(os.path.dirname(__file__), './story/a.story')
    with open(fn, 'w') as f:
        f.write(text)
    ab = abstractive_summary()
    #print("Finished Abstractive Summary")
    #get keywords
    kw, definitions = get_keywords(text)
    #print("Finished getting keywords")
    #other data
    tags = get_tags(text)
    #print("Finished getting tags")
    n = graph_sentiment(text)
    # print("Finished graphing sentiment")
    results = readability.getmeasures(text, lang='en')
    # NOTE(review): the value here is the Kincaid grade level halved, but the
    # threshold bands below (90/80/70/...) and their wording match the Flesch
    # Reading Ease scale — confirm the intended measure.
    r = str(results['readability grades']['Kincaid'] / 2)
    t = float(r)
    if (t > 90):
        r += " - Very easy to read. Easily understood by an average 11-year-old student."
    elif (t > 80):
        r += " - Easy to read. Conversational English for consumers."
    elif (t > 70):
        r += " - Fairly easy to read"
    elif (t > 60):
        r += " - Plain English. Easily understood by 13- to 15-year-old students."
    elif (t > 50):
        r += " - Fairly difficult to read."
    elif (t > 30):
        r += " - Difficult to read."
    else:
        r += " - Very difficult to read. Best understood by university graduates."
    print([ex, ab, kw, definitions, tags, r, n])
    return [ex, ab, kw, definitions, tags, r, n]
def add_features_to_df(X):
    """Return ``X`` with one column per readability grade appended, computed
    from each row's text column."""
    grade_rows = []
    for _, record in X.iterrows():
        grades = getmeasures(record[TEXT_COLUMN_NAME])['readability grades']
        grade_rows.append(dict(grades))
    extra = pd.DataFrame(grade_rows)
    return pd.concat([X, extra], axis=1)
def main():
    # Extract readability-based features for every essay in the ASAP training
    # TSV, grouped by essay set, and pickle the stacked feature matrix.
    features_data_file = 'data/allreadability.pickle'
    # One feature list per essay set (sets 1-8).
    features_object = {1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: []}
    # Raw counts that should not be used as features.
    unwanted_features = [
        'paragraphs',
        'words',
        'characters',
        'sentences_per_paragraph',
        'words_per_sentence',
    ]
    final_array = None
    data_file_path = 'data/training_set_rel3.tsv'
    data = open(data_file_path, encoding="ISO-8859-1")
    lines = data.readlines()
    data.close()
    for index, line in enumerate(lines[1:]):  # skip the header row
        if index % 50 == 0:
            print(f"processed {index} essays")
        tokens = line.strip().split('\t')
        essay_id = int(tokens[0])
        essay_set = int(tokens[1])
        content = tokens[2].strip()
        score = tokens[6]
        # One sentence per line, as the readability package expects.
        sent_tokens = text_tokenizer(content, replace_url_flag=True, tokenize_sent_flag=True)
        sentences = [' '.join(sent) + '\n' for sent in sent_tokens]
        sentences = ''.join(sentences)
        readability_scores = readability.getmeasures(sentences, lang='en')
        # Feature row: essay id followed by every wanted measure.
        features = [essay_id]
        for cat in readability_scores.keys():
            for subcat in readability_scores[cat].keys():
                if subcat not in unwanted_features:
                    ind_score = readability_scores[cat][subcat]
                    features.append(ind_score)
        features_object[essay_set].append(features)
    for key in features_object.keys():
        features_object[key] = np.array(features_object[key])
        # Per-set min-max normalization (NaNs from constant columns zeroed).
        min_v, max_v = features_object[key].min(
            axis=0), features_object[key].max(axis=0)
        features = (features_object[key] - min_v) / (max_v - min_v)
        features = np.nan_to_num(features)
        # NOTE(review): this rebinding discards the min-max normalization
        # computed just above, making the following assignment a no-op —
        # confirm whether normalization was intentionally disabled.
        features = features_object[key]
        # Column 0 is the essay id and is left unnormalized.
        features_object[key][:, 1:] = features[:, 1:]
        if isinstance(final_array, type(None)):
            final_array = features_object[key]
        else:
            final_array = np.vstack((final_array, features_object[key]))
    with open(features_data_file, 'wb') as fp:
        pickle.dump(final_array, fp)
def readability_study(txt_file_name):
    """Print and return all readability measures for a text file.

    :param txt_file_name: path to a plain-text file to analyze.
    :return: the measure categories returned by readability.getmeasures.
    """
    # Context manager so the file handle is always closed (the original
    # leaked it).
    with open(txt_file_name, "r") as f:
        text = f.read()
    measures = readability.getmeasures(text, lang="en")
    for cat, data in measures.items():
        print('%s:' % cat)
        for key, val in data.items():
            # Trim trailing zeros and the dot for compact display.
            print((' %-25s %12.2f' % (key + ':', val)).rstrip('0 ').rstrip('.'))
    return measures
def run_readability(texts):
    """Score each text and return a list of {category: {measure: value}}
    dicts, with spaces stripped from category names.

    Each text is re-tokenized with ``segmenter`` into the layout the
    readability package expects (tokens space-joined, sentences
    newline-joined, paragraphs separated by blank lines).
    """
    scored = []
    for doc in texts:
        paragraph_texts = []
        for paragraph in segmenter.analyze(doc):
            sentence_texts = [
                ' '.join(tok.value for tok in sentence)
                for sentence in paragraph
            ]
            paragraph_texts.append('\n'.join(sentence_texts))
        tokenized = '\n\n'.join(paragraph_texts)
        measures = readability.getmeasures(tokenized, lang='en')
        flattened = {
            category.replace(' ', ''): dict(values)
            for category, values in measures.items()
        }
        scored.append(flattened)
    return scored
def analyze_text(self, text):
    """Return a summary dict of readability statistics for ``text``:
    grade level (Kincaid) plus basic sentence-info counts."""
    measures = readability.getmeasures(self.tokenize_text(text), lang='en')
    grades = measures['readability grades']
    info = measures['sentence info']
    return {
        'grade_level': grades['Kincaid'],
        'words': info['words'],
        'words_per_sentence': info['words_per_sentence'],
        'sentences_per_paragraph': info['sentences_per_paragraph'],
        'paragraphs': info['paragraphs'],
    }
def readability_features(self):
    """Return a flat tuple of all readability measure values (grades,
    sentence info, word usage, sentence beginnings) for the project's
    concatenated text fields."""
    content = '.'.join([self.title,self.short,self.need,self.essay])
    # Count literal "\n\n" escape sequences, which mark paragraph breaks in
    # the stored text.
    n_para = len(re.findall(r'\\n\\n', content))
    # Pad with empty lines so the paragraph count is reflected in the input.
    measures = getmeasures(sent_detector.tokenize(content)+['']*n_para)
    features = []
    for group in ('readability grades', 'sentence info',
                  'word usage', 'sentence beginnings'):
        features.extend(measures[group].values())
    return tuple(features)
def score_statements(filename=DEFAULT_FILENAME, loglevel=logging.INFO, database=DB_PATH):
    """Attach a sentiment + readability Score to every Statement.

    :return: index of the last statement processed, or -1 when there are no
        statements (the original raised NameError on ``return i`` in that
        case).
    """
    sia = SentimentIntensityAnalyzer()
    i = -1  # defined even when the iterator yields nothing
    for i, statement in enumerate(Statement.objects.iterator()):
        s = sia.polarity_scores(statement.text)
        score = Score(positive=s['pos'], negative=s['neg'], neutral=s['neu'],
                      compound=s['compound'], intensity=abs(s['compound']))
        words = statement.text.split()
        # str.split() never yields empty tokens, so a non-empty list is enough
        # (the original's ``len(words) and any(words)`` was redundant).
        if words:
            superficial_measures = getmeasures(words)
            score.flesch = superficial_measures['readability grades']['FleschReadingEase']
            score.kincaid = superficial_measures['readability grades']['Kincaid']
            score.dale_chall = textstat.dale_chall_readability_score(statement.text)
        else:
            # No scorable text: zero out the readability fields.
            score.flesch = 0
            score.kincaid = 0
            score.dale_chall = 0
        score.save()
        statement.score = score
        statement.save()
        print(statement.score)
    return i
def getreadabilitymeasures(numsents):
    """Get readability of all files and store results in a dictionary.

    :param numsents: per-file sentence counts; the minimum is used as a
        common cutoff so results are comparable across files.
    :return: {basename: {measure: value}} flattened over all categories,
        or {} when the readability package is unavailable.
    """
    try:
        import readability
    except ImportError:
        APP.logger.warning(
            'readability module not found; install with:\npip install'
            ' https://github.com/andreasvc/readability/tarball/master')
        return {}
    files = glob.glob(os.path.join(CORPUS_DIR, '*.tok'))
    results = {}
    # consider a fixed number of sentences to get comparable results
    cutoff = min(numsents)
    for filename in sorted(files):
        name = os.path.basename(filename)
        # Context manager so each file handle is closed (the original passed
        # the open handle into islice() and leaked it).
        with io.open(filename, encoding='utf8') as inp:
            measures = readability.getmeasures(islice(inp, cutoff), lang=LANG)
        # flatten results into a single dictionary of (key, value) pairs.
        results[name] = {key: value
                         for data in measures.values()
                         for key, value in data.items()}
    return results