def __dale_chall(r: Readability) -> float:
    """Collapse the Dale-Chall grade levels of ``r`` into a single number.

    Returns 17 when the grade levels contain 'college_graduate', 13 when
    they contain 'college', and otherwise the mean of the grade levels
    converted to float. Returns None when a ReadabilityException is
    raised while computing the metric.
    """
    try:
        grade_levels = r.dale_chall().grade_levels
        if 'college_graduate' in grade_levels:
            return 17
        if 'college' in grade_levels:
            return 13
        numeric_levels = [float(level) for level in grade_levels]
        return stat.mean(numeric_levels)
    except ReadabilityException:
        return None
def doc_to_readability(doc_str) -> ArrayField:
    """Return seven readability scores for ``doc_str`` wrapped in an ArrayField.

    The scores are, in order: Flesch-Kincaid, Flesch, Gunning fog,
    Coleman-Liau, Dale-Chall, ARI and Linsear Write. Short documents are
    repeated (joined by single spaces) until they contain at least 150
    whitespace-separated words before being scored.

    A zero vector of length 7 is returned when ``doc_str`` is shorter than
    10 characters, contains no words at all, or when the readability
    library raises ReadabilityException.
    """
    num_scores = 7
    min_words = 150

    if len(doc_str) < 10:
        return ArrayField(np.zeros(num_scores))

    words_per_copy = len(doc_str.split())
    if words_per_copy == 0:
        # Whitespace-only input never reaches the word target, so repeating
        # it would loop forever; treat it like an unscorable document.
        return ArrayField(np.zeros(num_scores))

    # Smallest number of copies whose combined word count reaches the
    # target (ceiling division), computed once instead of re-splitting the
    # growing string on every iteration.
    copies = max(1, -(-min_words // words_per_copy))
    str_to_read = " ".join([doc_str] * copies)

    try:
        r = Readability(str_to_read)
        r_scores = [
            r.flesch_kincaid().score,
            r.flesch().score,
            r.gunning_fog().score,
            r.coleman_liau().score,
            r.dale_chall().score,
            r.ari().score,
            r.linsear_write().score,
        ]
        return ArrayField(np.array(r_scores))
    except ReadabilityException:
        return ArrayField(np.zeros(num_scores))
#------------------
# Readability
#------------------
st.header('Readability')

# Context
passage = st.text_area("Candidate Bible Passage (English)",
                       value='',
                       max_chars=None,
                       key='readability_passage')

# The metrics are only computed once the user asks for them. Computing them
# unconditionally ran on every rerun, including with an empty text area,
# where the readability library raises instead of returning a score.
if st.button('Assess Readability', key=None):
    try:
        # Calculate readability
        r = Readability(passage)
        data = [
            ['Flesch-Kincaid Score', r.flesch_kincaid().score],
            ['Flesch Reading Ease', r.flesch().ease],
            ['Dale Chall Readability Score', r.dale_chall().score],
            ['Automated Readability Index Score', r.ari().score],
            ['Coleman Liau Index', r.coleman_liau().score],
            ['Gunning Fog', r.gunning_fog().score],
            ['Linsear Write', r.linsear_write().score],
            ['Spache Readability Formula', r.spache().score]
        ]
    except ReadabilityException as e:
        # Show the library's own message rather than an unhandled exception.
        st.warning(str(e))
    else:
        # Display readability
        df = pd.DataFrame(data, columns=['Readability Metric', 'Value'])
        st.write(df)
# Number of possible permutations (orderings of the list entries without
# repetition), e.g. for A, B, C: ABC, ACB, BAC, BCA, CAB, CBA, i.e. 6
# orderings for 3 columns/list entries.
num_combinations = math.factorial(num_columns)

# Find all cut-up permutations and store them in a list.
permutations_object = itertools.permutations(cutup_pieces_list)
permutations_list = list(permutations_object)
permuted_strings_list = ["".join(tup) for tup in permutations_list]
#print(permuted_strings_list)

# Score every candidate. Iterating over the list itself (instead of
# range(num_combinations)) keeps the loop correct even if num_columns does
# not match the number of pieces.
readability_index_list = []
for candidate in permuted_strings_list:
    r = Readability(candidate)
    dc = r.dale_chall()
    readability_index_list.append(dc.score)
    #print(dc.grade_levels)

# A higher Dale-Chall score means a harder text, so the most readable
# candidate is the one with the LOWEST score.
index = readability_index_list.index(min(readability_index_list))
print(permuted_strings_list[index])

# Dale Chall Readability
# The Dale-Chall Formula is an accurate readability formula for the simple
# reason that it is based on the use of familiar words, rather than syllable
# or letter counts. Reading tests show that readers usually find it easier to
# read, process and recall a passage if they find the words familiar.
class ReadabilityTest(unittest.TestCase):
    """Pin the score and grade output of every metric for one fixed passage."""

    # Shared passage: used as-is by setUp and repeated five times for SMOG.
    _PASSAGE = (
        " In linguistics, the Gunning fog index is a readability test for "
        "English writing. The index estimates the years of formal education a "
        "person needs to understand the text on the first reading. For "
        "instance, a fog index of 12 requires the reading level of a United "
        "States high school senior (around 18 years old). The test was "
        "developed in 1952 by Robert Gunning, an American businessman who had "
        "been involved in newspaper and textbook publishing. The fog index is "
        "commonly used to confirm that text can be read easily by the intended "
        "audience. Texts for a wide audience generally need a fog index less "
        "than 12. Texts requiring near-universal understanding generally need "
        "an index less than 8. "
    )

    def setUp(self):
        self.readability = Readability(self._PASSAGE)

    def test_ari(self):
        result = self.readability.ari()
        print(result)
        self.assertEqual(9.551245421245422, result.score)
        self.assertEqual(['10'], result.grade_levels)
        self.assertEqual([15, 16], result.ages)

    def test_coleman_liau(self):
        result = self.readability.coleman_liau()
        print(result)
        self.assertEqual(10.673162393162393, result.score)
        self.assertEqual('11', result.grade_level)

    def test_dale_chall(self):
        result = self.readability.dale_chall()
        print(result)
        self.assertEqual(9.32399010989011, result.score)
        self.assertEqual(['college'], result.grade_levels)

    def test_flesch(self):
        result = self.readability.flesch()
        print(result)
        self.assertEqual(51.039230769230784, result.score)
        self.assertEqual(['10', '11', '12'], result.grade_levels)
        self.assertEqual('fairly_difficult', result.ease)

    def test_flesch_kincaid(self):
        result = self.readability.flesch_kincaid()
        print(result)
        self.assertEqual(10.125531135531137, result.score)
        self.assertEqual('10', result.grade_level)

    def test_gunning_fog(self):
        result = self.readability.gunning_fog()
        print(result)
        self.assertEqual(12.4976800976801, result.score)
        self.assertEqual('12', result.grade_level)

    def test_linsear_write(self):
        result = self.readability.linsear_write()
        print(result)
        self.assertEqual(11.214285714285714, result.score)
        self.assertEqual('11', result.grade_level)

    def test_smog(self):
        # SMOG is scored on the passage repeated five times, not on the
        # single copy prepared in setUp.
        repeated = ' '.join([self._PASSAGE] * 5)
        result = Readability(repeated).smog()
        print(result)
        self.assertEqual(12.516099999999998, result.score)
        self.assertEqual('13', result.grade_level)

    def test_spache(self):
        result = self.readability.spache()
        print(result)
        self.assertEqual(7.164945054945054, result.score)
        self.assertEqual('7', result.grade_level)

    def test_print_stats(self):
        stats = self.readability.statistics()
        self.assertEqual(562, stats['num_letters'])
        self.assertEqual(117, stats['num_words'])
        self.assertEqual(7, stats['num_sentences'])
        self.assertEqual(20, stats['num_polysyllabic_words'])
class ReadabilityTest(unittest.TestCase):
    """Pin the score and grade output of every metric for one fixed passage.

    NOTE(review): the expected values asserted below are identical to the
    ones asserted for the Gunning fog passage elsewhere in this file, even
    though the passage here is different -- confirm they were regenerated
    for this text.
    """

    # Shared passage: used as-is by setUp and repeated five times for SMOG.
    _PASSAGE = (
        " “On a June day sometime in the early 1990s, encouraged by his friend "
        "and fellow economist Jörgen Weibull, Abhijit went swimming in the "
        "Baltic. He leaped in and instantly jumped out—he claims that his "
        "teeth continued to chatter for the next three days. In 2018, also in "
        "June, we went to the Baltic in Stockholm, several hundred miles "
        "farther north than the previous encounter. This time it was literally "
        "child’s play; our children frolicked in the water. Wherever we went "
        "in Sweden, the unusually warm weather was a topic of conversation. It "
        "was probably a portent of something everyone felt, but for the moment "
        "it was hard not to be quite delighted with the new opportunities for "
        "outdoor life it offered.”. "
    )

    def setUp(self):
        self.readability = Readability(self._PASSAGE)

    def test_ari(self):
        result = self.readability.ari()
        print(result)
        self.assertEqual(9.551245421245422, result.score)
        self.assertEqual(['10'], result.grade_levels)
        self.assertEqual([15, 16], result.ages)

    def test_coleman_liau(self):
        result = self.readability.coleman_liau()
        print(result)
        self.assertEqual(10.673162393162393, result.score)
        self.assertEqual('11', result.grade_level)

    def test_dale_chall(self):
        result = self.readability.dale_chall()
        print(result)
        self.assertEqual(9.32399010989011, result.score)
        self.assertEqual(['college'], result.grade_levels)

    def test_flesch(self):
        result = self.readability.flesch()
        print(result)
        self.assertEqual(51.039230769230784, result.score)
        self.assertEqual(['10', '11', '12'], result.grade_levels)
        self.assertEqual('fairly_difficult', result.ease)

    def test_flesch_kincaid(self):
        result = self.readability.flesch_kincaid()
        print(result)
        self.assertEqual(10.125531135531137, result.score)
        self.assertEqual('10', result.grade_level)

    def test_gunning_fog(self):
        result = self.readability.gunning_fog()
        print(result)
        self.assertEqual(12.4976800976801, result.score)
        self.assertEqual('12', result.grade_level)

    def test_linsear_write(self):
        result = self.readability.linsear_write()
        print(result)
        self.assertEqual(11.214285714285714, result.score)
        self.assertEqual('11', result.grade_level)

    def test_smog(self):
        # SMOG is scored on the passage repeated five times, not on the
        # single copy prepared in setUp.
        repeated = ' '.join([self._PASSAGE] * 5)
        readability = Readability(repeated)

        # Default SMOG call (the 30-sentence variant).
        sampled = readability.smog()
        # SMOG computed over all sentences.
        complete = readability.smog(all_sentences=True)

        print("all_sentences=False: %s ; all_sentences=True: %s" % (sampled, complete))
        self.assertEqual(12.516099999999998, sampled.score)
        self.assertEqual('13', sampled.grade_level)
        self.assertEqual(12.785403640627713, complete.score)
        self.assertEqual('13', complete.grade_level)

    def test_spache(self):
        result = self.readability.spache()
        print(result)
        self.assertEqual(7.164945054945054, result.score)
        self.assertEqual('7', result.grade_level)

    def test_print_stats(self):
        stats = self.readability.statistics()
        self.assertEqual(562, stats['num_letters'])
        self.assertEqual(117, stats['num_words'])
        self.assertEqual(7, stats['num_sentences'])
        self.assertEqual(20, stats['num_polysyllabic_words'])
class ReadabilityAnalyser:
    """Compute readability metrics for a text and aggregate them into statistics.

    An instance wraps one ``Readability`` object. The metric methods
    (``flesch_kincaid``, ``smog``, ...) write their results into a caller
    supplied dict; ``record_analysis`` and ``count_average`` fold such
    result records into min/max/sum/avg/frequency statistics, optionally
    per category.
    """

    def __init__(self, text):
        """Build the underlying ``Readability`` object for ``text``."""
        self.readability = Readability(text)

        # Field names aggregated for each metric. Every name must match a key
        # written by the corresponding metric method, otherwise the field is
        # only ever counted as skipped.
        self.FLESCH_KINCAID = ['score', 'grade_level']
        # flesch_ease() stores its levels under 'grade_levels' (plural).
        self.FLESCH_EASE = ['score', 'ease', 'grade_levels']
        self.DALE_CHALL = ['score', 'grade_level']
        self.ARI = ['score', 'grade_level', 'ages']
        self.CLI = ['score', 'grade_level']
        self.GUNNING_FOG = ['score', 'grade_level']
        self.SMOG = ['score', 'grade_level']
        self.SPACHE = ['score', 'grade_level']
        self.LINSEAR_WRITE = ['score', 'grade_level']

        self.values_index = self.initialize_value_index_array()

    def initialize_value_index_array(self):
        """Return the mapping from metric key to its list of field names."""
        return {
            "flesch_kincaid": self.FLESCH_KINCAID,
            "flesch_ease": self.FLESCH_EASE,
            "dale_chall": self.DALE_CHALL,
            "ari": self.ARI,
            "cli": self.CLI,
            "gunning_fog": self.GUNNING_FOG,
            "smog_all": self.SMOG,
            "smog": self.SMOG,
            "spache": self.SPACHE,
            "linsear_write": self.LINSEAR_WRITE,
        }

    # ------------------------------------------------------------------
    # Single metrics
    # ------------------------------------------------------------------

    def _collect(self, content, key, compute, attributes, error_ignore):
        """Run one metric and store the selected attributes in ``content[key]``.

        ``compute`` is a zero-argument callable returning the metric result;
        ``attributes`` maps each output field name to the attribute read from
        that result. When ReadabilityException is raised the exception is
        printed, and its message is stored in ``content[key]`` unless
        ``error_ignore`` is true.
        """
        try:
            result = compute()
            content[key] = {
                field: getattr(result, attribute)
                for field, attribute in attributes.items()
            }
        except ReadabilityException as e:
            if not error_ignore:
                content[key] = str(e)
            print(e)

    def flesch_kincaid(self, content, error_ignore=True):
        """Store the Flesch-Kincaid score and grade level in ``content``."""
        self._collect(content, "flesch_kincaid",
                      self.readability.flesch_kincaid,
                      {'score': 'score', 'grade_level': 'grade_level'},
                      error_ignore)

    def flesch_ease(self, content, error_ignore=True):
        """Store the Flesch score, ease and grade levels in ``content``."""
        self._collect(content, 'flesch_ease',
                      self.readability.flesch,
                      {'score': 'score', 'ease': 'ease',
                       'grade_levels': 'grade_levels'},
                      error_ignore)

    def dale_chall(self, content, error_ignore=True):
        """Store the Dale-Chall score and grade levels in ``content``."""
        self._collect(content, 'dale_chall',
                      self.readability.dale_chall,
                      {'score': 'score', 'grade_level': 'grade_levels'},
                      error_ignore)

    def automated_readability_index(self, content, error_ignore=True):
        """Store the ARI score, grade levels and ages in ``content``."""
        self._collect(content, 'ari',
                      self.readability.ari,
                      {'score': 'score', 'grade_level': 'grade_levels',
                       'ages': 'ages'},
                      error_ignore)

    def coleman_liau_index(self, content, error_ignore=True):
        """Store the Coleman-Liau score and grade level in ``content``."""
        self._collect(content, 'cli',
                      self.readability.coleman_liau,
                      {'score': 'score', 'grade_level': 'grade_level'},
                      error_ignore)

    def gunning_fog_index(self, content, error_ignore=True):
        """Store the Gunning fog score and grade level in ``content``."""
        self._collect(content, 'gunning_fog',
                      self.readability.gunning_fog,
                      {'score': 'score', 'grade_level': 'grade_level'},
                      error_ignore)

    def smog(self, content, all_sentences=False, error_ignore=True):
        """Store the SMOG score and grade level in ``content``.

        With ``all_sentences`` true the metric is computed with
        ``all_sentences`` passed through and stored under 'smog_all';
        otherwise the default SMOG call is stored under 'smog'.
        """
        if all_sentences:
            key = 'smog_all'

            def compute():
                return self.readability.smog(all_sentences=all_sentences)
        else:
            key = 'smog'
            compute = self.readability.smog
        self._collect(content, key, compute,
                      {'score': 'score', 'grade_level': 'grade_level'},
                      error_ignore)

    def spache_readability_formula(self, content, error_ignore=True):
        """Store the Spache score and grade level in ``content``."""
        self._collect(content, 'spache',
                      self.readability.spache,
                      {'score': 'score', 'grade_level': 'grade_level'},
                      error_ignore)

    def linsear_write(self, content, error_ignore=True):
        """Store the Linsear Write score and grade level in ``content``."""
        self._collect(content, 'linsear_write',
                      self.readability.linsear_write,
                      {'score': 'score', 'grade_level': 'grade_level'},
                      error_ignore)

    # ------------------------------------------------------------------
    # Running several metrics
    # ------------------------------------------------------------------

    @staticmethod
    def check_readability_from_file(input_json_file, output_json_file):
        """Analyse every record of ``input_json_file`` and save the results.

        Each input record is read for its 'text', 'file' and 'category'
        keys. All metrics are run with errors ignored, and the list of
        result records is written to ``output_json_file``.
        """
        result = []
        json_file = load_as_json(input_json_file)
        for record in json_file:
            analyser = ReadabilityAnalyser(record['text'])
            analysed_file_record = dict()
            analysed_file_record['file'] = record['file']
            analysed_file_record['category'] = record['category']

            analyser.flesch_kincaid(analysed_file_record)
            analyser.flesch_ease(analysed_file_record)
            analyser.dale_chall(analysed_file_record)
            analyser.automated_readability_index(analysed_file_record)
            analyser.coleman_liau_index(analysed_file_record)
            analyser.gunning_fog_index(analysed_file_record)
            analyser.smog(analysed_file_record)
            analyser.smog(analysed_file_record, True)
            analyser.spache_readability_formula(analysed_file_record)
            analyser.linsear_write(analysed_file_record)

            result.append(analysed_file_record)
        save_as_json(result, output_json_file)

    def check_readability(self, use_methods=None, errors_included=True):
        """Run the selected metrics and return their results as a dict.

        ``use_methods`` is a container of metric keys ('flesch_kincaid',
        'flesch_ease', 'dale_chall', 'ari', 'cli', 'gunning_fog', 'smog',
        'smog_all', 'spache', 'linsear_write'); ``None`` runs all of them.
        With ``errors_included`` true, the message of a failed metric is
        stored in place of its result.
        """
        error_ignore = not errors_included
        steps = [
            ('flesch_kincaid', self.flesch_kincaid),
            ('flesch_ease', self.flesch_ease),
            ('dale_chall', self.dale_chall),
            ('ari', self.automated_readability_index),
            ('cli', self.coleman_liau_index),
            ('gunning_fog', self.gunning_fog_index),
            ('smog', self.smog),
            ('smog_all',
             lambda content, error_ignore: self.smog(
                 content, True, error_ignore=error_ignore)),
            ('spache', self.spache_readability_formula),
            ('linsear_write', self.linsear_write),
        ]

        result_analysis = dict()
        for name, run in steps:
            if use_methods is None or name in use_methods:
                run(result_analysis, error_ignore=error_ignore)
        return result_analysis

    # ------------------------------------------------------------------
    # Statistics containers
    # ------------------------------------------------------------------

    @staticmethod
    def initialize_basic_dict(categories, values, process_category=True):
        """Return an empty statistics bucket for the field names ``values``.

        With ``process_category`` true, one nested bucket (without further
        nesting) is added under each name in ``categories``.
        """
        record = dict()
        for value in values:
            record = ReadabilityAnalyser.initialize_dict(record, value)
        if process_category:
            for category in categories:
                record[category] = ReadabilityAnalyser.initialize_basic_dict(
                    categories, values, False)
        return record

    @staticmethod
    def initialize_dict(record, value):
        """Add the min/max/sum/avg/freq/skipped slots for ``value`` to ``record``."""
        # The sentinels guarantee that the first recorded value replaces them.
        record['min_' + value] = 999999999
        record['max_' + value] = -999999999
        record['sum_' + value] = 0
        record['avg_' + value] = 0
        record['freq_' + value] = 0
        record['skipped_' + value] = 0
        return record

    def initialize_values(self, statistic, categories):
        """Fill ``statistic`` with an empty bucket per metric plus bookkeeping keys."""
        field_lists = self.initialize_value_index_array()
        for name, fields in field_lists.items():
            statistic[name] = self.initialize_basic_dict(categories, fields)
        statistic['indexes'] = list(field_lists)
        statistic['categories'] = categories

    # ------------------------------------------------------------------
    # Accumulation helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _bump(bucket, key):
        """Increment the counter ``bucket[key]``, starting from 0 if absent."""
        bucket[key] = bucket.get(key, 0) + 1

    @staticmethod
    def _update_min_max_sum(bucket, value_index, value):
        """Fold ``value`` into the min/max/sum slots of ``value_index``."""
        min_key = 'min_' + value_index
        max_key = 'max_' + value_index
        sum_key = 'sum_' + value_index
        if value < bucket[min_key]:
            bucket[min_key] = value
        if value > bucket[max_key]:
            bucket[max_key] = value
        bucket[sum_key] = bucket[sum_key] + value

    @staticmethod
    def fill_min_max_sum_category(index, value_index, statistics,
                                  readability_index, category):
        """Fold ``index[value_index]`` into the per-category min/max/sum."""
        ReadabilityAnalyser._update_min_max_sum(
            statistics[readability_index][category], value_index,
            index[value_index])

    @staticmethod
    def fill_min_max_sum_category_value(value, value_index, statistics,
                                        readability_index, category):
        """Fold ``value`` into the per-category min/max/sum."""
        ReadabilityAnalyser._update_min_max_sum(
            statistics[readability_index][category], value_index, value)

    @staticmethod
    def fill_min_max_sum(index, value_index, statistics, readability_index):
        """Fold ``index[value_index]`` into the metric-level min/max/sum."""
        ReadabilityAnalyser._update_min_max_sum(
            statistics[readability_index], value_index, index[value_index])

    @staticmethod
    def fill_min_max_sum_value(value, value_index, statistics,
                               readability_index):
        """Fold ``value`` into the metric-level min/max/sum."""
        ReadabilityAnalyser._update_min_max_sum(
            statistics[readability_index], value_index, value)

    @staticmethod
    def cast_to_float(value):
        """Return ``float(value)``, or None when the conversion fails."""
        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    def _accumulate(self, record, readability_index, bucket):
        """Fold one metric of ``record`` into the statistics ``bucket``."""
        fields = self.values_index[readability_index]
        index = record.get(readability_index)

        # A missing metric, or an error message stored in place of a result,
        # counts as skipped for every field.
        if not isinstance(index, dict):
            for value_index in fields:
                self._bump(bucket, 'skipped_' + value_index)
            return

        for value_index in fields:
            if value_index not in index:
                self._bump(bucket, 'skipped_' + value_index)
                continue

            value = index[value_index]
            number = ReadabilityAnalyser.cast_to_float(value)
            if number is not None:
                # The record is updated in place with the numeric value.
                index[value_index] = number
                self._update_min_max_sum(bucket, value_index, number)
                self._bump(bucket, 'freq_' + value_index)
            elif isinstance(value, list):
                for rec in value:
                    labels = bucket.setdefault(value_index, dict())
                    if isinstance(rec, str):
                        # Textual entries are counted per distinct label.
                        self._bump(labels, 'freq_' + rec)
                    else:
                        # Numeric entries each count towards the frequency,
                        # so sum / freq is a true average.
                        self._update_min_max_sum(bucket, value_index, rec)
                        self._bump(bucket, 'freq_' + value_index)
            elif isinstance(value, str):
                labels = bucket.setdefault(value_index, dict())
                self._bump(labels, 'freq_' + value)
            else:
                print("Uncategorized: " + str(value))
                self._bump(bucket, 'freq_' + value_index)

    def record_analysis(self, record, statistics):
        """Fold one analysed ``record`` into ``statistics``.

        A record with a 'category' key is accumulated only into that
        category's buckets; any other record is accumulated into the
        metric-level buckets. Numeric values in ``record`` are converted
        to float in place.
        """
        has_category = 'category' in record
        if not has_category:
            print("THIS: " + str(record))

        for readability_index in statistics['indexes']:
            bucket = statistics[readability_index]
            if has_category:
                bucket = bucket[record['category']]
            self._accumulate(record, readability_index, bucket)

    @staticmethod
    def _set_average(bucket, value_index):
        """Set ``avg_<value_index>`` from the sum and frequency when the sum is non-zero."""
        total = bucket['sum_' + value_index]
        if total != 0:
            bucket['avg_' + value_index] = total / bucket['freq_' + value_index]
        else:
            bucket['sum_' + value_index] = 0

    def count_average(self, statistics):
        """Compute the averages for every metric and every category."""
        for readability_index in statistics['indexes']:
            overall = statistics[readability_index]
            for value_index in self.values_index[readability_index]:
                self._set_average(overall, value_index)
                for category in statistics['categories']:
                    self._set_average(overall[category], value_index)

    def analyse_readability_file(self, readability_file, categories):
        """Return statistics aggregated over all records of ``readability_file``."""
        statistic = dict()
        self.initialize_values(statistic, categories)
        for record in load_as_json(readability_file):
            self.record_analysis(record, statistic)
        self.count_average(statistic)
        return statistic

    def analyse_readability_file_save_results(self, readability_file,
                                              output_statistics_file,
                                              categories):
        """Aggregate ``readability_file`` and save the statistics as JSON."""
        statistics = self.analyse_readability_file(readability_file,
                                                   categories)
        save_as_json(statistics, output_statistics_file)
# Accumulators for the per-file results appended in the loop below.
# NOTE(review): the other l_* lists used in the loop, as well as PATH,
# l_not_use and clean, are defined outside this excerpt.
l_ari = []
l_linsear_write = []
l_spache = []
l_flesch_ease = []

# Score every entry of PATH, ignoring names that start with '.' and names
# listed in l_not_use.
for i in os.listdir(PATH):
    if not i.startswith('.'):
        if i not in l_not_use:
            with open(PATH + i, 'r') as f:
                text = f.read()
            r = Readability(clean(text))
            # All metrics are computed before anything is appended to the
            # result lists.
            s1 = r.flesch_kincaid()
            s2 = r.flesch()
            s3 = r.gunning_fog()
            s4 = r.coleman_liau()
            s5 = r.dale_chall()
            s6 = r.ari()
            s7 = r.linsear_write()
            # r.smog()
            s8 = r.spache()
            l_flesch_kincaid.append(s1.score)
            l_flesch.append(s2.score)
            # The Flesch result contributes both its score and its ease.
            l_flesch_ease.append(s2.ease)
            l_gunning_fog.append(s3.score)
            l_coleman_liau.append(s4.score)
            l_dale_chall.append(s5.score)
            l_ari.append(s6.score)
            l_linsear_write.append(s7.score)
            l_spache.append(s8.score)

# NOTE(review): the string literal opened below is not closed within this
# excerpt.
""" -------------------------------------------------------------------------------------------------------
# Universal POS tags that are counted per caption; every other tag is ignored.
_COUNTED_POS_TAGS = ('NOUN', 'PROPN', 'CONJ', 'VERB', 'SYM', 'NUM', 'ADP', 'ADJ')


def _count_pos_tags(tags):
    """Return a dict mapping each tag in ``_COUNTED_POS_TAGS`` to its number of occurrences in ``tags``."""
    counts = dict.fromkeys(_COUNTED_POS_TAGS, 0)
    for tag in tags:
        if tag in counts:
            counts[tag] += 1
    return counts


def generate_caption_stats(dataframe: pd.DataFrame,
                           pos_tag_stats: bool = True,
                           readability_scores: bool = True,
                           n_spacy_workers: int = 6,
                           spacy_model: str = "en_core_web_lg",
                           backend: MetadataGeneratorBackend = MetadataGeneratorBackend.SPACY):
    """Compute per-caption statistics and return them as extra columns.

    The ``caption`` column of ``dataframe`` is analysed with the selected
    backend. Token/sentence counts, min/max sentence length and named entities
    are always collected; POS counts and the derived ratios are optional.

    Args:
        dataframe: Frame with a ``caption`` column. The polyglot backend
            additionally reads ``wikicaps_id`` when it logs an error.
        pos_tag_stats: Add POS tag counts and token ratios (spaCy and NLTK
            backends; the polyglot backend always adds them).
        readability_scores: Add ``fk_re_score``, ``fk_gl_score`` and
            ``dc_score``. Scores are only collected by the spaCy backend; for
            other backends the columns are skipped and a warning is logged.
        n_spacy_workers: ``n_process`` passed to the spaCy pipeline.
        spacy_model: Name of the spaCy model to load.
        backend: Backend used to analyse the captions.

    Returns:
        A new DataFrame holding the input columns plus the statistics columns,
        passed through ``DataFrame.convert_dtypes``.
    """
    logger.info(f"Generating caption statistics using {backend.upper()}...")
    start = time.time()

    # Tokens and sentences
    num_tok = []
    num_sent = []
    # Min and Max length of sentences
    min_sent_len = []
    max_sent_len = []
    # Named Entities
    num_ne = []
    ne_texts = []  # surface form of the NEs
    ne_types = []  # types of the NEs
    num_ne_tok = []  # number of tokens that belong to a NE
    # POS tag counts
    num_noun = []  # nouns (cat, dog, house, tree, ...)
    num_propn = []  # proper nouns (Denver, Hamburg, Peter, Tesla, ...)
    num_conj = []  # conjunctions (and, or, ...)
    num_verb = []  # verbs
    num_sym = []  # symbols (!,#,?, ...)
    num_num = []  # numbers (IV, 1 billion, 1312, ...)
    num_adp = []  # adpositions (on, under, in, at, ...)
    num_adj = []  # adjectives (nice, fast, cool, ...)
    # readability scores
    fk_gl_score = []
    fk_re_score = []
    dc_score = []

    def _append_pos_counts(counts):
        # Keeps the per-tag result lists in lockstep for the spaCy and NLTK backends.
        num_noun.append(counts['NOUN'])
        num_propn.append(counts['PROPN'])
        num_conj.append(counts['CONJ'])
        num_verb.append(counts['VERB'])
        num_sym.append(counts['SYM'])
        num_num.append(counts['NUM'])
        num_adp.append(counts['ADP'])
        num_adj.append(counts['ADJ'])

    with tqdm(total=len(dataframe)) as pbar:
        # TODO extract all of this code into an own module and have separate metadata generators for spaCy, nltk, etc.
        if backend == MetadataGeneratorBackend.SPACY:
            # init spacy TODO: download the required model(s)
            spacy_nlp = spacy.load(spacy_model)
            if readability_scores:
                spacy_nlp.add_pipe(Readability())

            # TODO whats a good batch_size?
            for doc in spacy_nlp.pipe(dataframe['caption'].astype(str), n_process=n_spacy_workers):
                # num tokens
                num_tok.append(len(doc))

                # num sentences and min/max sentence length (doc.sents is iterated only once).
                # 10000 and -1 are the sentinels reported for a caption without sentences.
                sent_lens = [len(s) for s in doc.sents]
                num_sent.append(len(sent_lens))
                min_sent_len.append(min([10000] + sent_lens))
                max_sent_len.append(max([-1] + sent_lens))

                # named entities
                num_ne.append(len(doc.ents))
                ne_types.append([ent.label_ for ent in doc.ents])
                ne_texts.append([ent.text for ent in doc.ents])

                # readability scores
                if readability_scores:
                    fk_gl_score.append(doc._.flesch_kincaid_grade_level)
                    fk_re_score.append(doc._.flesch_kincaid_reading_ease)
                    dc_score.append(doc._.dale_chall)

                # POS Tags
                if pos_tag_stats:
                    _append_pos_counts(_count_pos_tags(t.pos_ for t in doc))
                    # number of tokens associated with a NE (to compute the ratio)
                    num_ne_tok.append(sum(1 for t in doc if t.ent_iob_ == 'I' or t.ent_iob_ == 'B'))

                pbar.update(1)

        elif backend == MetadataGeneratorBackend.NLTK:
            nltk.download('punkt')
            nltk.download('words')
            nltk.download('averaged_perceptron_tagger')
            nltk.download('universal_tagset')
            nltk.download('universal_treebanks_v20')
            nltk.download('maxent_ne_chunker')

            for cap in dataframe['caption'].astype(str):
                # num tokens
                num_tok.append(len(nltk.word_tokenize(cap)))

                # num sentences
                sents = nltk.sent_tokenize(cap)
                num_sent.append(len(sents))

                # min/max length of sentences (same sentinels as for spaCy)
                s_toks = [nltk.word_tokenize(s) for s in sents]
                sent_lens = [len(toks) for toks in s_toks]
                min_sent_len.append(min([10000] + sent_lens))
                max_sent_len.append(max([-1] + sent_lens))

                # readability scores
                # FIXME currently not usable with NLTK... (because NaN values are dropped)
                #  there is also an error while calling t Readability(cap) ctor...
                # Deliberately disabled; the readability columns are skipped for this backend below.
                if False:
                    try:
                        r = Readability(cap)
                        flesch = r.flesch_kincaid()
                        fk_gl_score.append(flesch.grade_level)
                        fk_re_score.append(flesch.score)
                        dc_score.append(r.dale_chall().score)
                    except (ReadabilityException, Exception):
                        fk_gl_score.append(np.NaN)
                        fk_re_score.append(np.NaN)
                        dc_score.append(np.NaN)

                if pos_tag_stats:
                    sent_pos_tags = nltk.pos_tag_sents(s_toks, 'universal')
                    _append_pos_counts(_count_pos_tags(
                        pt[1].upper() for spt in sent_pos_tags for pt in spt))

                # named entities
                # we have to tag again with a different tag set (upenn tree) for WAY better NER performance
                num_nes, num_nes_tok = 0, 0
                txt, typ = [], []
                # The sentence tokens computed above are reused instead of tokenizing the caption again.
                nes_sent = nltk.ne_chunk_sents(nltk.pos_tag_sents(s_toks))
                for nes in nes_sent:
                    for ne in nes:
                        if isinstance(ne, nltk.Tree):
                            num_nes += 1
                            typ.append(str(ne.label()))
                            txt.append(" ".join(tok[0] for tok in ne).strip())
                            num_nes_tok += len(ne)
                num_ne.append(num_nes)
                ne_texts.append(txt)
                ne_types.append(typ)
                num_ne_tok.append(num_nes_tok)

                pbar.update(1)

        elif backend == MetadataGeneratorBackend.POLYGLOT:
            # init
            # pandarallel.initialize()  # FIXME doens't work..
            downloader.download("embeddings2.en")
            downloader.download("ner2.en")
            downloader.download("pos2.en")

            def __gen_polyglot_metadata_per_caption(row, pb):
                # Computes the statistics of a single row; always advances the progress bar.
                d = {
                    'num_tok': 0,
                    'num_sent': 0,
                    'min_sent_len': 0,
                    'max_sent_len': 0,
                    'num_ne': 0,
                    'ne_types': [],
                    'ne_texts': [],
                    'num_nouns': 0,
                    'num_propn': 0,
                    'num_conj': 0,
                    'num_verb': 0,
                    'num_sym': 0,
                    'num_num': 0,
                    'num_adp': 0,
                    'num_adj': 0,
                    'ratio_ne_tok': 0.,
                    'ratio_noun_tok': 0.,
                    'ratio_propn_tok': 0.,
                    'ratio_all_noun_tok': 0.,
                }
                try:
                    caption = str(row['caption']).encode('utf-8')
                    # https://github.com/aboSamoor/polyglot/issues/71
                    # removing "bad unicode" characters to avoid runtime exceptions
                    # caption = str(caption, encoding='utf-8')
                    caption = regex.sub(r"\p{C}", "", caption.decode('utf-8'))
                    pg = Text(caption, hint_language_code='en')
                    pg.language = 'en'

                    # num tokens
                    n_tok = len(pg.words)
                    # num sentences
                    n_sent = len(pg.sentences)

                    # min/max length of sentences
                    sent_lens = [len(s.words) for s in pg.sentences]
                    min_s_len = min([10000] + sent_lens)
                    max_s_len = max([-1] + sent_lens)

                    # readability scores
                    # FIXME only available with spacy currently

                    # POS tags
                    pos_counts = _count_pos_tags(pos[1].upper() for pos in pg.pos_tags)
                    n_noun = pos_counts['NOUN']
                    n_propn = pos_counts['PROPN']

                    # named entities
                    num_nes_tok, ne_txt, ne_typ = 0, [], []
                    num_nes = len(pg.entities)
                    for ne in pg.entities:
                        num_nes_tok += len(ne)
                        ne_txt.append(" ".join(ne))
                        ne_typ.append(ne.tag)

                    # compute the ratios (an empty caption raises ZeroDivisionError, handled below)
                    d = {
                        'num_tok': n_tok,
                        'num_sent': n_sent,
                        'min_sent_len': min_s_len,
                        'max_sent_len': max_s_len,
                        'num_ne': num_nes,
                        'ne_types': ne_typ,
                        'ne_texts': ne_txt,
                        'num_nouns': n_noun,
                        'num_propn': n_propn,
                        'num_conj': pos_counts['CONJ'],
                        'num_verb': pos_counts['VERB'],
                        'num_sym': pos_counts['SYM'],
                        'num_num': pos_counts['NUM'],
                        'num_adp': pos_counts['ADP'],
                        'num_adj': pos_counts['ADJ'],
                        'ratio_ne_tok': num_nes_tok / n_tok,
                        'ratio_noun_tok': n_noun / n_tok,
                        'ratio_propn_tok': n_propn / n_tok,
                        'ratio_all_noun_tok': (n_noun + n_propn) / n_tok,
                    }
                except Exception as e:
                    logger.error(f"Critical error occurred with caption of WikiCaps ID{row['wikicaps_id']}!")
                    logger.error(str(e))
                    # NOTE(review): this returns None, so the zero-filled defaults above are
                    # never used -- confirm whether failed rows should get the defaults instead.
                    return
                finally:
                    pb.update(1)
                return d

            # FIXME why the hec is this using ALL AVAILABLE CORES?!
            metadata = dataframe.apply(__gen_polyglot_metadata_per_caption,
                                       axis=1,
                                       result_type='expand',
                                       args=(pbar,))
            res = pd.concat([dataframe, metadata], axis=1)
            # convert_dtypes returns a new frame; the result has to be kept.
            res = res.convert_dtypes()
            logger.info(f"Finished adding caption statistics in {time.time() - start} seconds!")
            return res

    res = dataframe.copy()

    # add stats as columns to df
    res['num_tok'] = num_tok
    res['num_sent'] = num_sent
    res['min_sent_len'] = min_sent_len
    res['max_sent_len'] = max_sent_len
    res['num_ne'] = num_ne
    res['ne_types'] = ne_types
    res['ne_texts'] = ne_texts

    if pos_tag_stats:
        # compute the ratios
        np_num_tok = np.array(num_tok)
        np_num_noun = np.array(num_noun)
        np_num_propn = np.array(num_propn)

        res['num_nouns'] = num_noun
        res['num_propn'] = num_propn
        res['num_conj'] = num_conj
        res['num_verb'] = num_verb
        res['num_sym'] = num_sym
        res['num_num'] = num_num
        res['num_adp'] = num_adp
        res['num_adj'] = num_adj

        res['ratio_ne_tok'] = (np.array(num_ne_tok) / np_num_tok)
        res['ratio_noun_tok'] = (np_num_noun / np_num_tok)
        res['ratio_propn_tok'] = (np_num_propn / np_num_tok)
        res['ratio_all_noun_tok'] = ((np_num_noun + np_num_propn) / np_num_tok)

    if readability_scores:
        # Only the spaCy backend fills the score lists. Assigning the empty lists to a
        # non-empty frame would raise a length-mismatch ValueError, so skip them instead.
        if len(fk_re_score) == len(res):
            res['fk_re_score'] = fk_re_score
            res['fk_gl_score'] = fk_gl_score
            res['dc_score'] = dc_score
        else:
            logger.warning("Readability scores were not collected by the selected backend; "
                           "skipping the readability columns.")

    # make sure that ints are not encoded as floats (convert_dtypes is not in-place)
    res = res.convert_dtypes()

    logger.info(f"Finished adding caption statistics in {time.time() - start} seconds!")
    return res