def get_fk_grade_level(text):
    # The text must contain at least 100 words
    if len(text.split()) < 100:
        result = "ERROR: This piece of text is too short to get a Flesch Kincaid grade level."
    else:
        # Instantiate a Readability object
        r = Readability(text)
        # Get the F-K score metric
        fk = r.flesch_kincaid()
        # Get the F-K grade level
        result = fk.grade_level
    return result
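A minimal usage sketch for the helper above (the sample text and variable names are illustrative, and it assumes Readability has been imported from py-readability-metrics):

# Hypothetical call site for get_fk_grade_level
sample = "The quick brown fox jumps over the lazy dog. " * 30  # ~270 words
level = get_fk_grade_level(sample)
if level.startswith("ERROR"):
    print(level)
else:
    print("Flesch-Kincaid grade level: " + level)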
Example #2
# Imports assumed by this snippet: numpy, AllenNLP's ArrayField, and the
# py-readability-metrics Readability / ReadabilityException (the exception's
# import path may vary by version).
import numpy as np
from allennlp.data.fields import ArrayField
from readability import Readability
from readability.exceptions import ReadabilityException


def doc_to_readability(doc_str) -> ArrayField:
    # Too short to score at all: return a zero vector for the 7 metrics
    if len(doc_str) < 10:
        return ArrayField(np.zeros(7))
    str_to_read = doc_str
    try:
        # Pad short documents by repetition until the library's minimum
        # word count is comfortably exceeded
        while len(str_to_read.split()) < 150:
            str_to_read += " " + doc_str
        r = Readability(str_to_read)
        r_scores = [
            r.flesch_kincaid().score,
            r.flesch().score,
            r.gunning_fog().score,
            r.coleman_liau().score,
            r.dale_chall().score,
            r.ari().score,
            r.linsear_write().score
        ]
        return ArrayField(np.array(r_scores))
    except ReadabilityException:
        return ArrayField(np.zeros(7))
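Note the design choice in the padding loop: repeating a short document verbatim leaves the words-per-sentence and syllables-per-word averages essentially unchanged, so the scores approximate what the metrics would report on the original text, while the ReadabilityException fallback still covers inputs the library refuses to score.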
Example #3
    def run(self, book, **kwargs):
        doc = book.plaintext
        isbn = book.metadata['isbn'][0] if 'isbn' in book.metadata else None

        url = 'https://atlas-fab.lexile.com/free/books/' + str(isbn)

        headers = {'accept': 'application/json; version=1.0'}
        lexile = requests.get(url, headers=headers)
        # Check whether a Lexile measure exists for this ISBN.
        # If the ISBN is missing, the lookup fails, or the record has no age
        # range, the min/max age attributes keep their default value of None.
        if lexile.status_code == 200:
            lexile_work = lexile.json()['data']['work']
            self.lexile_min_age = str(lexile_work['min_age'])
            self.lexile_max_age = str(lexile_work['max_age'])
        try:
            r = Readability(doc)
            fk = r.flesch_kincaid()
            s = r.smog()
            self.readability_fk_score = fk.score
            self.readability_s_score = s.score
        # If less than 100 words
        except ReadabilityException:
            pass
Example #4
import requests
from bs4 import BeautifulSoup
from readability import Readability

response = requests.get('http://127.0.0.1/demo')

rtext = "The result is a number that corresponds with a U.S. grade level. The sentence, The Australian platypus is seemingly a hybrid of a mammal and reptilian creature is an 11.3 as it has 24 syllables and 13 words. The different weighting factors for words per sentence and syllables per word in each scoring system mean that the two schemes are not directly comparable and cannot be converted. The grade level formula emphasises sentence length over word length. By creating one-word strings with hundreds of random characters, grade levels may be attained that are hundreds of times larger than high school completion in the United States. Due to the formula's construction, the score does not have an upper bound."
robj = Readability(rtext)
html = response.text
obj = BeautifulSoup(html,'lxml')
def getTagCount(c):
    ct = obj.find_all(c)
    print("Count of '" + c + "' tag is: " + str(len(ct)))
    print(ct)

if response.status_code == 200:
    print('Connection Success!\n')
elif response.status_code == 404:
    print('Not Found.\n')

getTagCount('a')

#print(len(rtext.split()))
fk = robj.flesch_kincaid()
s = robj.smog()
print("\nFlesch-Kincaid Readability Score: " + str(fk.score))
print("Flesch-Kincaid Readability Grade Level: " + str(fk.grade_level))
#print("SMOG Readability Score: " + str(s.score))
#print("SMOG Readability Grade Level: " + str(s.grade_level))

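The grade-level figure quoted in rtext can be checked against the published Flesch-Kincaid grade formula without the library; a standalone sketch using the counts given in the text (13 words, 1 sentence, 24 syllables):

# Flesch-Kincaid grade level, per the published formula
def fk_grade(words, sentences, syllables):
    return 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

print(round(fk_grade(13, 1, 24), 1))  # 11.3, matching the quoted example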
Example #5
def flesch_kincaid_score(essay):

    r = Readability(essay)
    f = r.flesch_kincaid()

    return f.score
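Unlike Example #1, this wrapper does not guard against short inputs, so the library's 100-word minimum surfaces as an exception; a hedged call site (assuming ReadabilityException is imported from the same package):

try:
    print(flesch_kincaid_score(essay))
except ReadabilityException:
    print("Essay is too short to score.")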
Example #6
class ReadabilityTest(unittest.TestCase):
    def setUp(self):
        text = """
        “On a June day sometime in the early 1990s, encouraged by his friend and fellow economist Jörgen Weibull, Abhijit went swimming in the Baltic. He leaped in and instantly jumped out—he claims that his teeth continued to chatter for the next three days. In 2018, also in June, we went to the Baltic in Stockholm, several hundred miles farther north than the previous encounter. This time it was literally child’s play; our children frolicked in the water.
        Wherever we went in Sweden, the unusually warm weather was a topic of conversation. It was probably a portent of something everyone felt, but for the moment it was hard not to be quite delighted with the new opportunities for outdoor life it offered.”. 
        """
        self.readability = Readability(text)

    def test_ari(self):
        r = self.readability.ari()
        print(r)
        self.assertEqual(9.551245421245422, r.score)
        self.assertEqual(['10'], r.grade_levels)
        self.assertEqual([15, 16], r.ages)

    def test_coleman_liau(self):
        r = self.readability.coleman_liau()
        print(r)
        self.assertEqual(10.673162393162393, r.score)
        self.assertEqual('11', r.grade_level)

    def test_dale_chall(self):
        r = self.readability.dale_chall()
        print(r)
        self.assertEqual(9.32399010989011, r.score)
        self.assertEqual(['college'], r.grade_levels)

    def test_flesch(self):
        r = self.readability.flesch()
        print(r)
        self.assertEqual(51.039230769230784, r.score)
        self.assertEqual(['10', '11', '12'], r.grade_levels)
        self.assertEqual('fairly_difficult', r.ease)

    def test_flesch_kincaid(self):
        r = self.readability.flesch_kincaid()
        print(r)
        self.assertEqual(10.125531135531137, r.score)
        self.assertEqual('10', r.grade_level)

    def test_gunning_fog(self):
        r = self.readability.gunning_fog()
        print(r)
        self.assertEqual(12.4976800976801, r.score)
        self.assertEqual('12', r.grade_level)

    def test_linsear_write(self):
        r = self.readability.linsear_write()
        print(r)
        self.assertEqual(11.214285714285714, r.score)
        self.assertEqual('11', r.grade_level)

    def test_smog(self):
        text = """
        “On a June day sometime in the early 1990s, encouraged by his friend and fellow economist Jörgen Weibull, Abhijit went swimming in the Baltic. He leaped in and instantly jumped out—he claims that his teeth continued to chatter for the next three days. In 2018, also in June, we went to the Baltic in Stockholm, several hundred miles farther north than the previous encounter. This time it was literally child’s play; our children frolicked in the water.
        Wherever we went in Sweden, the unusually warm weather was a topic of conversation. It was probably a portent of something everyone felt, but for the moment it was hard not to be quite delighted with the new opportunities for outdoor life it offered.”. 
        """
        text = ' '.join(text for i in range(0, 5))

        readability = Readability(text)

        #Test SMOG with 30 sentences
        r1 = readability.smog()

        #Test SMOG with all sentences
        r2 = readability.smog(all_sentences=True)

        print("all_sentences=False: %s ; all_sentences=True: %s" % (r1, r2))
        self.assertEqual(12.516099999999998, r1.score)
        self.assertEqual('13', r1.grade_level)

        self.assertEqual(12.785403640627713, r2.score)
        self.assertEqual('13', r2.grade_level)

    def test_spache(self):
        r = self.readability.spache()
        print(r)
        self.assertEqual(7.164945054945054, r.score)
        self.assertEqual('7', r.grade_level)

    def test_print_stats(self):
        stats = self.readability.statistics()
        self.assertEqual(562, stats['num_letters'])
        self.assertEqual(117, stats['num_words'])
        self.assertEqual(7, stats['num_sentences'])
        self.assertEqual(20, stats['num_polysyllabic_words'])
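The ReadabilityAnalyser class below wraps the same library behind a dictionary-based API: each metric method writes its scores into a shared content dict (optionally recording errors instead of swallowing them), and the remaining helpers aggregate min/max/sum/avg statistics over a corpus of analysed records.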
class ReadabilityAnalyser:
    def __init__(self, text):
        self.readability = Readability(text)
        self.FLESCH_KINCAID = ['score', 'grade_level']
        self.FLESCH_EASE = ['score', 'ease', 'grade_level']
        self.DALE_CHALL = ['score', 'grade_level']
        self.ARI = ['score', 'grade_level', 'ages']
        self.CLI = ['score', 'grade_level']
        self.GUNNING_FOG = ['score', 'grade_level']
        self.SMOG = ['score', 'grade_level']
        self.SPACHE = ['score', 'grade_level']
        self.LINSEAR_WRITE = ['score', 'grade_level']
        self.values_index = self.initialize_value_index_array()

    def initialize_value_index_array(self):
        values_index = dict()
        values_index["flesch_kincaid"] = self.FLESCH_KINCAID
        values_index["flesch_ease"] = self.FLESCH_EASE
        values_index["dale_chall"] = self.DALE_CHALL
        values_index["ari"] = self.ARI
        values_index["cli"] = self.CLI
        values_index["gunning_fog"] = self.GUNNING_FOG
        values_index["smog_all"] = self.SMOG
        values_index["smog"] = self.SMOG
        values_index["spache"] = self.SPACHE
        values_index["linsear_write"] = self.LINSEAR_WRITE
        return values_index

    def flesch_kincaid(self, content, error_ignore=True):
        try:
            record = dict()
            fk = self.readability.flesch_kincaid()
            record['score'] = fk.score
            record['grade_level'] = fk.grade_level
            content["flesch_kincaid"] = record
        except ReadabilityException as e:
            if not error_ignore:
                content["flesch_kincaid"] = str(e)
                print(e)

    def flesch_ease(self, content, error_ignore=True):
        try:
            record = dict()
            flesch_ease = self.readability.flesch()
            record['score'] = flesch_ease.score
            record['ease'] = flesch_ease.ease
            record['grade_level'] = flesch_ease.grade_levels
            content['flesch_ease'] = record
        except ReadabilityException as e:
            if not error_ignore:
                content['flesch_ease'] = str(e)
                print(e)

    def dale_chall(self, content, error_ignore=True):
        try:
            record = dict()
            dale_chall = self.readability.dale_chall()
            record['score'] = dale_chall.score
            record['grade_level'] = dale_chall.grade_levels
            content['dale_chall'] = record
        except ReadabilityException as e:
            if not error_ignore:
                content['dale_chall'] = str(e)
                print(e)

    def automated_readability_index(self, content, error_ignore=True):
        try:
            record = dict()
            ari = self.readability.ari()
            record['score'] = ari.score
            record['grade_level'] = ari.grade_levels
            record['ages'] = ari.ages
            content['ari'] = record
        except ReadabilityException as e:
            if not error_ignore:
                content['ari'] = str(e)
                print(e)

    def coleman_liau_index(self, content, error_ignore=True):
        try:
            record = dict()
            coleman_liau = self.readability.coleman_liau()
            record['score'] = coleman_liau.score
            record['grade_level'] = coleman_liau.grade_level
            content['cli'] = record
        except ReadabilityException as e:
            if not error_ignore:
                content['cli'] = str(e)
                print(e)

    def gunning_fog_index(self, content, error_ignore=True):
        try:
            record = dict()
            gunning_fog = self.readability.gunning_fog()
            record['score'] = gunning_fog.score
            record['grade_level'] = gunning_fog.grade_level
            content['gunning_fog'] = record
        except ReadabilityException as e:
            if not error_ignore:
                content['gunning_fog'] = str(e)
                print(e)

    def smog(self, content, all_sentences=False, error_ignore=True):
        record = dict()
        try:
            if all_sentences:
                smog = self.readability.smog(all_sentences=all_sentences)
                record['score'] = smog.score
                record['grade_level'] = smog.grade_level
                content['smog_all'] = record
            else:
                smog = self.readability.smog()
                record['score'] = smog.score
                record['grade_level'] = smog.grade_level
                content['smog'] = record
        except ReadabilityException as e:
            if not error_ignore:
                if all_sentences:
                    content['smog_all'] = str(e)
                else:
                    content['smog'] = str(e)
                print(e)

    def spache_readability_formula(self, content, error_ignore=True):
        try:
            record = dict()
            spache = self.readability.spache()
            record['score'] = spache.score
            record['grade_level'] = spache.grade_level
            content['spache'] = record
        except ReadabilityException as e:
            if not error_ignore:
                content['spache'] = str(e)
                print(e)

    def linsear_write(self, content, error_ignore=True):
        try:
            record = dict()
            linsear_write = self.readability.linsear_write()
            record['score'] = linsear_write.score
            record['grade_level'] = linsear_write.grade_level
            content['linsear_write'] = record
        except ReadabilityException as e:
            if not error_ignore:
                content['linsear_write'] = str(e)
                print(e)

    @staticmethod
    def check_readability_from_file(input_json_file, output_json_file):
        result = []
        json_file = load_as_json(input_json_file)
        for record in json_file:
            analyser = ReadabilityAnalyser(record['text'])
            analysed_file_record = dict()
            analysed_file_record['file'] = record['file']
            analysed_file_record['category'] = record['category']
            analyser.flesch_kincaid(analysed_file_record)
            analyser.flesch_ease(analysed_file_record)
            analyser.dale_chall(analysed_file_record)
            analyser.automated_readability_index(analysed_file_record)
            analyser.coleman_liau_index(analysed_file_record)
            analyser.gunning_fog_index(analysed_file_record)
            analyser.smog(analysed_file_record)
            analyser.smog(analysed_file_record, True)
            analyser.spache_readability_formula(analysed_file_record)
            analyser.linsear_write(analysed_file_record)
            result.append(analysed_file_record)
        save_as_json(result, output_json_file)

    def check_readability(self, use_methods=None, errors_included=True):
        result_analysis = dict()
        if use_methods is None or 'flesch_kincaid' in use_methods:
            self.flesch_kincaid(result_analysis,
                                error_ignore=not errors_included)
        if use_methods is None or 'flesch_ease' in use_methods:
            self.flesch_ease(result_analysis, error_ignore=not errors_included)
        if use_methods is None or 'dale_chall' in use_methods:
            self.dale_chall(result_analysis, error_ignore=not errors_included)
        if use_methods is None or 'ari' in use_methods:
            self.automated_readability_index(result_analysis,
                                             error_ignore=not errors_included)
        if use_methods is None or 'cli' in use_methods:
            self.coleman_liau_index(result_analysis,
                                    error_ignore=not errors_included)
        if use_methods is None or 'gunning_fog' in use_methods:
            self.gunning_fog_index(result_analysis,
                                   error_ignore=not errors_included)
        if use_methods is None or 'smog' in use_methods:
            self.smog(result_analysis, error_ignore=not errors_included)
        if use_methods is None or 'smog_all' in use_methods:
            self.smog(result_analysis, True, error_ignore=not errors_included)
        if use_methods is None or 'spache' in use_methods:
            self.spache_readability_formula(result_analysis,
                                            error_ignore=not errors_included)
        if use_methods is None or 'linsear_write' in use_methods:
            self.linsear_write(result_analysis,
                               error_ignore=not errors_included)
        return result_analysis

    @staticmethod
    def initialize_basic_dict(categories, values, process_category=True):
        record = dict()
        for value in values:
            record = ReadabilityAnalyser.initialize_dict(record, value)
        if process_category:
            for category in categories:
                record[category] = ReadabilityAnalyser.initialize_basic_dict(
                    categories, values, False)
        return record

    @staticmethod
    def initialize_dict(record, value):
        record['min_' + value] = 999999999
        record['max_' + value] = -999999999
        record['sum_' + value] = 0
        record['avg_' + value] = 0
        record['freq_' + value] = 0
        record['skipped_' + value] = 0
        return record

    def initialize_values(self, statistic, categories):
        statistic["flesch_kincaid"] = self.initialize_basic_dict(
            categories, self.FLESCH_KINCAID)
        statistic["flesch_ease"] = self.initialize_basic_dict(
            categories, self.FLESCH_EASE)
        statistic["dale_chall"] = self.initialize_basic_dict(
            categories, self.DALE_CHALL)
        statistic["ari"] = self.initialize_basic_dict(categories, self.ARI)
        statistic["cli"] = self.initialize_basic_dict(categories, self.CLI)
        statistic["gunning_fog"] = self.initialize_basic_dict(
            categories, self.GUNNING_FOG)
        statistic["smog_all"] = self.initialize_basic_dict(
            categories, self.SMOG)
        statistic["smog"] = self.initialize_basic_dict(categories, self.SMOG)
        statistic["spache"] = self.initialize_basic_dict(
            categories, self.SPACHE)
        statistic["linsear_write"] = self.initialize_basic_dict(
            categories, self.LINSEAR_WRITE)
        statistic['indexes'] = [
            "flesch_kincaid", "flesch_ease", "dale_chall", "ari", "cli",
            "gunning_fog", "smog_all", "smog", "spache", "linsear_write"
        ]
        statistic['categories'] = categories

    @staticmethod
    def fill_min_max_sum_category(index, value_index, statistics,
                                  readability_index, category):
        if index[value_index] < statistics[readability_index][category][
                'min_' + value_index]:
            statistics[readability_index][category][
                'min_' + value_index] = index[value_index]
        if index[value_index] > statistics[readability_index][category][
                'max_' + value_index]:
            statistics[readability_index][category][
                'max_' + value_index] = index[value_index]
        statistics[readability_index][category]['sum_' + value_index] = \
            statistics[readability_index][category]['sum_' + value_index] + index[value_index]

    @staticmethod
    def fill_min_max_sum_category_value(value, value_index, statistics,
                                        readability_index, category):
        if value < statistics[readability_index][category]['min_' +
                                                           value_index]:
            statistics[readability_index][category]['min_' +
                                                    value_index] = value
        if value > statistics[readability_index][category]['max_' +
                                                           value_index]:
            statistics[readability_index][category]['max_' +
                                                    value_index] = value
        statistics[readability_index][category]['sum_' + value_index] = \
            statistics[readability_index][category]['sum_' + value_index] + value

    @staticmethod
    def fill_min_max_sum(index, value_index, statistics, readability_index):
        if index[value_index] < statistics[readability_index]['min_' +
                                                              value_index]:
            statistics[readability_index]['min_' +
                                          value_index] = index[value_index]
        if index[value_index] > statistics[readability_index]['max_' +
                                                              value_index]:
            statistics[readability_index]['max_' +
                                          value_index] = index[value_index]
        statistics[readability_index]['sum_' + value_index] = \
            statistics[readability_index]['sum_' + value_index] + index[value_index]

    @staticmethod
    def fill_min_max_sum_value(value, value_index, statistics,
                               readability_index):
        if value < statistics[readability_index]['min_' + value_index]:
            statistics[readability_index]['min_' + value_index] = value
        if value > statistics[readability_index]['max_' + value_index]:
            statistics[readability_index]['max_' + value_index] = value
        statistics[readability_index]['sum_' + value_index] = \
            statistics[readability_index]['sum_' + value_index] + value

    @staticmethod
    def cast_to_float(value):
        try:
            return float(value)
        except ValueError:
            return None
        except TypeError:
            return None

    def record_analysis(self, record, statistics):
        for readability_index in statistics['indexes']:
            if 'category' in record:
                category = record['category']
                if readability_index in record:
                    index = record[readability_index]
                    for value_index in self.values_index[readability_index]:
                        if value_index in index:
                            obtained_value = ReadabilityAnalyser.cast_to_float(
                                index[value_index])

                            if obtained_value is not None:
                                index[value_index] = obtained_value
                                ReadabilityAnalyser.fill_min_max_sum_category(
                                    index, value_index, statistics,
                                    readability_index, category)
                                if 'freq_' + value_index not in statistics[
                                        readability_index][category]:
                                    statistics[readability_index][category][
                                        'freq_' + value_index] = 0
                                statistics[readability_index][category]['freq_' + value_index] = \
                                    statistics[readability_index][category]['freq_' + value_index] + 1
                            elif isinstance(index[value_index], list):
                                for rec in index[value_index]:

                                    if value_index not in statistics[
                                            readability_index][category]:
                                        statistics[readability_index][
                                            category][value_index] = dict()
                                    if isinstance(rec, str):
                                        if 'freq_' + rec not in statistics[
                                                readability_index][category][
                                                    value_index]:
                                            statistics[readability_index][
                                                category][value_index]['freq_'
                                                                       +
                                                                       rec] = 0
                                        statistics[readability_index][category][value_index]['freq_' + rec] = \
                                            statistics[readability_index][category][value_index]['freq_' + rec] + 1
                                    else:
                                        ReadabilityAnalyser.fill_min_max_sum_category_value(
                                            rec, value_index, statistics,
                                            readability_index, category)
                                        if 'freq_' + value_index not in \
                                                statistics[readability_index][category]:
                                            statistics[readability_index][
                                                category]['freq_' +
                                                          value_index] = 0
                                        statistics[readability_index][category]['freq_' + value_index] = \
                                            statistics[readability_index][category]['freq_' + value_index] + 1
                            elif isinstance(index[value_index], str):
                                rec = index[value_index]
                                if value_index not in statistics[
                                        readability_index][category]:
                                    statistics[readability_index][category][
                                        value_index] = dict()
                                if 'freq_' + rec not in statistics[
                                        readability_index][category][
                                            value_index]:
                                    statistics[readability_index][category][
                                        value_index]['freq_' + rec] = 0
                                statistics[readability_index][category][value_index]['freq_' + rec] = \
                                    statistics[readability_index][category][value_index]['freq_' + rec] + 1
                            else:
                                print("Uncategorized: " +
                                      str(index[value_index]))

                        else:
                            statistics[readability_index][category]['skipped_' + value_index] = \
                                statistics[readability_index][category]['skipped_' + value_index] + 1
                else:
                    for value_index in self.values_index[readability_index]:
                        statistics[readability_index][category]['skipped_' + value_index] = \
                            statistics[readability_index][category]['skipped_' + value_index] + 1
            else:
                print("THIS: " + record)

            if readability_index in record:
                index = record[readability_index]
                for value_index in self.values_index[readability_index]:
                    if value_index in index:
                        obtained_value = ReadabilityAnalyser.cast_to_float(
                            index[value_index])

                        if obtained_value is not None:
                            index[value_index] = float(index[value_index])
                            ReadabilityAnalyser.fill_min_max_sum(
                                index, value_index, statistics,
                                readability_index)
                            if 'freq_' + value_index not in statistics[
                                    readability_index]:
                                statistics[readability_index]['freq_' +
                                                              value_index] = 0
                            statistics[readability_index]['freq_' + value_index] = \
                                statistics[readability_index]['freq_' + value_index] + 1
                        elif isinstance(index[value_index], list):
                            for rec in index[value_index]:
                                if value_index not in statistics[
                                        readability_index]:
                                    statistics[readability_index][
                                        value_index] = dict()

                                if isinstance(rec, str):
                                    # print(value_index + " " + str(index[value_index]))
                                    if 'freq_' + rec not in statistics[
                                            readability_index][value_index]:
                                        statistics[readability_index][
                                            value_index]['freq_' + rec] = 0
                                    statistics[readability_index][value_index]['freq_' + rec] = \
                                        statistics[readability_index][value_index]['freq_' + rec] + 1
                                else:
                                    ReadabilityAnalyser.fill_min_max_sum_value(
                                        rec, value_index, statistics,
                                        readability_index)
                                    if 'freq_' + value_index not in statistics[
                                            readability_index]:
                                        statistics[readability_index][
                                            'freq_' + value_index] = 0
                                    statistics[readability_index]['freq_' + value_index] = \
                                        statistics[readability_index]['freq_' + value_index] + 1
                        elif isinstance(index[value_index], str):
                            rec = index[value_index]
                            if value_index not in statistics[
                                    readability_index]:
                                statistics[readability_index][
                                    value_index] = dict()
                            if 'freq_' + rec not in statistics[
                                    readability_index][value_index]:
                                statistics[readability_index][value_index][
                                    'freq_' + rec] = 0
                            statistics[readability_index][value_index]['freq_' + rec] = \
                                statistics[readability_index][value_index]['freq_' + rec] + 1
                        else:
                            print("Uncategorized: " + str(index[value_index]))

                    else:
                        statistics[readability_index]['skipped_' + value_index] = \
                            statistics[readability_index]['skipped_' + value_index] + 1
            else:
                for value_index in self.values_index[readability_index]:
                    statistics[readability_index]['skipped_' + value_index] = \
                        statistics[readability_index]['skipped_' + value_index] + 1

    def count_average(self, statistics):
        for readability_index in statistics['indexes']:
            for value_index in self.values_index[readability_index]:
                if statistics[readability_index]['sum_' + value_index] != 0:
                    statistics[readability_index]['avg_' + value_index] = \
                        statistics[readability_index]['sum_' + value_index] / statistics[readability_index][
                            'freq_' + value_index]
                else:
                    statistics[readability_index]['avg_' + value_index] = 0
                for category in statistics['categories']:
                    if statistics[readability_index][category][
                            'sum_' + value_index] != 0:
                        statistics[readability_index][category]['avg_' + value_index] = \
                            statistics[readability_index][category]['sum_' + value_index] / \
                            statistics[readability_index][category]['freq_' + value_index]
                    else:
                        statistics[readability_index][category][
                            'avg_' + value_index] = 0

    def analyse_readability_file(self, readability_file, categories):
        statistic = dict()
        self.initialize_values(statistic, categories)

        file = load_as_json(readability_file)
        for record in file:
            self.record_analysis(record, statistic)

        self.count_average(statistic)
        return statistic

    def analyse_readability_file_save_results(self, readability_file,
                                              output_statistics_file,
                                              categories):
        statistics = self.analyse_readability_file(readability_file,
                                                   categories)
        save_as_json(statistics, output_statistics_file)
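A brief usage sketch for the analyser (the input variable is illustrative; load_as_json and save_as_json are project helpers not shown in this snippet):

analyser = ReadabilityAnalyser(long_document_text)  # any text of 100+ words
scores = analyser.check_readability(use_methods=['flesch_kincaid', 'smog'])
# -> {'flesch_kincaid': {'score': ..., 'grade_level': ...}, 'smog': {...}}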
def suggest():

    #get language
    lang = request.args.get('lang')

    if lang == 'en':

        #get url
        url = request.args.get('url')

        #get the html from the URL
        import requests
        r = requests.get(url)
        html = r.text

        #get the html content as text - get content from the "main" tag
        from bs4 import BeautifulSoup
        original_soup = BeautifulSoup(html, features="lxml").find('main')
        original_text = original_soup.get_text()

        #get initial readability total_score
        from readability import Readability
        r_o = Readability(original_text)
        original_fk = r_o.flesch_kincaid()

        #add periods after bullet points and headings so that the Flesch-Kincaid score considers them as sentences
        html1 = html.replace("</li>", ".</li>")
        html2 = html1.replace("</h1>", ".</h1>")
        html3 = html2.replace("</h2>", ".</h2>")
        html4 = html3.replace("</h3>", ".</h3>")
        html5 = html4.replace("</h4>", ".</h4>")
        html6 = html5.replace("</h5>", ".</h5>")
        html7 = html6.replace("</h6>", ".</h6>")

        #get adjusted readability total_score
        revised_soup = BeautifulSoup(html7, features="lxml").find('main')
        revised_text = revised_soup.get_text()

        from readability import Readability
        r_f = Readability(revised_text)
        final_fk = r_f.flesch_kincaid()

        #tokenize the text for processing
        import nltk
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(revised_text)
        words = []
        for word in tokens:
            words.append(word.lower())

        #remove stop words from the tokens to get only the meaningful words
        nltk.download('stopwords')
        sw = nltk.corpus.stopwords.words('english')
        words_ns = []
        for word in words:
            if word not in sw:
                words_ns.append(word)

        #get the 20 most used words in the text
        from nltk import FreqDist
        fdist1 = FreqDist(words_ns)
        most_common = fdist1.most_common(20)

        #get all headings and calculate how many words on average between headings
        headings = original_soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        hratio = len(words) / (len(headings))

        #get all paragraphs and all bulleted list, and calculate how many words per paragraph on average
        paragraphs = original_soup.findAll(['p', 'ul'])
        pratio = (len(words) / len(paragraphs))

        #calculate points for readability
        if final_fk.score <= 6:
            fkpoints = 60
        elif final_fk.score >= 18:
            fkpoints = 0
        else:
            fkpoints = (60 - ((final_fk.score - 6) * 5))

        #calculate points for number of words between headings
        if hratio <= 40:
            hpoints = 20
        elif hratio >= 200:
            hpoints = 0
        else:
            hpoints = (20 - ((hratio - 40) * 0.125))

        #calculate points for number of words per paragraph
        if pratio <= 30:
            ppoints = 20
        elif pratio >= 80:
            ppoints = 0
        else:
            ppoints = (20 - ((pratio - 30) * 0.4))

        #add all points
        total_score = fkpoints + hpoints + ppoints
        total_score = format(total_score, '.2f')
        fkpoints = format(fkpoints, '.2f')
        final_fk_score = format(final_fk.score, '.2f')
        hpoints = format(hpoints, '.2f')
        hratio = format(hratio, '.2f')
        ppoints = format(ppoints, '.2f')
        pratio = format(pratio, '.2f')
        total_words = len(words)

        return render_template("read_score_en.html",
                               total_score=total_score,
                               fkpoints=fkpoints,
                               final_fk_score=final_fk_score,
                               hpoints=hpoints,
                               hratio=hratio,
                               ppoints=ppoints,
                               pratio=pratio,
                               total_words=total_words,
                               most_common=most_common)

    if lang == 'fr':

        #get url
        url = request.args.get('url')

        #get the html from the URL
        import requests
        r = requests.get(url)
        html = r.text

        #get the html content as text - get content from the "main" tag
        from bs4 import BeautifulSoup
        original_soup = BeautifulSoup(html, features="lxml").find('main')
        original_text = original_soup.get_text()

        #get initial readability total_score
        from readability import Readability
        r_o = Readability(original_text)
        original_fk = r_o.flesch_kincaid()

        #add periods after bullet points and headings so that the Flesch-Kincaid score considers them as sentences
        html1 = html.replace("</li>", ".</li>")
        html2 = html1.replace("</h1>", ".</h1>")
        html3 = html2.replace("</h2>", ".</h2>")
        html4 = html3.replace("</h3>", ".</h3>")
        html5 = html4.replace("</h4>", ".</h4>")
        html6 = html5.replace("</h5>", ".</h5>")
        html7 = html6.replace("</h6>", ".</h6>")

        #get adjusted readability total_score
        revised_soup = BeautifulSoup(html7, features="lxml").find('main')
        revised_text = revised_soup.get_text()

        from readability import Readability
        r_f = Readability(revised_text)
        final_fk = r_f.flesch_kincaid()

        #tokenize the text for processing
        import nltk
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(revised_text)
        words = []
        for word in tokens:
            words.append(word.lower())

        #remove stop words from the tokens to get only the meaningful words
        nltk.download('stopwords')
        sw = nltk.corpus.stopwords.words('french')
        words_ns = []
        for word in words:
            if word not in sw:
                words_ns.append(word)

        #get the 20 most used words in the text
        from nltk import FreqDist
        fdist1 = FreqDist(words_ns)
        most_common = fdist1.most_common(20)

        #get all headings and calculate how many words on average between headings
        headings = original_soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        hratio = len(words) / (len(headings))

        #get all paragraphs and all bulleted list, and calculate how many words per paragraph on average
        paragraphs = original_soup.findAll(['p', 'ul'])
        pratio = (len(words) / len(paragraphs))

        #calculate points for readability
        if final_fk.score <= 6:
            fkpoints = 60
        elif final_fk.score >= 18:
            fkpoints = 0
        else:
            fkpoints = (60 - ((final_fk.score - 6) * 5))

        #calculate points for number of words between headings
        if hratio <= 40:
            hpoints = 20
        elif hratio >= 200:
            hpoints = 0
        else:
            hpoints = (20 - ((hratio - 40) * 0.125))

        #calculate points for number of words per paragraph
        if pratio <= 30:
            ppoints = 20
        elif pratio >= 80:
            ppoints = 0
        else:
            ppoints = (20 - ((pratio - 30) * 0.4))

        #add all points
        total_score = fkpoints + hpoints + ppoints
        total_score = format(total_score, '.2f')
        fkpoints = format(fkpoints, '.2f')
        final_fk_score = format(final_fk.score, '.2f')
        hpoints = format(hpoints, '.2f')
        hratio = format(hratio, '.2f')
        ppoints = format(ppoints, '.2f')
        pratio = format(pratio, '.2f')
        total_words = len(words)

        return render_template("read_score_fr.html",
                               total_score=total_score,
                               fkpoints=fkpoints,
                               final_fk_score=final_fk_score,
                               hpoints=hpoints,
                               hratio=hratio,
                               ppoints=ppoints,
                               pratio=pratio,
                               total_words=total_words,
                               most_common=most_common)
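The 'en' and 'fr' branches above differ only in the stop-word corpus and the rendered template; the variant below avoids part of that duplication by computing both the English and French word lists up front and passing both to the template.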
def suggest():

    #get language
    lang = request.args.get('lang', 'en')
    import nltk
    nltk.download('punkt')

    if lang == 'en':
        word_column_names = ['Count', 'Word']

    if lang == 'fr':
        word_column_names = ['Nombre', 'Mot']

    #get url
    url = request.args.get('url', 'https://www.canada.ca/en.html')

    #get the html from the URL
    import requests
    r = requests.get(url)
    html = r.text

    #get the html content as text - get content from the "main" tag
    from bs4 import BeautifulSoup
    original_soup = BeautifulSoup(html, features="lxml").find('main')
    original_text = original_soup.get_text()
    original_text = original_text.replace('..', '.')
    original_text = original_text.replace('.', '. ')
    original_text = original_text[:original_text.find("defPreFooter")]
    original_text = original_text.replace('\n', '')
    original_text = original_text.replace('\t', '')
    original_text = original_text.replace('\r', '')

    #get initial readability total_score
    from readability import Readability
    r_o = Readability(original_text)
    original_fk = r_o.flesch_kincaid()
    original_score = original_fk.score
    original_score = format(original_score, '.2f')

    #add periods after bullet points and headings so that the Flesch-Kincaid score considers them as sentences
    html1 = html.replace("</li>", ".</li>")
    html2 = html1.replace("</h1>", ".</h1>")
    html3 = html2.replace("</h2>", ".</h2>")
    html4 = html3.replace("</h3>", ".</h3>")
    html5 = html4.replace("</h4>", ".</h4>")
    html6 = html5.replace("</h5>", ".</h5>")
    html7 = html6.replace("</h6>", ".</h6>")

    #get adjusted readability total_score
    revised_soup = BeautifulSoup(html7, features="lxml").find('main')
    for t in revised_soup.select('table'):
        t.extract()
    revised_text = revised_soup.get_text()
    revised_text = revised_text.replace('..', '.')
    revised_text = revised_text.replace('.', '. ')
    revised_text = revised_text[:revised_text.find("defPreFooter")]
    revised_text = revised_text.replace('\n', '')
    revised_text = revised_text.replace('\t', '')
    revised_text = revised_text.replace('\r', '')

    from readability import Readability
    r_f = Readability(revised_text)
    final_fk = r_f.flesch_kincaid()


    #tokenize the text for processing

    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(revised_text)
    words = []
    for word in tokens:
        words.append(word.lower())

    #remove stop words from the tokens to get only the meaningful words
    nltk.download('stopwords')
    sw_en = nltk.corpus.stopwords.words('english')
    words_ns_en = []
    for word in words:
        if word not in sw_en:
            words_ns_en.append(word)

    #get the 20 most used words in the text
    from nltk import FreqDist
    fdist1_en = FreqDist(words_ns_en)
    most_common_en = fdist1_en.most_common(20)
    mc_en = pd.DataFrame(most_common_en, columns=['Word', 'Count'])
    mc_en = mc_en[['Count', 'Word']]


    sw_fr = nltk.corpus.stopwords.words('french')
    words_ns_fr = []
    for word in words:
        if word not in sw_fr:
            words_ns_fr.append(word)

    #get the 20 most used words in the text
    from nltk import FreqDist
    fdist1_fr = FreqDist(words_ns_fr)
    most_common_fr = fdist1_fr.most_common(20)
    mc_fr = pd.DataFrame(most_common_fr, columns=['Mot', 'Nombre'])
    mc_fr = mc_fr[['Nombre', 'Mot']]



    #get all headings and calculate how many words on average between headings
    headings = original_soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    len_headings = len(headings)
    hratio = len(words)/(len(headings))

    #get all paragraphs and all bulleted list, and calculate how many words per paragraph on average
    paragraphs = original_soup.findAll(['p', 'ul'])
    len_par = len(paragraphs)
    pratio = (len(words)/len(paragraphs))

    #calculate points for readability
    if final_fk.score <= 6:
        fkpoints = 60
    elif final_fk.score >= 18:
        fkpoints = 0
    else:
        fkpoints = 60 - ((final_fk.score - 6) * 5)

    #calculate points for number of words between headings
    if hratio <= 40:
        hpoints = 20
    elif hratio >= 200:
        hpoints = 0
    else:
        hpoints = 20 - ((hratio - 40) * 0.125)

    #calculate points for number of words per paragraph
    if pratio <= 30:
        ppoints = 20
    elif pratio >= 80:
        ppoints = 0
    else:
        ppoints = 20 - ((pratio - 30) * 0.4)

    #add all points
    total_score = fkpoints + hpoints + ppoints
    total_score = format(total_score, '.2f')
    fkpoints = format(fkpoints, '.2f')
    final_fk_score = format(final_fk.score, '.2f')
    hpoints = format(hpoints, '.2f')
    hratio = format(hratio, '.2f')
    ppoints = format(ppoints, '.2f')
    pratio = format(pratio, '.2f')
    total_words = len(words)

    total_score = float(total_score)
    if total_score >= 90:
        if lang == 'en':
            score = 'Outstanding!'
        if lang == 'fr':
            score = 'Excellent!'
    elif 80 <= total_score < 90:
        if lang == 'en':
            score = 'Very good!'
        if lang == 'fr':
            score = 'Très bien!'
    elif 70 <= total_score < 80:
        if lang == 'en':
            score = 'Not too bad'
        if lang == 'fr':
            score = 'Pas mal'
    elif 60 <= total_score < 70:
        if lang == 'en':
            score = 'Needs work'
        if lang == 'fr':
            score = 'À travailler'
    elif 50 <= total_score < 60:
        if lang == 'en':
            score = 'Needs a lot of work'
        if lang == 'fr':
            score = 'Besoin de beaucoup de travail'
    else:
        if lang == 'en':
            score = "Please don't do this to people..."
        if lang == 'fr':
            score = "S'il vous plaît, il faut faire quelque chose..."

    if lang == "en":
        return render_template("read_score_en.html", total_score = total_score, fkpoints = fkpoints, final_fk_score = final_fk_score, hpoints = hpoints, hratio = hratio, ppoints = ppoints, pratio = pratio, total_words = total_words, url = url, lang = lang, word_column_names = word_column_names, row_data_word_en = list(mc_en.values.tolist()), row_data_word_fr = list(mc_fr.values.tolist()), zip = zip, score = score, len_headings = len_headings, len_par = len_par, original_score = original_score)

    if lang == "fr":
        return render_template("read_score_fr.html", total_score = total_score, fkpoints = fkpoints, final_fk_score = final_fk_score, hpoints = hpoints, hratio = hratio, ppoints = ppoints, pratio = pratio, total_words = total_words, url = url, lang = lang, word_column_names = word_column_names, row_data_word_en = list(mc_en.values.tolist()), row_data_word_fr = list(mc_fr.values.tolist()), zip = zip, score = score, len_headings = len_headings, len_par = len_par, original_score = original_score)
from readability import Readability

with open('C:\\...\\ch15_MLK-IHaveADream.txt') as text:
    text_up = text.read()

r = Readability(text_up)
flesch_kincaidR = r.flesch_kincaid()

print('The text has a grade ' + flesch_kincaidR.grade_level +
      ' readability level.')
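Note that grade_level is exposed as a string (the tests above assert values such as '10'), which is why it can be concatenated directly into the message; the numeric value is available separately as the score attribute.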
Example #11
from readability import Readability

path = "IAEA_output"
str = ""
with open(path, encoding="utf-8") as f:
    for line in f.readlines():
        str = line
r = Readability(str)
fk = r.flesch_kincaid()
print(fk.score)
print(fk.grade_level)

s = r.smog()
print(s.score)
print(s.grade_level)

dc = r.dale_chall()
print(dc.score)
print(dc.grade_levels)

cl = r.coleman_liau()
print(cl.score)
print(cl.grade_level)

gf = r.gunning_fog()
print(gf.score)
print(gf.grade_level)

# lw = r.linsear_write()
# print(lw.score)
# print(lw.grade_level)
import pandas as pd
from readability import Readability
import sys
import csv
import json
csv.field_size_limit(sys.maxsize)

description_features = {}
with open(
        "detail_description_data/detail-desc-text-{}.tsv".format(sys.argv[1]),
        'r') as tsvin:
    tsvin = csv.reader(tsvin, delimiter='\t')
    for row in tsvin:
        patent_id = row[0]
        detail_description_text = row[1]
        try:
            description_word_count = len(detail_description_text.split())
            fig_counts = detail_description_text.lower().count(
                "fig.") + detail_description_text.lower().count("figs.")
            r = Readability(detail_description_text.replace('aed-512', ''))
            fk_score = r.flesch_kincaid().score
        except Exception:
            # Skip records the library cannot score (e.g. too few words)
            continue
        description_features[patent_id] = (description_word_count, fk_score,
                                           fig_counts)
with open(
        "feature_data/description_features/description_features_{}.json".
        format(sys.argv[1]), 'w') as f:
    json.dump(description_features, f)
l_flesch_kincaid = []
l_flesch = []
l_gunning_fog = []
l_coleman_liau = []
l_dale_chall = []
l_ari = []
l_linsear_write = []
l_spache = []
l_flesch_ease = []

for i in os.listdir(PATH):
    if not i.startswith('.'):
        if i not in l_not_use:
            with open(PATH + i, 'r') as f:
                text = f.read()
                r = Readability(clean(text))
                s1 = r.flesch_kincaid()
                s2 = r.flesch()
                s3 = r.gunning_fog()
                s4 = r.coleman_liau()
                s5 = r.dale_chall()
                s6 = r.ari()
                s7 = r.linsear_write()
                # r.smog()
                s8 = r.spache()
                l_flesch_kincaid.append(s1.score)
                l_flesch.append(s2.score)
                l_flesch_ease.append(s2.ease)
                l_gunning_fog.append(s3.score)
                l_coleman_liau.append(s4.score)
                l_dale_chall.append(s5.score)
                l_ari.append(s6.score)
                l_linsear_write.append(s7.score)
                l_spache.append(s8.score)
from typing import Optional

def __flesch_kincaid(r: Readability) -> Optional[float]:
    try:
        # grade_level is a string such as '10'
        lvl = r.flesch_kincaid().grade_level
        return float(lvl)
    except ReadabilityException:
        # The text was too short for the library to score
        return None
Example #15
class ReadabilityTest(unittest.TestCase):
    def setUp(self):
        text = """
        In linguistics, the Gunning fog index is a readability test for English writing. The index estimates the years of formal education a person needs to understand the text on the first reading. For instance, a fog index of 12 requires the reading level of a United States high school senior (around 18 years old). The test was developed in 1952 by Robert Gunning, an American businessman who had been involved in newspaper and textbook publishing.
        The fog index is commonly used to confirm that text can be read easily by the intended audience. Texts for a wide audience generally need a fog index less than 12. Texts requiring near-universal understanding generally need an index less than 8.
        """
        self.readability = Readability(text)

    def test_ari(self):
        r = self.readability.ari()
        print(r)
        self.assertEqual(9.551245421245422, r.score)
        self.assertEqual(['10'], r.grade_levels)
        self.assertEqual([15, 16], r.ages)

    def test_coleman_liau(self):
        r = self.readability.coleman_liau()
        print(r)
        self.assertEqual(10.673162393162393, r.score)
        self.assertEqual('11', r.grade_level)

    def test_dale_chall(self):
        r = self.readability.dale_chall()
        print(r)
        self.assertEqual(9.32399010989011, r.score)
        self.assertEqual(['college'], r.grade_levels)

    def test_flesch(self):
        r = self.readability.flesch()
        print(r)
        self.assertEqual(51.039230769230784, r.score)
        self.assertEqual(['10', '11', '12'], r.grade_levels)
        self.assertEqual('fairly_difficult', r.ease)

    def test_flesch_kincaid(self):
        r = self.readability.flesch_kincaid()
        print(r)
        self.assertEqual(10.125531135531137, r.score)
        self.assertEqual('10', r.grade_level)

    def test_gunning_fog(self):
        r = self.readability.gunning_fog()
        print(r)
        self.assertEqual(12.4976800976801, r.score)
        self.assertEqual('12', r.grade_level)

    def test_linsear_write(self):
        r = self.readability.linsear_write()
        print(r)
        self.assertEqual(11.214285714285714, r.score)
        self.assertEqual('11', r.grade_level)

    def test_smog(self):
        text = """
        In linguistics, the Gunning fog index is a readability test for English writing. The index estimates the years of formal education a person needs to understand the text on the first reading. For instance, a fog index of 12 requires the reading level of a United States high school senior (around 18 years old). The test was developed in 1952 by Robert Gunning, an American businessman who had been involved in newspaper and textbook publishing.
        The fog index is commonly used to confirm that text can be read easily by the intended audience. Texts for a wide audience generally need a fog index less than 12. Texts requiring near-universal understanding generally need an index less than 8.
        """
        text = ' '.join(text for i in range(0, 5))

        readability = Readability(text)
        r = readability.smog()

        print(r)
        self.assertEqual(12.516099999999998, r.score)
        self.assertEqual('13', r.grade_level)

    def test_spache(self):
        r = self.readability.spache()
        print(r)
        self.assertEqual(7.164945054945054, r.score)
        self.assertEqual('7', r.grade_level)

    def test_print_stats(self):
        stats = self.readability.statistics()
        self.assertEqual(562, stats['num_letters'])
        self.assertEqual(117, stats['num_words'])
        self.assertEqual(7, stats['num_sentences'])
        self.assertEqual(20, stats['num_polysyllabic_words'])
Example #16

#------------------
# Readability
#------------------

st.header('Readability')

# Context 
passage = st.text_area("Candidate Bible Passage (English)", value='', 
        max_chars=None, key='readability_passage')

# Calculate and display readability only when the button is pressed, so the
# empty initial text area does not make Readability raise on page load
if st.button('Assess Readability', key=None):
    r = Readability(passage)
    data = [
            ['Flesch-Kincaid Score', r.flesch_kincaid().score],
            ['Flesch Reading Ease', r.flesch().ease],
            ['Dale Chall Readability Score', r.dale_chall().score],
            ['Automated Readability Index Score', r.ari().score],
            ['Coleman Liau Index', r.coleman_liau().score],
            ['Gunning Fog', r.gunning_fog().score],
            ['Linsear Write', r.linsear_write().score],
            ['Spache Readability Formula', r.spache().score]
            ]
    df = pd.DataFrame(data, columns=['Readability Metric', 'Value'])
    st.write(df)
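Assuming this snippet is saved as, say, app.py (a hypothetical filename), the page is served with streamlit run app.py; pressing the button then scores whatever passage has been entered in the text area.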

Example #17
def generate_caption_stats(dataframe: pd.DataFrame,
                           pos_tag_stats: bool = True,
                           readability_scores: bool = True,
                           n_spacy_workers: int = 6,
                           spacy_model: str = "en_core_web_lg",
                           backend: MetadataGeneratorBackend = MetadataGeneratorBackend.SPACY):
    logger.info(f"Generating caption statistics using {backend.upper()}...")
    start = time.time()

    # Tokens and sentences
    num_tok = []
    num_sent = []
    # Min and Max length of sentences
    min_sent_len = []
    max_sent_len = []

    # Named Entities
    num_ne = []
    ne_texts = []  # surface form of the NEs
    ne_types = []  # types of the NEs

    # POS Tags
    # counts
    num_noun = []  # nouns (cat, dog, house, tree, ...)
    num_propn = []  # proper nouns (Denver, Hamburg, Peter, Tesla, ...)
    num_conj = []  # conjunctions (and, or, ...)
    num_verb = []  # verbs
    num_sym = []  # symbols (!,#,?, ...)
    num_num = []  # numbers (IV, 1 billion, 1312, ...)
    num_adp = []  # adpositions (on, under, in, at, ...)
    num_adj = []  # adjectives (nice, fast, cool, ...)

    # ratios
    ratio_ne_tokens, num_ne_tok = [], []
    ratio_noun_tokens = []
    ratio_propn_tokens = []
    ratio_all_noun_tokens = []

    # readability scores
    fk_gl_score = []
    fk_re_score = []
    dc_score = []

    with tqdm(total=len(dataframe)) as pbar:
        # TODO extract all of this code into its own module and have separate metadata generators for spaCy, nltk, etc.
        if backend == MetadataGeneratorBackend.SPACY:
            # init spacy TODO: download the required model(s)
            spacy_nlp = spacy.load(spacy_model)
            if readability_scores:
                spacy_nlp.add_pipe(Readability())
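                # NOTE: passing a component instance to add_pipe() is the
                # spaCy 2.x API (presumably the spacy_readability extension,
                # given the doc._ readability attributes read below); spaCy
                # 3.x adds pipeline components by registered name instead.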
            # TODO what's a good batch_size?
            for doc in spacy_nlp.pipe(dataframe['caption'].astype(str),
                                      n_process=n_spacy_workers):
                # num tokens
                num_tok.append(len(doc))

                # num sentences
                num_sent.append(len(list(doc.sents)))
                # min/max length of sentences
                min_len = 10000
                max_len = -1
                for s in doc.sents:
                    min_len = min(min_len, len(s))
                    max_len = max(max_len, len(s))
                min_sent_len.append(min_len)
                max_sent_len.append(max_len)

                # named entities
                num_ne.append(len(doc.ents))
                txt, typ = [], []
                for ent in doc.ents:
                    typ.append(ent.label_)
                    txt.append(ent.text)
                ne_texts.append(txt)
                ne_types.append(typ)

                # readability scores
                if readability_scores:
                    fk_gl_score.append(doc._.flesch_kincaid_grade_level)
                    fk_re_score.append(doc._.flesch_kincaid_reading_ease)
                    dc_score.append(doc._.dale_chall)

                # POS Tags
                if pos_tag_stats:
                    noun, propn, conj, verb, sym, num, adp, adj, ne_tok = 0, 0, 0, 0, 0, 0, 0, 0, 0
                    for t in doc:
                        if t.pos_ == 'CONJ':  # NOTE: newer spaCy/UD tagsets emit 'CCONJ' instead
                            conj += 1
                        elif t.pos_ == 'ADJ':
                            adj += 1
                        elif t.pos_ == 'NOUN':
                            noun += 1
                        elif t.pos_ == 'NUM':
                            num += 1
                        elif t.pos_ == 'PROPN':
                            propn += 1
                        elif t.pos_ == 'SYM':
                            sym += 1
                        elif t.pos_ == 'VERB':
                            verb += 1
                        elif t.pos_ == 'ADP':
                            adp += 1

                        # number of tokens associated with a NE (to compute the ratio)
                        if t.ent_iob_ == 'I' or t.ent_iob_ == 'B':
                            ne_tok += 1

                    num_noun.append(noun)
                    num_propn.append(propn)
                    num_conj.append(conj)
                    num_verb.append(verb)
                    num_sym.append(sym)
                    num_num.append(num)
                    num_adp.append(adp)
                    num_adj.append(adj)

                    num_ne_tok.append(ne_tok)

                pbar.update(1)
        elif backend == MetadataGeneratorBackend.NLTK:
            nltk.download('punkt')
            nltk.download('words')
            nltk.download('averaged_perceptron_tagger')
            nltk.download('universal_tagset')
            nltk.download('universal_treebanks_v20')
            nltk.download('maxent_ne_chunker')

            for cap in dataframe['caption'].astype(str):
                # num tokens
                num_tok.append(len(nltk.word_tokenize(cap)))

                # num sentences
                sents = nltk.sent_tokenize(cap)
                num_sent.append(len(sents))

                # min/max length of sentences
                min_len = 10000
                max_len = -1
                s_toks = []
                for s in sents:
                    toks = nltk.word_tokenize(s)
                    s_toks.append(toks)
                    min_len = min(min_len, len(toks))
                    max_len = max(max_len, len(toks))
                min_sent_len.append(min_len)
                max_sent_len.append(max_len)

                # readability scores
                # FIXME currently not usable with NLTK (because NaN values are dropped);
                #  there is also an error when calling the Readability(cap) ctor...
                if False:
                    try:
                        r = Readability(cap)
                        flesch = r.flesch_kincaid()
                        fk_gl_score.append(flesch.grade_level)
                        fk_re_score.append(flesch.score)
                        dc_score.append(r.dale_chall().score)
                    except (ReadabilityException, Exception):
                        fk_gl_score.append(np.NaN)
                        fk_re_score.append(np.NaN)
                        dc_score.append(np.NaN)

                if pos_tag_stats:
                    sent_pos_tags = nltk.pos_tag_sents(s_toks, 'universal')
                    noun, propn, conj, verb, sym, num, adp, adj, ne_tok = 0, 0, 0, 0, 0, 0, 0, 0, 0
                    for spt in sent_pos_tags:
                        for pt in spt:
                            if pt[1].upper() == 'CONJ':
                                conj += 1
                            elif pt[1].upper() == 'ADJ':
                                adj += 1
                            elif pt[1].upper() == 'NOUN':
                                noun += 1
                            elif pt[1].upper() == 'NUM':
                                num += 1
                            elif pt[1].upper() == 'PROPN':
                                propn += 1
                            elif pt[1].upper() == 'SYM':
                                sym += 1
                            elif pt[1].upper() == 'VERB':
                                verb += 1
                            elif pt[1].upper() == 'ADP':
                                adp += 1

                    num_noun.append(noun)
                    num_propn.append(propn)
                    num_conj.append(conj)
                    num_verb.append(verb)
                    num_sym.append(sym)
                    num_num.append(num)
                    num_adp.append(adp)
                    num_adj.append(adj)

                # named entities
                # we have to tag again with a different tag set (UPenn treebank) for WAY better NER performance
                num_nes, num_nes_tok = 0, 0
                txt, typ = [], []
                nes_sent = nltk.ne_chunk_sents(nltk.pos_tag_sents(map(nltk.word_tokenize, nltk.sent_tokenize(cap))))
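                # ne_chunk_sents yields one shallow tree per sentence: named
                # entities appear as nltk.Tree subtrees, all other tokens as
                # plain (word, tag) tuples; hence the isinstance check below.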
                for nes in nes_sent:
                    for ne in nes:
                        if isinstance(ne, nltk.Tree):
                            num_nes += 1
                            typ.append(str(ne.label()))
                            t = ""
                            for tok in ne:
                                t += tok[0] + " "
                            txt.append(t.strip())
                            num_nes_tok += len(ne)
                num_ne.append(num_nes)
                ne_texts.append(txt)
                ne_types.append(typ)
                num_ne_tok.append(num_nes_tok)

                pbar.update(1)
        elif backend == MetadataGeneratorBackend.POLYGLOT:
            # init
            # pandarallel.initialize()  # FIXME doesn't work..
            downloader.download("embeddings2.en")
            downloader.download("ner2.en")
            downloader.download("pos2.en")

            def __gen_polyglot_metadata_per_caption(df, pb):
                d = {
                    'num_tok': 0,
                    'num_sent': 0,
                    'min_sent_len': 0,
                    'max_sent_len': 0,
                    'num_ne': 0,
                    'ne_types': [],
                    'ne_texts': [],
                    'num_nouns': 0,
                    'num_propn': 0,
                    'num_conj': 0,
                    'num_verb': 0,
                    'num_sym': 0,
                    'num_num': 0,
                    'num_adp': 0,
                    'num_adj': 0,
                    'ratio_ne_tok': 0.,
                    'ratio_noun_tok': 0.,
                    'ratio_propn_tok': 0.,
                    'ratio_all_noun_tok': 0.,
                }
                try:
                    caption = str(df['caption']).encode('utf-8')
                    # https://github.com/aboSamoor/polyglot/issues/71
                    # removing "bad unicode" characters to avoid runtime exceptions
                    # caption = str(caption, encoding='utf-8')
                    caption = regex.sub(r"\p{C}", "", caption.decode('utf-8'))

                    pg = Text(caption, hint_language_code='en')
                    pg.language = 'en'
                    # num tokens
                    n_tok = len(pg.words)

                    # num sentences
                    n_sent = len(pg.sentences)

                    # min/max length of sentences
                    min_s_len = 10000
                    max_s_len = -1
                    for s in pg.sentences:
                        min_s_len = min(min_s_len, len(s.words))
                        max_s_len = max(max_s_len, len(s.words))
                    # readability scores
                    # FIXME only available with spacy currently

                    # POS tags
                    n_noun, n_propn, n_conj, n_verb, n_sym, n_num, n_adp, n_adj = 0, 0, 0, 0, 0, 0, 0, 0
                    for pos in pg.pos_tags:
                        if pos[1].upper() == 'CONJ':
                            n_conj += 1
                        elif pos[1].upper() == 'ADJ':
                            n_adj += 1
                        elif pos[1].upper() == 'NOUN':
                            n_noun += 1
                        elif pos[1].upper() == 'NUM':
                            n_num += 1
                        elif pos[1].upper() == 'PROPN':
                            n_propn += 1
                        elif pos[1].upper() == 'SYM':
                            n_sym += 1
                        elif pos[1].upper() == 'VERB':
                            n_verb += 1
                        elif pos[1].upper() == 'ADP':
                            n_adp += 1

                    # named entities
                    num_nes_tok, ne_txt, ne_typ = 0, [], []
                    num_nes = len(pg.entities)
                    for ne in pg.entities:
                        num_nes_tok += len(ne)
                        ne_txt.append(" ".join(ne))
                        ne_typ.append(ne.tag)

                    # compute the ratios
                    r_ne_tokens = num_nes_tok / n_tok
                    r_noun_tokens = n_noun / n_tok
                    r_propn_tokens = n_propn / n_tok
                    r_all_noun_tokens = (n_noun + n_propn) / n_tok
                    d = {
                        'num_tok': n_tok,
                        'num_sent': n_sent,
                        'min_sent_len': min_s_len,
                        'max_sent_len': max_s_len,
                        'num_ne': num_nes,
                        'ne_types': ne_typ,
                        'ne_texts': ne_txt,
                        'num_nouns': n_noun,
                        'num_propn': n_propn,
                        'num_conj': n_conj,
                        'num_verb': n_verb,
                        'num_sym': n_sym,
                        'num_num': n_num,
                        'num_adp': n_adp,
                        'num_adj': n_adj,
                        'ratio_ne_tok': r_ne_tokens,
                        'ratio_noun_tok': r_noun_tokens,
                        'ratio_propn_tok': r_propn_tokens,
                        'ratio_all_noun_tok': r_all_noun_tokens,
                    }
                except Exception as e:
                    logger.error(f"Critical error occurred with caption of WikiCaps ID {df['wikicaps_id']}!")
                    logger.error(str(e))
                    # fall through: the default dict d (all zeros) is returned for this row
                finally:
                    pb.update(1)
                return d

            # FIXME why the heck is this using ALL AVAILABLE CORES?!
            metadata = dataframe.apply(__gen_polyglot_metadata_per_caption, axis=1, result_type='expand', args=(pbar,))
            res = pd.concat([dataframe, metadata], axis=1)
            res = res.convert_dtypes()  # convert_dtypes returns a copy, so assign it back

            logger.info(f"Finished adding caption statistics in {time.time() - start} seconds!")
            return res

    # compute the ratios
    if pos_tag_stats:
        np_num_tok = np.array(num_tok)
        np_num_noun = np.array(num_noun)
        np_num_propn = np.array(num_propn)
        ratio_ne_tokens = (np.array(num_ne_tok) / np_num_tok)
        ratio_noun_tokens = (np_num_noun / np_num_tok)
        ratio_propn_tokens = (np_num_propn / np_num_tok)
        ratio_all_noun_tokens = ((np_num_noun + np_num_propn) / np_num_tok)

    res = dataframe.copy()

    # add stats as columns to df
    res['num_tok'] = num_tok

    res['num_sent'] = num_sent
    res['min_sent_len'] = min_sent_len
    res['max_sent_len'] = max_sent_len

    res['num_ne'] = num_ne
    res['ne_types'] = ne_types
    res['ne_texts'] = ne_texts

    if pos_tag_stats:
        res['num_nouns'] = num_noun
        res['num_propn'] = num_propn
        res['num_conj'] = num_conj
        res['num_verb'] = num_verb
        res['num_sym'] = num_sym
        res['num_num'] = num_num
        res['num_adp'] = num_adp
        res['num_adj'] = num_adj

        res['ratio_ne_tok'] = ratio_ne_tokens
        res['ratio_noun_tok'] = ratio_noun_tokens
        res['ratio_propn_tok'] = ratio_propn_tokens
        res['ratio_all_noun_tok'] = ratio_all_noun_tokens

    if readability_scores:
        res['fk_re_score'] = fk_re_score
        res['fk_gl_score'] = fk_gl_score
        res['dc_score'] = dc_score

    res = res.convert_dtypes()  # make sure that ints are not encoded as floats (convert_dtypes returns a copy)
    logger.info(f"Finished adding caption statistics in {time.time() - start} seconds!")

    return res
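
# A minimal usage sketch (hedged: MetadataGeneratorBackend is defined in the
# surrounding module; the spaCy branch above only requires a 'caption' column):
#
#     df = pd.DataFrame({'caption': [
#         'A dog runs across a green field near the river.',
#         'Two people sit on a bench in front of the town hall.',
#     ]})
#     stats = generate_caption_stats(df,
#                                    pos_tag_stats=True,
#                                    readability_scores=False,  # skip the spacy_readability dependency
#                                    backend=MetadataGeneratorBackend.SPACY,
#                                    n_spacy_workers=1)
#     print(stats[['num_tok', 'num_sent', 'num_ne']].head())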