Example #1
    def process(self, **kwargs):
        pageContent = kwargs['pageContent']
        url = kwargs['pageBaseUri']

        # Extract the main article content from the raw page HTML.
        readability = Readability(pageContent, url)

        return readability.content
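Example #2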
    def analyze_text_complexity(self, text: str) -> Dict[str, List[Any]]:
        """
        Given a string with the text of a document, :func:`analyze_text_complexity`
        computes several readability-related evaluations.

        Parameters
        ----------
        text: `str`, required
            The input text to be processed

        Returns
        -------
        A dictionary mapping each metric/attribute key to a one-element list
        with its value, or to ['NA'] when the metric could not be computed.
        """
        output = dict()
        r = Readability(text)
        for met in self.meta.keys():
            try:
                r_obj = getattr(r, met)()
            except Exception:
                r_obj = None
            for attr in self.meta[met]:
                key = "_".join([met, attr])
                if r_obj:
                    output[key] = [getattr(r_obj, attr)]
                else:
                    output[key] = ['NA']

        return output
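For context, a minimal driver sketch for the method above: the `self.meta` mapping is assumed to pair py-readability-metrics method names (as used elsewhere on this page) with the result attributes to collect.

# Assumed shape of the meta mapping: metric method name -> result attributes.
# NOTE: py-readability-metrics needs NLTK's 'punkt' tokenizer to be available.
from readability import Readability  # pip install py-readability-metrics

meta = {
    'flesch_kincaid': ['score', 'grade_level'],
    'ari': ['score', 'grade_levels'],
}

sample = "The quick brown fox jumps over the lazy dog. " * 25  # ~225 words
output = {}
r = Readability(sample)
for met, attrs in meta.items():
    try:
        result = getattr(r, met)()  # e.g. r.flesch_kincaid()
    except Exception:               # e.g. the text is too short to score
        result = None
    for attr in attrs:
        output["_".join([met, attr])] = [getattr(result, attr)] if result else ['NA']
print(output)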
Example #3
    def parse(self, response):
        news = Readability(response.body.decode('utf8')).parse()
        if not news['title']:
            print("Could not find the title!", response.url)
        else:
            # get the given category [politics, economy, society, health, world, technology]
            news_category = response.meta.get('category', 'default')

            output = {**news, "ikon_category": news_category}
            pjoin = os.path.join
            file_path = pjoin('./corpuses_ikon', news_category)
            os.makedirs(file_path, exist_ok=True)
            with open(
                    pjoin(
                        file_path,
                        md5(news['title'].encode('utf-8')).hexdigest() +
                        ".json"), 'w') as outfile:
                json.dump(output, outfile, ensure_ascii=False)

        for next_page in response.xpath("//*[contains(@class, 'nlitem')]//a"):
            yield response.follow(
                next_page,
                self.parse,
                meta={'category': response.meta.get('category', 'default')})

        for next_page in response.xpath(
                "//*[contains(@class, 'ikon-right-dir')]/parent::a"):
            yield response.follow(
                next_page,
                self.parse,
                meta={'category': response.meta.get('category', 'default')})
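Example #4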
    def analyze_text_by_single_method(self, text: str, method: str, attribute: str) -> Any:
        """
        Given a string with the text of a document, :func:`analyze_text_by_single_method`
        processes the text with a single chosen readability method.

        Parameters
        ----------
        text: `str`, required
            The input text to be processed

        method: `str`, required
            The method name (for example, flesch_kincaid)

        attribute: `str`, required
            The attribute name (for example, score)

        Returns
        -------
        The value of the requested attribute for the chosen method, or "NA"
        if the method produced no result.
        """
        r = Readability(text)
        r_obj = getattr(r, method)()
        if r_obj is None:
            return "NA"
        else:
            return getattr(r_obj, attribute)
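A hypothetical call, using the py-readability-metrics method and attribute names seen elsewhere on this page ('analyzer' stands in for an instance of the class this method belongs to):

score = analyzer.analyze_text_by_single_method(text, 'flesch_kincaid', 'score')

Example #5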
    def graph_readability(self):
        # Plot three readability metrics for each speech over time.
        x = pd.to_datetime(self.dataframe.date)
        y = self.dataframe.speech.map(
            lambda u: Readability(u).ari().score)
        z = self.dataframe.speech.map(
            lambda u: Readability(u).flesch().score)
        t = self.dataframe.speech.map(
            lambda u: Readability(u).gunning_fog().score)
        plt.figure()
        # Labels match the metrics actually computed above.
        plt.plot(x, y, label='Automated Readability Index')
        plt.plot(x, z, label='Flesch Reading Ease')
        plt.plot(x, t, label='Gunning Fog index')
        plt.xlabel('Year')
        plt.xticks(rotation=90)
        plt.ylabel('Readability')
        plt.legend()
        plt.show()
Example #6
def run_FKGL(output_dir):
    # Score the text in the given output file with Flesch-Kincaid Grade Level.
    with open(output_dir) as f:
        output = f.readlines()
        output = [d.lower().strip() for d in output]

    output_final = " ".join(output)
    rd = Readability(output_final)
    score = rd.FleschKincaidGradeLevel()
    return score
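Example #7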
def get_metrics(commit, framework, sample, sample_path, udb_path):
    metrics = get_understand_metrics(framework, sample, udb_path, sample_path)
    metrics = get_necessary_metrics(metrics)
    metrics = adding_commit_data(commit, metrics)
    r = Readability(sample)
    readability = r.getReadability()
    del r
    metrics.append(readability)
    return metrics
Example #8
def getReadableArticle(url):
    res = requests.get(url)
    if res.status_code != requests.codes.ok:
        return None
    rawHtml = res.text
    article = Readability(rawHtml, url)
    # if article is not None:
    #     with open(url.split('/')[-1].split('?')[0]+'.html', 'w+') as out:
    #         out.write(article.content)
    return article
Example #9
def check_readability(filename):
    with codecs.open(filename, 'r', 'utf8') as f:
        html = f.read()
    with codecs.open(filename.replace('.html', '.txt'), 'r', 'utf8') as f:
        text = f.read()

    parser = Readability(html)
    article_text = parser.article.get_text()
    rate = distance_rate(article_text, text)
    print(article_text)
    print('rate', rate)
    assert rate > 0.85
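Example #10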
    def __init__(self, text):
        self.readability = Readability(text)
        self.FLESCH_KINCAID = ['score', 'grade_level']
        self.FLESCH_EASE = ['score', 'ease', 'grade_level']
        self.DALE_CHALL = ['score', 'grade_level']
        self.ARI = ['score', 'grade_level', 'ages']
        self.CLI = ['score', 'grade_level']
        self.GUNNING_FOG = ['score', 'grade_level']
        self.SMOG = ['score', 'grade_level']
        self.SPACHE = ['score', 'grade_level']
        self.LINSEAR_WRITE = ['score', 'grade_level']
        self.values_index = self.initialize_value_index_array()
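Example #11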
def get_fk_grade_level(text):
    # The text must contain at least 100 words
    if len(text.split()) < 100:
        result = "ERROR: This piece of text is too short to get a Flesch Kincaid grade level."
    else:
        # Instantiate a Readability object
        r = Readability(text)
        # Get the F-K score metric
        fk = r.flesch_kincaid()
        # Get the F-K grade level
        result = fk.grade_level
    return result
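A quick illustration of both branches (the sample strings here are made up):

print(get_fk_grade_level("Too short to score."))
# -> "ERROR: This piece of text is too short to get a Flesch Kincaid grade level."
print(get_fk_grade_level("The quick brown fox jumps over the lazy dog. " * 15))
# -> a low grade-level string such as '2' (the exact value depends on the tokenizer)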
Example #12
def show_stat(text):
    rd = Readability(text)
    print('Test text:')
    print('"%s"\n' % text)
    print('ARI: ', rd.ARI())
    print('FleschReadingEase: ', rd.FleschReadingEase())
    print('FleschKincaidGradeLevel: ', rd.FleschKincaidGradeLevel())
    print('GunningFogIndex: ', rd.GunningFogIndex())
    print('SMOGIndex: ', rd.SMOGIndex())
    print('ColemanLiauIndex: ', rd.ColemanLiauIndex())
    print('LIX: ', rd.LIX())
    print('RIX: ', rd.RIX())
Example #13
def get_read_stats(text):
    read = {}
    # readability stats
    rd = Readability(text)
    read['ari'] = rd.ARI()
    read['flesch_reading_ease'] = rd.FleschReadingEase()
    read['flesch_kincaid_grade_level'] = rd.FleschKincaidGradeLevel()
    read['gunning_fog_index'] = rd.GunningFogIndex()
    read['smog_index'] = rd.SMOGIndex()
    read['coleman_liau_index'] = rd.ColemanLiauIndex()
    read['lix'] = rd.LIX()
    read['rix'] = rd.RIX()
    return read
Example #14
    def test_smog(self):
        text = """
        In linguistics, the Gunning fog index is a readability test for English writing. The index estimates the years of formal education a person needs to understand the text on the first reading. For instance, a fog index of 12 requires the reading level of a United States high school senior (around 18 years old). The test was developed in 1952 by Robert Gunning, an American businessman who had been involved in newspaper and textbook publishing.
        The fog index is commonly used to confirm that text can be read easily by the intended audience. Texts for a wide audience generally need a fog index less than 12. Texts requiring near-universal understanding generally need an index less than 8.
        """
        text = ' '.join(text for i in range(0, 5))

        readability = Readability(text)
        r = readability.smog()

        print(r)
        self.assertEqual(12.516099999999998, r.score)
        self.assertEqual('13', r.grade_level)
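Example #15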
def analyse_document():
    document = sctxt.get(1.0, tk.END)
    read = Readability()
    sentence_count, word_count, syllable_count, index = \
        read.calculate_readability(document)

    global sentence_display
    sentence_display.set("Sentences: " + str(sentence_count))
    global word_display
    word_display.set("Words: " + str(word_count))
    global syllable_display
    syllable_display.set("Syllables: " + str(syllable_count))
    global index_display
    index_display.set("Index: " + "%6.2f" % index)
Example #16
def main():
    path = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    analysis = text_analysis()
    analysis.read_relation(path)
    #analysis.get_mintues()
    analysis.read_videoinfo(path)
    questions = analysis.gather_question()
    question = analysis.video_question
    #for item in question:
    #    print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title] keys: 'video_link', 'video_title_length',
        'video_description', 'quizzes', 'video_youtube_link'
    quizzes keys: 'quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    question_type values: 'multiple-choices', 'open-ended'
    """
    scripts = analysis.gather_transcripts(path)
    stats_scripts(scripts)
    temp_dic = analysis.build_question_transcripts(path)

    #analysis.stats_scripts()
    temp = []
    for item in temp_dic:
        for quiz in temp_dic[item]['questions']:
            if quiz['question_type'] == 'multiple-choices':
                temp.append(temp_dic[item])
                break
    q = 0
    for d in temp:
        for question in d['questions']:
            q += len(question['quiz_description'].split('.'))

    nlp = en_core_web_sm.load()
    #n_e = 0
    total_r = 0
    n = 0
    for title in scripts:
        #sentences = scripts[title].split('\n')
        #e = NER(sentences, nlp)
        # Only score transcripts long enough for the readability library.
        if len(scripts[title].split(' ')) >= 100:
            n += 1
            r = Readability(scripts[title])
            total_r += r.flesch().score
        #n_e += e
    #print(n_e)
    print(total_r)
    print(n)
    print(total_r / n)
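Based on the structure sketched in the docstring inside main() (the field names are assumptions taken from that comment), a walk over the question data might look like:

# Hypothetical iteration over analysis.video_question as described above.
for title, entry in analysis.video_question.items():
    for quiz in entry['quizzes']:
        if quiz['question_type'] == 'multiple-choices':
            print(title, quiz['quiz_description'])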
Example #17
def readability(id):
    r = {}
    text = getDocContent(id)
    #print text
    rd = Readability(text)

    r["ARI"] = rd.ARI()
    r["FleschReadingEase"] = rd.FleschReadingEase()
    r["FleschKincaidGradeLevel"] = rd.FleschKincaidGradeLevel()
    r["RIX"] = rd.RIX()
    r["GunningFogIndex"] = rd.GunningFogIndex()
    r["SMOGIndex"] = rd.SMOGIndex()
    r["ColemanLiauIndex"] = rd.ColemanLiauIndex()
    r["LIX"] = rd.LIX()

    return r
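Example #18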
    def parse_article(self, response):
        news = Readability(response.body.decode('utf8')).parse()
        if not news['title']:
            print("Could not find the title!", response.url)
        else:
            # get the given category [politics, economy, society, health, world, technology]
            news_category = response.meta.get('category', 'default')

            output = {
                **news,
                "ikon_category": news_category
            }
            pjoin = os.path.join
            file_path = pjoin('./corpuses_gogo', news_category)
            os.makedirs(file_path, exist_ok=True)
            with open(pjoin(file_path, md5(news['title'].encode('utf-8')).hexdigest()+".json"), 'w') as outfile:
                json.dump(output, outfile, ensure_ascii=False)
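Example #19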
def __calculate_sentences_median_grade_level(line: str) -> Optional[int]:
    line = __fluff_line(line)
    r = Readability(line)
    grade_levels = [
        __ari(r),
        __coleman_liau(r),
        __dale_chall(r),
        __flesch_kincaid(r),
        __gunning_fog(r),
        __linsear_write(r),
        __smog(r),
        __spache(r)]
    # Clamp to the 0-17 range and drop metrics that failed.
    grade_levels = [min(17, max(0, x)) for x in grade_levels if x is not None]
    if not grade_levels:
        return None
    return round(stat.median(grade_levels))
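The per-metric helpers (__ari, __smog, ...) are not shown above; a plausible shape for one of them, assuming the py-readability-metrics API and its exception for texts that are too short:

from readability.exceptions import ReadabilityException  # assumed import path

def __ari(r):
    # Return ARI's grade-level score, or None when the text cannot be graded.
    try:
        return r.ari().score
    except ReadabilityException:
        return None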
Example #20
    def doc_to_readability(doc_str) -> ArrayField:
        if len(doc_str) < 10:
            return ArrayField(np.zeros(7))
        str_to_read = doc_str
        try:
            # Repeat the document until it clears the library's minimum
            # word count, then collect seven readability scores.
            while len(str_to_read.split()) < 150:
                str_to_read += " " + doc_str
            r = Readability(str_to_read)
            r_scores = [
                r.flesch_kincaid().score,
                r.flesch().score,
                r.gunning_fog().score,
                r.coleman_liau().score,
                r.dale_chall().score,
                r.ari().score,
                r.linsear_write().score
            ]
            return ArrayField(np.array(r_scores))
        except ReadabilityException:
            return ArrayField(np.zeros(7))
Example #21
    def test_smog(self):
        text = """
        “On a June day sometime in the early 1990s, encouraged by his friend and fellow economist Jörgen Weibull, Abhijit went swimming in the Baltic. He leaped in and instantly jumped out—he claims that his teeth continued to chatter for the next three days. In 2018, also in June, we went to the Baltic in Stockholm, several hundred miles farther north than the previous encounter. This time it was literally child’s play; our children frolicked in the water.
        Wherever we went in Sweden, the unusually warm weather was a topic of conversation. It was probably a portent of something everyone felt, but for the moment it was hard not to be quite delighted with the new opportunities for outdoor life it offered.”. 
        """
        text = ' '.join(text for i in range(0, 5))

        readability = Readability(text)

        #Test SMOG with 30 sentences
        r1 = readability.smog()

        #Test SMOG with all sentences
        r2 = readability.smog(all_sentences=True)

        print("all_sentences=False: %s ; all_sentences=True: %s" % (r1, r2))
        self.assertEqual(12.516099999999998, r1.score)
        self.assertEqual('13', r1.grade_level)

        self.assertEqual(12.785403640627713, r2.score)
        self.assertEqual('13', r2.grade_level)
Example #22
def mapfunc(line, path_to_output):
    given_url = line.split(",")[2]

    # Skip file formats the HTML extractor cannot handle.
    if given_url.rsplit(".", 1)[-1] in ('pdf', 'jpg', 'jpeg', 'png'):
        print("FileFormatError,", given_url, file=sys.stderr)
        return

    try:
        htmlcode = urllib.request.urlopen(given_url).read().decode()
    except UnicodeDecodeError:
        print("UnicodeDecodeError,", given_url, file=sys.stderr)
        return
    except urllib.error.HTTPError:
        print("urllib.error.HTTPError,", given_url, file=sys.stderr)
        return
    except urllib.error.URLError:
        print("urllib.error.URLError,", given_url, file=sys.stderr)
        return
    except ConnectionResetError:
        print("ConnectionReseterror,", given_url, file=sys.stderr)
        return
    except ssl.CertificateError:
        print("ssl.CertificateError,", given_url, file=sys.stderr)
        return

    try:
        body_html = Readability(htmlcode, given_url).content
    except KeyError:
        print("KeyError,", given_url, file=sys.stderr)
        return

    body_removetag = bs4.BeautifulSoup(body_html, "lxml").text.replace(
        '\n', '').replace(',', '')

    row = [given_url, body_removetag]
    # row = [given_url, body_html, body_removetag]
    # print(",".join(row))

    with open(path_to_output, "a") as output:
        output.write(",".join(row) + "\n")
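Example #23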
def getReadability():
    authorFileNames = os.listdir(directory)
    texts = []
    authors = []
    truth = {}
    quote = []
    sents = []

    for file in authorFileNames:
        if file.endswith(".xml"):
            te = gettext(file)
            te = te.encode('ascii', 'ignore')  # drop non-ASCII characters
            texts.append(te)
            authors.append(file[:-4])
        else:
            fgh = open(directory + "/" + file, 'r')
            fg = fgh.read().split('\n')[:-1]
            for r in fg:
                df = r.split(':::')[1:]
                truth[r.split(':::')[0]] = df
            fgh.close()

    f = open('PANreadibility.csv', 'w')
    f.write(
        'ID,Gender,Age,ARI,FleschReadingEase,FleschKincaidGradeLevel,GunningFogIndex,SMOGIndex,ColemanLiauIndex,LIX,RIX\n'
    )
    for i in range(len(authors)):
        sf = texts[i]
        rd = Readability(sf)  # texts were ASCII-encoded when read in
        f.write(authors[i] + ',' + truth[authors[i]][0] + ',' +
                truth[authors[i]][1] + ',' + str(rd.ARI()) + ',' +
                str(rd.FleschReadingEase()) + ',' +
                str(rd.FleschKincaidGradeLevel()) + ',' +
                str(rd.GunningFogIndex()) + ',' + str(rd.SMOGIndex()) + ',' +
                str(rd.ColemanLiauIndex()) + ',' + str(rd.LIX()) + ',' +
                str(rd.RIX()) + '\n')

    f.close()
Example #24
    def run(self, book, **kwargs):
        doc = book.plaintext
        isbn = book.metadata['isbn'][0] if 'isbn' in book.metadata else None

        url = 'https://atlas-fab.lexile.com/free/books/' + str(isbn)

        headers = {'accept': 'application/json; version=1.0'}
        lexile = requests.get(url, headers=headers)
        # Check whether a Lexile record exists for the ISBN. If there is no
        # record, no age range, or no ISBN at all, the values stay None.
        if lexile.status_code == 200:
            lexile_work = lexile.json()['data']['work']
            self.lexile_min_age = str(lexile_work['min_age'])
            self.lexile_max_age = str(lexile_work['max_age'])
        try:
            r = Readability(doc)
            fk = r.flesch_kincaid()
            s = r.smog()
            self.readability_fk_score = fk.score
            self.readability_s_score = s.score
        # If less than 100 words
        except ReadabilityException:
            pass
Example #25
        ))
    st.plotly_chart(fig)


#------------------
# Readability
#------------------

st.header('Readability')

# Context 
passage = st.text_area("Candidate Bible Passage (English)", value='', 
        max_chars=None, key='readability_passage')

# Calculate and display readability only when requested, so an empty
# text area does not trigger the library's minimum-length error.
if st.button('Assess Readability', key=None):
    r = Readability(passage)
    data = [
            ['Flesch-Kincaid Score', r.flesch_kincaid().score],
            ['Flesch Reading Ease', r.flesch().ease],
            ['Dale Chall Readability Score', r.dale_chall().score],
            ['Automated Readability Index Score', r.ari().score],
            ['Coleman Liau Index', r.coleman_liau().score],
            ['Gunning Fog', r.gunning_fog().score],
            ['Linsear Write', r.linsear_write().score],
            ['Spache Readability Formula', r.spache().score]
            ]
    df = pd.DataFrame(data, columns=['Readability Metric', 'Value'])
    st.write(df)
Example #26
    def readability(self, text):
        rd = Readability(text)
        fkg_score = rd.FleschKincaidGradeLevel()
        SMOG = rd.SMOGIndex()
        return fkg_score, SMOG
Example #27
print(cutup_pieces_list)

# Number of possible permutations (orderings of the list entries without
# repetition), e.g. for A, B, C: ABC, ACB, BAC, BCA, CAB, CBA = 3! = 6.
num_combinations = math.factorial(num_columns)

"""Find all cut-up permutations and store them in a list."""
permutations_object = itertools.permutations(cutup_pieces_list)  # permutations of the pieces
permutations_list = list(permutations_object)                    # materialize as a list
permuted_strings_list = ["".join(tup) for tup in permutations_list]
#print(permuted_strings_list)

readability_index_list = []

for i in range(num_combinations):
    r = Readability(permuted_strings_list[i])
    dc = r.dale_chall()
    readability_index_list.append(dc.score)
    #print(dc.grade_levels)

# Find the index of the permutation with the highest readability score
# and print the most readable text.
index = readability_index_list.index(max(readability_index_list))
print(permuted_strings_list[index])
#    print(permuted_strings_list[i])
"""Dale Chall Readability
The Dale-Chall Formula is an accurate readability formula for the simple reason that it is based on the use of
familiar words, rather than syllable or letter counts. Reading tests show that readers usually find it easier 
to read, process and recall a passage if they find the words familiar."""
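For reference, the (new) Dale-Chall raw score is commonly given as 0.1579 × (percentage of difficult words) + 0.0496 × (average sentence length in words), with 3.6365 added when more than 5% of the words are difficult; a word counts as difficult when it is not on the Dale-Chall list of roughly 3,000 familiar words.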


Example #28
def get_smog(text):
	txt = Readability(text)
	try:
		return txt.SMOGIndex()
	except ZeroDivisionError:
		return 0
Example #29
def get_readability(text):
	txt = Readability(text)
	return txt.FleschReadingEase()
Example #30
def fetch_Data_pushshift(Search_User,Search_Subs_List):
    Fetch_Data = {}
    _c_comment_lengths = {}
    _c_comment_texts = {}
    logger.debug("fetch_Data_pushshift user=%s" % Search_User)

    c_count = 0
    comments = get_author_comments_pushshift(author=Search_User,size=1000,sort='desc',sort_type='created_utc')
    for comment in comments:
        commentsub = comment['subreddit'].lower()
        if commentsub in Search_Subs_List:
            if commentsub not in Fetch_Data:
                Fetch_Data[commentsub] = {}
                Fetch_Data[commentsub]['c_karma'] = 0
                Fetch_Data[commentsub]['c_count'] = 0
                Fetch_Data[commentsub]['s_karma'] = 0
                Fetch_Data[commentsub]['s_count'] = 0
                _c_comment_texts[commentsub] = ""
                _c_comment_lengths[commentsub] = []

            Fetch_Data[commentsub]['c_karma'] += comment['score']
            Fetch_Data[commentsub]['c_count'] += 1
            # Comments arrive as dicts, so index by key rather than attribute.
            _c_comment_texts[commentsub] += comment['body']
            _c_length = len(comment['body'].split())
            _c_comment_lengths[commentsub].append(_c_length)

    s_count = 0
    submissions = get_author_submissions_pushshift(author=Search_User,size=1000,sort='desc',sort_type='created_utc')
    for submit in submissions:
        if 'subreddit' in submit:
            submitsub = submit['subreddit'].lower()
            if submitsub in Search_Subs_List:
                if submitsub not in Fetch_Data:
                    Fetch_Data[submitsub] = {}
                    Fetch_Data[submitsub]['c_karma'] = 0
                    Fetch_Data[submitsub]['c_count'] = 0
                    Fetch_Data[submitsub]['s_karma'] = 0
                    Fetch_Data[submitsub]['s_count'] = 0
                Fetch_Data[submitsub]['s_karma'] += submit['score']
                Fetch_Data[submitsub]['s_count'] += 1

    # Process comment data
    for sreddit in Fetch_Data:
        if sreddit in _c_comment_texts:
            words = nltk.word_tokenize(_c_comment_texts[sreddit])
            words = [word for word in words if len(word) > 3]
            words = [word.lower() for word in words]
            words = [word for word in words if word not in default_stopwords]
            words = [word for word in words if word not in string.punctuation]
        else:
            words = nltk.word_tokenize('')
        fdist = nltk.FreqDist(words)
        wordlist = []
        for topword, frequency in fdist.most_common(3):
            wordlist.append(topword)
        topwords = ', '.join(wordlist)
        Fetch_Data[sreddit]['top_words'] = topwords
        if sreddit in _c_comment_lengths:
            Fetch_Data[sreddit]['c_median_length'] = statistics.median(_c_comment_lengths[sreddit])
            if len(words) > 100:
                r = Readability(_c_comment_texts[sreddit])
                Fetch_Data[sreddit]['grade_level'] = r.ari().grade_levels[0]
            else:
                Fetch_Data[sreddit]['grade_level'] = ''
        else:
            Fetch_Data[sreddit]['c_median_length'] = 0
            Fetch_Data[sreddit]['grade_level'] = ''


    # mark other subs searched as empty
    for sreddit in Search_Subs_List:
        if sreddit not in Fetch_Data:
            Fetch_Data[sreddit] = {}
            Fetch_Data[sreddit]['c_karma'] = 0
            Fetch_Data[sreddit]['c_count'] = 0
            Fetch_Data[sreddit]['s_karma'] = 0
            Fetch_Data[sreddit]['s_count'] = 0
            Fetch_Data[sreddit]['c_median_length'] = 0
            Fetch_Data[sreddit]['top_words'] = ''
            Fetch_Data[sreddit]['grade_level'] = ''

    return Fetch_Data