Example #1
def add_article():
    url = request.vars.url
    board = cacher.get('board', long(request.vars.board))
    article = logic.get_article_by_url(url)

    if article is None:
        r = Readability()
        json = r.content(url)
        article = db.article.insert(
            url=json['url'],
            readability_url=json['short_url'],
            title=json['title'],
            #content=json['content'],
            domain=json['domain'],
            author=json['author'],
            excerpt=json['excerpt'],
            word_count=json['word_count'],
            total_pages=json['total_pages'],
            date_published=json['date_published'],
            next_page_id=json['next_page_id'],
            rendered_pages=json['rendered_pages'],            
        )
    
    pin = logic.add_pin(article, board)
    if request.vars.linkedin and request.env.http_host != '127.0.0.1:8080':
        logic.share_on_linkedin(session.linkedin, pin)

    return 'Success'
Example #2
def get_metrics(commit, framework, sample, sample_path, udb_path):
    metrics = get_understand_metrics(framework, sample, udb_path, sample_path)
    metrics = get_necessary_metrics(metrics)
    metrics = adding_commit_data(commit, metrics)
    r = Readability(sample)
    readability = r.getReadability()
    del r
    metrics.append(readability)
    return metrics
Example #3
def run_FKGL(output_dir):
    with open(output_dir) as f:
        output = f.readlines()
        output = [d.lower().strip() for d in output]

    output_final = " ".join(output)
    rd = Readability(output_final)
    score = rd.FleschKincaidGradeLevel()
    return score
Example #4
def get_article(id):
    article = cacher.get('article', id)
    if 'content' not in article:
        #fetch content from readability in real time
        r = Readability()
        json = r.content(article['url'])
        article['content'] = json['content'].encode('UTF8', 'replace')
        cacher.set(article)

    return article
Example #5
def main(argv):
    params = InputParamsHandler(argv)

    dom_doc = Network().get_dom_doc(params.get_url())
    reader = Readability(dom_doc)

    article = ""
    article += (reader.get_title() + "\n\n")
    article += reader.get_article()

    reader.save_content(params.get_dir_name(), params.get_article_name(), article)
Example #6
    def __init__(self, text):
        self.readability = Readability(text)
        self.FLESCH_KINCAID = ['score', 'grade_level']
        self.FLESCH_EASE = ['score', 'ease', 'grade_level']
        self.DALE_CHALL = ['score', 'grade_level']
        self.ARI = ['score', 'grade_level', 'ages']
        self.CLI = ['score', 'grade_level']
        self.GUNNING_FOG = ['score', 'grade_level']
        self.SMOG = ['score', 'grade_level']
        self.SPACHE = ['score', 'grade_level']
        self.LINSEAR_WRITE = ['score', 'grade_level']
        self.values_index = self.initialize_value_index_array()
Example #7
def show_stat(text):
    rd = Readability(text)
    print('Test text:')
    print('"%s"\n' % text)
    print('ARI: ', rd.ARI())
    print('FleschReadingEase: ', rd.FleschReadingEase())
    print('FleschKincaidGradeLevel: ', rd.FleschKincaidGradeLevel())
    print('GunningFogIndex: ', rd.GunningFogIndex())
    print('SMOGIndex: ', rd.SMOGIndex())
    print('ColemanLiauIndex: ', rd.ColemanLiauIndex())
    print('LIX: ', rd.LIX())
    print('RIX: ', rd.RIX())
Example #8
def get_fk_grade_level(text):
    # The text must contain at least 100 words
    if len(text.split()) < 100:
        result = "ERROR: This piece of text is too short to get a Flesch Kincaid grade level."
    else:
        # Instantiate a Readability object
        r = Readability(text)
        # Get the F-K score metric
        fk = r.flesch_kincaid()
        # Get the F-K grade level
        result = fk.grade_level
    return result
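
A minimal usage sketch for the helper above, assuming it targets the py-readability-metrics package, whose Readability raises a ReadabilityException for texts under 100 words (hence the manual length check). The sample string is illustrative:

# Hypothetical usage of get_fk_grade_level; assumes py-readability-metrics
# (from readability import Readability) and its NLTK 'punkt' data are installed.
sample = "The quick brown fox jumps over the lazy dog. " * 30  # well over 100 words
print(get_fk_grade_level(sample))        # prints the Flesch-Kincaid grade level
print(get_fk_grade_level("Too short."))  # prints the ERROR message instead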
Example #9
    def test_smog(self):
        text = """
        In linguistics, the Gunning fog index is a readability test for English writing. The index estimates the years of formal education a person needs to understand the text on the first reading. For instance, a fog index of 12 requires the reading level of a United States high school senior (around 18 years old). The test was developed in 1952 by Robert Gunning, an American businessman who had been involved in newspaper and textbook publishing.
        The fog index is commonly used to confirm that text can be read easily by the intended audience. Texts for a wide audience generally need a fog index less than 12. Texts requiring near-universal understanding generally need an index less than 8.
        """
        text = ' '.join(text for i in range(0, 5))

        readability = Readability(text)
        r = readability.smog()

        print(r)
        self.assertEqual(12.516099999999998, r.score)
        self.assertEqual('13', r.grade_level)
Example #10
def get_read_stats(text):
    read = {}
    # readability stats
    rd = Readability(text)
    read['ari'] = rd.ARI()
    read['flesch_reading_ease'] = rd.FleschReadingEase()
    read['flesch_kincaid_grade_level'] = rd.FleschKincaidGradeLevel()
    read['gunning_fog_index'] = rd.GunningFogIndex()
    read['smog_index'] = rd.SMOGIndex()
    read['coleman_liau_index'] = rd.ColemanLiauIndex()
    read['lix'] = rd.LIX()
    read['rix'] = rd.RIX()
    return read
Example #11
def analyse_document():
    document = sctxt.get(1.0, tk.END)
    read = Readability()
    sentence_count, word_count, syllable_count, index = \
        read.calculate_readability(document)

    global sentence_display
    sentence_display.set("Sentences: " + str(sentence_count))
    global word_display
    word_display.set("Words: " + str(word_count))
    global syllable_display
    syllable_display.set("Syllables: " + str(syllable_count))
    global index_display
    index_display.set("Index: " + "%6.2f" % index)
Example #12
def main():
    path = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    analysis = text_analysis()
    analysis.read_relation(path)
    #analysis.get_mintues()
    analysis.read_videoinfo(path)
    questions = analysis.gather_question()
    question = analysis.video_question
    #for item in question:
    #    print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title]: video_link', 'video_title_length', 'video_description', 'quizzes', 'video_youtube_link
    quizzes: quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    multiple-choices open-ended
    """
    scripts = analysis.gather_transcripts(path)
    stats_scripts(scripts)
    temp_dic = analysis.build_question_transcripts(path)

    #analysis.stats_scripts()
    temp = []
    for item in temp_dic:
        for quiz in temp_dic[item]['questions']:
            if quiz['question_type'] == 'multiple-choices':
                temp.append(temp_dic[item])
                break
    q = 0
    for d in temp:
        for question in d['questions']:
            xxx = len(question['quiz_description'].split('.'))
            q += xxx

    nlp = en_core_web_sm.load()
    #n_e=0
    total_r = 0
    n = 0
    for title in scripts:
        #sentences=scripts[title].split('\n')
        #e=NER(sentences,nlp)
        if len(scripts[title].split(' ')) >= 100:
            n += 1
            r = Readability(scripts[title])
            total_r += r.flesch().score
        #n_e+=e
    #print(n_e)
    print(total_r)
    print(n)
    print(total_r / n)
Example #13
def readability(id):
    r = {}
    text = getDocContent(id)
    #print text
    rd = Readability(text)

    r["ARI"] = rd.ARI()
    r["FleschReadingEase"] = rd.FleschReadingEase()
    r["FleschKincaidGradeLevel"] = rd.FleschKincaidGradeLevel()
    r["RIX"] = rd.RIX()
    r["GunningFogIndex"] = rd.GunningFogIndex()
    r["SMOGIndex"] = rd.SMOGIndex()
    r["ColemanLiauIndex"] = rd.ColemanLiauIndex()
    r["LIX"] = rd.LIX()

    return r
Example #14
    def process(self, **kwargs):
        pageContent = kwargs['pageContent']
        url = kwargs['pageBaseUri']

        readability = Readability(pageContent, url)

        return readability.content
Example #15
    def parse(self, response):
        news = Readability(str(response.body.decode('utf8'))).parse()
        if not news['title']:
            print("Could not find the title!", response.url)
        else:
            # get the category that was assigned [politics, economy, society, health, world, technology]
            news_category = response.meta.get('category', 'default')

            output = {**news, "ikon_category": news_category}
            pjoin = os.path.join
            file_path = pjoin('./corpuses_ikon', news_category)
            os.makedirs(file_path, exist_ok=True)
            with open(
                    pjoin(
                        file_path,
                        md5(news['title'].encode('utf-8')).hexdigest() +
                        ".json"), 'w') as outfile:
                json.dump(output, outfile, ensure_ascii=False)

        for next_page in response.xpath("//*[contains(@class, 'nlitem')]//a"):
            yield response.follow(
                next_page,
                self.parse,
                meta={'category': response.meta.get('category', 'default')})

        for next_page in response.xpath(
                "//*[contains(@class, 'ikon-right-dir')]/parent::a"):
            yield response.follow(
                next_page,
                self.parse,
                meta={'category': response.meta.get('category', 'default')})
Example #16
    def analyze_text_by_single_method(self, text: str, method: str, attribute: str) -> Any:
        """
        Given a string representing the text of a document to be considered, :func:`analyze_text_by_single_method`
        processes the text using a single chosen metric method.

        Parameters
        -----------
        text: `str`, required
            The input text to be processed

        method: `str`, required
            Method name

        attribute: `str`, required
            The attribute name (for example, score)

        Returns
        -----------
        The value of the requested attribute for the chosen metric, or "NA" if the metric is unavailable.
        """
        r = Readability(text)
        r_obj = getattr(r, method)()
        if r_obj is None:
            return "NA"
        else:
            return getattr(r_obj, attribute)
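
A hypothetical call to the method above; analyzer stands in for an instance of the enclosing class, and the method and attribute names follow the py-readability-metrics result objects:

# Hypothetical usage; analyzer is an instance of the enclosing class.
text = "..."  # any document long enough for the chosen metric
fk_score = analyzer.analyze_text_by_single_method(text, 'flesch_kincaid', 'score')
fk_grade = analyzer.analyze_text_by_single_method(text, 'flesch_kincaid', 'grade_level')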
Example #17
    def analyze_text_complexity(self, text: str) -> Dict[str, List[Any]]:
        """
        Given a string representing the text of a document to be considered, the :func:`analyze_text_complexity`
        computes several readability-related evaluations.

        Parameters
        -----------
        text: `str`, required
            The input text to be processed

        Returns
        -----------
        The grade levels and scores associated with each metric will be returned.
        """
        output = dict()
        r = Readability(text)
        for met in self.meta.keys():
            try:
                r_obj = getattr(r, met)()
            except Exception:
                r_obj = None
            for attr in self.meta[met]:
                key = "_".join([met, attr])
                if r_obj:
                    output[key] = [getattr(r_obj, attr)]
                else:
                    output[key] = ['NA']

        return output
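
The batch variant above keys its output as '<method>_<attribute>'. A sketch of what self.meta might contain and of the returned shape, assuming the attribute lists from the __init__ in Example #6:

# Hypothetical self.meta mapping, mirroring the attribute lists in Example #6.
meta = {
    'flesch_kincaid': ['score', 'grade_level'],
    'flesch': ['score', 'ease', 'grade_level'],
}
# analyze_text_complexity would then return keys such as:
# {'flesch_kincaid_score': [...], 'flesch_kincaid_grade_level': [...],
#  'flesch_score': [...], 'flesch_ease': [...], 'flesch_grade_level': [...]}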
Example #18
    def graph_readability(self):
        x = pd.to_datetime(self.dataframe.date)
        y = self.dataframe.speech.map(
            lambda u: Readability(u).ari().score)
        z = self.dataframe.speech.map(
            lambda u: Readability(u).flesch().score)
        t = self.dataframe.speech.map(
            lambda u: Readability(u).gunning_fog().score)
        plt.figure()
        plt.plot(x, y, label='Automated Readability Index')
        plt.plot(x, z, label='Flesch Reading Ease')
        plt.plot(x, t, label='Gunning Fog Index')
        plt.xlabel('Year')
        plt.xticks(rotation=90)
        plt.ylabel('Readability')
        plt.legend()
        plt.show()
Example #19
def getReadableArticle(url):
    res = requests.get(url)
    if res.status_code != requests.codes.ok:
        return None
    rawHtml = res.text
    article = Readability(rawHtml,url)
    # if article is not None:
    #     with open(url.split('/')[-1].split('?')[0]+'.html', 'w+') as out:
    #         out.write(article.content)
    return article
Example #20
def __dale_chall(r: Readability) -> float:
    try:
        lvls = r.dale_chall().grade_levels
        if 'college_graduate' in lvls:
            return 17
        elif 'college' in lvls:
            return 13
        else:
            return stat.mean([float(lvl) for lvl in lvls])
    except ReadabilityException:
        return None
Example #21
def check_readability(filename):
    with codecs.open(filename, 'r', 'utf8') as f:
        html = f.read()
    with codecs.open(filename.replace('.html', '.txt'), 'r', 'utf8') as f:
        text = f.read()

    parser = Readability(html)
    article_text = parser.article.get_text()
    rate = distance_rate(article_text, text)
    print(article_text)
    print('rate', rate)
    assert rate > 0.85
Example #22
    def test_smog(self):
        text = """
        “On a June day sometime in the early 1990s, encouraged by his friend and fellow economist Jörgen Weibull, Abhijit went swimming in the Baltic. He leaped in and instantly jumped out—he claims that his teeth continued to chatter for the next three days. In 2018, also in June, we went to the Baltic in Stockholm, several hundred miles farther north than the previous encounter. This time it was literally child’s play; our children frolicked in the water.
        Wherever we went in Sweden, the unusually warm weather was a topic of conversation. It was probably a portent of something everyone felt, but for the moment it was hard not to be quite delighted with the new opportunities for outdoor life it offered.”. 
        """
        text = ' '.join(text for i in range(0, 5))

        readability = Readability(text)

        #Test SMOG with 30 sentences
        r1 = readability.smog()

        #Test SMOG with all sentences
        r2 = readability.smog(all_sentences=True)

        print("all_sentences=False: %s ; all_sentences=True: %s" % (r1, r2))
        self.assertEqual(12.516099999999998, r1.score)
        self.assertEqual('13', r1.grade_level)

        self.assertEqual(12.785403640627713, r2.score)
        self.assertEqual('13', r2.grade_level)
Example #23
def __gunning_fog(r: Readability) -> float:
    try:
        lvl = r.gunning_fog().grade_level
        if lvl == 'college_graduate':
            return 17
        elif lvl == 'college':
            return 13
        elif lvl == 'na':
            return 0
        else:
            return float(lvl)
    except ReadabilityException:
        return None
Example #24
def getReadability():
    authorFileNames = os.listdir(directory)
    texts = []
    authors = []
    truth = {}
    quote = []
    sents = []

    for file in authorFileNames:
        if file.endswith(".xml"):
            te = gettext(file)
            te = te.encode('ascii', 'ignore').decode('ascii')
            texts.append(te)
            authors.append(file[:-4])
        else:
            fgh = open(directory + "/" + file, 'r')
            fg = fgh.read().split('\n')[:-1]
            for r in fg:
                df = r.split(':::')[1:]
                truth[r.split(':::')[0]] = df
            fgh.close()

    f = open('PANreadibility.csv', 'w')
    f.write(
        'ID,Gender,Age,ARI,FleschReadingEase,FleschKincaidGradeLevel,GunningFogIndex,SMOGIndex,ColemanLiauIndex,LIX,RIX\n'
    )
    for i in range(len(authors)):
        sf = texts[i]
        rd = Readability(sf.encode('ascii', 'ignore'))
        f.write(authors[i] + ',' + truth[authors[i]][0] + ',' +
                truth[authors[i]][1] + ',' + str(rd.ARI()) + ',' +
                str(rd.FleschReadingEase()) + ',' +
                str(rd.FleschKincaidGradeLevel()) + ',' +
                str(rd.GunningFogIndex()) + ',' + str(rd.SMOGIndex()) + ',' +
                str(rd.ColemanLiauIndex()) + ',' + str(rd.LIX()) + ',' +
                str(rd.RIX()) + '\n')

    f.close()
Example #25
    def doc_to_readability(doc_str) -> ArrayField:
        if len(doc_str) < 10:
            return ArrayField(np.zeros(7))
        str_to_read = doc_str
        try:
            while len(str_to_read.split()) < 150:
                str_to_read += " " + doc_str
            r = Readability(str_to_read)
            r_scores = [
                r.flesch_kincaid().score,
                r.flesch().score,
                r.gunning_fog().score,
                r.coleman_liau().score,
                r.dale_chall().score,
                r.ari().score,
                r.linsear_write().score
            ]
            return ArrayField(np.array(r_scores))
        except ReadabilityException:
            return ArrayField(np.zeros(7))
Example #26
    def run(self, book, **kwargs):
        doc = book.plaintext
        isbn = 'isbn' in book.metadata and book.metadata['isbn'][0]

        url = 'https://atlas-fab.lexile.com/free/books/' + str(isbn)

        headers = {'accept': 'application/json; version=1.0'}
        lexile = requests.get(url, headers=headers)
        # Checks whether a Lexile entry exists for this ISBN. If it doesn't,
        # the value remains 'None'; the same applies when the entry has no
        # age range, or when there is no ISBN at all.
        if lexile.status_code == 200:
            lexile_work = lexile.json()['data']['work']
            self.lexile_min_age = str(lexile_work['min_age'])
            self.lexile_max_age = str(lexile_work['max_age'])
        try:
            r = Readability(doc)
            fk = r.flesch_kincaid()
            s = r.smog()
            self.readability_fk_score = fk.score
            self.readability_s_score = s.score
        # If less than 100 words
        except ReadabilityException:
            pass
Example #27
def __calculate_sentences_median_grade_level(line: str) -> int:
    line = __fluff_line(line)
    r = Readability(line)
    grade_levels = [
        __ari(r),
        __coleman_liau(r),
        __dale_chall(r),
        __flesch_kincaid(r),
        __gunning_fog(r),
        __linsear_write(r),
        __smog(r),
        __spache(r)]
    grade_levels = [min(17, max(0, x)) for x in grade_levels if x is not None]
    if len(grade_levels) == 0:
        return None
    grade_level = stat.median(grade_levels)
    return round(grade_level)
Example #28
    def parse_article(self, response):
        news = Readability(str(response.body.decode('utf8'))).parse()
        if not news['title']:
            print("Could not find the title!", response.url)
        else:
            # get the category that was assigned [politics, economy, society, health, world, technology]
            news_category = response.meta.get('category', 'default')

            output = {
                **news,
                "ikon_category": news_category
            }
            pjoin = os.path.join
            file_path = pjoin('./corpuses_gogo', news_category)
            os.makedirs(file_path, exist_ok=True)
            with open(pjoin(file_path, md5(news['title'].encode('utf-8')).hexdigest()+".json"), 'w') as outfile:
                json.dump(output, outfile, ensure_ascii=False)
Example #29
def mapfunc(line, path_to_output):
    given_url = line.split(",")[2]

    if given_url.rsplit(".", 1)[-1] in ('pdf', 'jpg', 'jpeg', 'png'):
        print("FileFormatError,", given_url, file=sys.stderr)
        return

    try:
        htmlcode = urllib.request.urlopen(given_url).read().decode()
    except UnicodeDecodeError:
        print("UnicodeDecodeError,", given_url, file=sys.stderr)
        return
    except urllib.error.HTTPError:
        print("urllib.error.HTTPError,", given_url, file=sys.stderr)
        return
    except urllib.error.URLError:
        print("urllib.error.URLError,", given_url, file=sys.stderr)
        return
    except ConnectionResetError:
        print("ConnectionReseterror,", given_url, file=sys.stderr)
        return
    except ssl.CertificateError:
        print("ssl.CertificateError,", given_url, file=sys.stderr)
        return

    try:
        body_html = Readability(htmlcode, given_url).content
    except KeyError:
        print("KeyError,", given_url, file=sys.stderr)
        return

    body_removetag = bs4.BeautifulSoup(body_html, "lxml").text.replace(
        '\n', '').replace(',', '')

    row = [given_url, body_removetag]
    # row = [given_url, body_html, body_removetag]
    # print(",".join(row))

    with open(path_to_output, "a") as output:
        output.write(",".join(row) + "\n")
Example #30


#------------------
# Readability
#------------------

st.header('Readability')

# Context 
passage = st.text_area("Candidate Bible Passage (English)", value='', 
        max_chars=None, key='readability_passage')

# Calculate and display readability on demand; the metrics below require a
# passage of at least 100 words, so only compute once the button is clicked
if st.button('Assess Readability', key=None):
    r = Readability(passage)
    data = [
        ['Flesch-Kincaid Score', r.flesch_kincaid().score],
        ['Flesch Reading Ease', r.flesch().ease],
        ['Dale Chall Readability Score', r.dale_chall().score],
        ['Automated Readability Index Score', r.ari().score],
        ['Coleman Liau Index', r.coleman_liau().score],
        ['Gunning Fog', r.gunning_fog().score],
        ['Linsear Write', r.linsear_write().score],
        ['Spache Readability Formula', r.spache().score],
    ]
    df = pd.DataFrame(data, columns=['Readability Metric', 'Value'])
    st.write(df)
Example #31
    def readability(self, text):
        rd = Readability(text)
        fkg_score = rd.FleschKincaidGradeLevel()
        SMOG = rd.SMOGIndex()
        return fkg_score, SMOG
Example #32
print(cutup_pieces_list)

# Number of possible permutations (different orderings of the list entries
# without repetition): e.g. for A, B, C the permutations are ABC, ACB, BAC,
# BCA, CAB, CBA, i.e. 6 combinations for 3 columns/list entries.
num_combinations = math.factorial(num_columns)

"""Finding all cut-up permutations and storing them in a list."""
permutations_object = itertools.permutations(cutup_pieces_list)  # Find permutations of the list.
permutations_list = list(permutations_object)  # Create a list from the permutations.
permuted_strings_list = ["".join(tup) for tup in permutations_list]
#print(permuted_strings_list)

readability_index_list = []

for i in range(num_combinations):
    r = Readability(permuted_strings_list[i])
    dc = r.dale_chall()
    readability_index_list.append(dc.score)
    #print(dc.grade_levels)

# Find the index of the highest readability score to extract the most readable text.
index = readability_index_list.index(max(readability_index_list))
print(permuted_strings_list[index])
#    print(permuted_strings_list[i])
"""Dale Chall Readability
The Dale-Chall Formula is an accurate readability formula for the simple reason that it is based on the use of
familiar words, rather than syllable or letter counts. Reading tests show that readers usually find it easier 
to read, process and recall a passage if they find the words familiar."""
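
As a quick illustration of the formula described above, a minimal self-contained sketch assuming the py-readability-metrics package (the same dale_chall() call used in the loop); the passage is a placeholder:

# Minimal Dale-Chall sketch; assumes py-readability-metrics is installed
# and the text is at least 100 words long.
from readability import Readability

passage = "Plain familiar words repeated to pass the length check. " * 20
dc = Readability(passage).dale_chall()
print(dc.score)         # raw Dale-Chall score, driven by familiar-word counts
print(dc.grade_levels)  # matching grade bands, e.g. ['college']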


Example #33
def get_smog(text):
	txt = Readability(text)
	try:
		return txt.SMOGIndex()
	except ZeroDivisionError:
		return 0
Example #34
    def parseDocument(self, doc):
        doc = pq(doc)

        wrapparent = self.articleRule.wrapparent
        pageparent = self.articleRule.pageparent
        content_re = ""
        # sub-page URLs
        urls = []

        # text content of the article
        content = ""

        article = doc.find(wrapparent)
        # pages
        if pageparent:
            urls = self.parsePage(article, pageparent)
        # need title, tags
        extrarules = self.articleRule.extrarules

        # only articles have content
        # TODO: some features are still missing here
        if len(extrarules):
            for key, rule, fetch_all, page_type in extrarules:
                field = Field(name=key, rule=rule)
                value = getElementData(doc, rule, self.data["images"], fetch_all)

                self.data[field.get('name')] = field

                if self.is_article_content(field):
                    content_re = field.get("rule")
                    content = value
                elif self.is_gallery_content(field):
                    content_re = field.get("rule")
                    content = []
                    if isinstance(value, list):
                        content += value
                else:
                    field.value = value

        # fetch paginated content
        if len(urls) > 0 and content_re:
            for next_url in urls:
                next_page = Fetch(next_url, charset=self.seed["charset"], timeout=self.seed["timeout"]).read()
                if next_page is not None:
                    next_page = self._getContent(next_page, wrapparent, content_re)
                    if next_page:
                        if isinstance(content, list):
                            content.append(next_page)
                        else:
                            content += next_page

        if content and content_re:
            if isinstance(content, list):
                self.data['content'].value = content
                self.data['images'] += content
            else:
                content = Readability(content, self.url, self.articleRule.filters)
                images = content.getImages()

                self.data['content'].value = content.getContent()
                self.data['images'] += images