def strip_article(url):
    """Fetch *url* and print the article content extracted by readability.

    The response body is read in full, decoded as UTF-8 and passed to
    ``readability.Readability`` together with the URL. The ``content``
    attribute of the resulting object is printed to standard output.

    Args:
        url: Address of the page to download.

    Returns:
        None. The extracted content is printed, not returned.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    response = urllib2.urlopen(url)
    try:
        raw = response.read()
    finally:
        # Always release the connection, even when the read fails.
        response.close()

    article = readability.Readability(raw.decode('utf-8'), url)
    print(article.content)
def get_html(url):
    """Download *url*, extract its article and save it under ./coolshell/.

    The page is fetched through the module-level ``s`` object using the
    module-level ``headers``. The article title (with path separators
    replaced by underscores) becomes the file name, and the article body is
    re-encoded as GBK before being handed to ``save_file``.

    Args:
        url: Address of the page to download.

    Returns:
        None. The URL and the target file name are printed as progress
        output, and the result is written through ``save_file``.
    """
    print(url)

    # NOTE(review): if `s` is a requests session, verify=False presumably
    # disables TLS certificate verification -- confirm this is intended.
    page = s.get(url, headers=headers, verify=False).text
    page = page.encode("utf-8")
    html = readability.Readability(page, url)

    # The title comes from the remote page. A path separator inside it
    # would make the file name point outside ./coolshell/ or into a
    # directory that does not exist, so neutralise separators first.
    title = html.getArticleTitle().replace("/", "_").replace("\\", "_")
    filename = "./coolshell/" + title.encode("utf-8") + ".html"
    print(filename)

    content = html.grabArticle().encode("utf-8")
    # Re-encode as GBK; "ignore" drops characters GBK cannot represent.
    content = content.decode("utf-8").encode("gbk", "ignore")
    save_file(filename, content)
def LIXscore(text):
    """Return the LIX score of *text* as a float.

    Builds a ``readability.Readability`` object from *text* and converts
    the result of its ``LIX()`` method to ``float``.
    """
    analyzer = readability.Readability(text)
    return float(analyzer.LIX())
def ARIscore(text):
    """Return the ARI score of *text* as a float.

    Builds a ``readability.Readability`` object from *text* and converts
    the result of its ``ARI()`` method to ``float``.
    """
    analyzer = readability.Readability(text)
    return float(analyzer.ARI())
def computeRead(text):
    """Return the Flesch-Kincaid grade level of *text* as an int.

    Builds a ``readability.Readability`` object from *text* and converts
    the result of its ``FleschKincaidGradeLevel()`` method with ``int``,
    which discards any fractional part.
    """
    analyzer = readability.Readability(text)
    grade = analyzer.FleschKincaidGradeLevel()
    return int(grade)