def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
def htmlParser(htmlContent):
    """ An HTML parser for http://abc.net.au """
    doc = Document(htmlContent)
    title = doc.short_title()
    simple_html = doc.summary(True)
    simple_sel = Selector(text=simple_html)
    unclean_body = '\n'.join(simple_sel.xpath('//text()').extract())
    body = _clean_body(unclean_body)
    global_sel = Selector(text=htmlContent)
    time_published = _first_post_time(global_sel)
    author = _author(global_sel)
    keywords = gen_keywords(body)
    return {
        "title": title,
        "body": body,
        "author": author,
        "timePublished": time_published,
        "keywords": keywords
    }
def preliminary_parse(self):
    if not self.is_downloaded:
        raise Exception("not downloaded")
    try:
        d = Document(self.html)
        self._readability_title = d.short_title()
        self._readability_text = d.summary()
        logging.debug(u"readability title: {0}".format(
            repr(self._readability_title)))
        logging.debug(u"readability text: {0}".format(
            repr(self._readability_text)))
        if self._readability_title and self._readability_text:
            self.is_parsed = True
            return True
    except Exception as e:
        logging.warning("error while doing readability parse: {0}".format(
            str(e)))
        return False
    logging.debug("falling back to newspaper parse")
    self.newspaper_article.parse()
    logging.debug(u"newspaper title: {0}".format(
        repr(self._newspaper_title)))
    logging.debug(u"newspaper text: {0}".format(repr(
        self._newspaper_text)))
    self.is_parsed = True
    return True
def parse_story(self, response):
    doc = Document(response.text)
    story = NewsItem()
    story['url'] = response.url
    story['headline'] = doc.short_title()
    story['body'] = doc.summary()
    yield story
def html2text(url: str) -> str:
    request = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0'
        })
    html = urllib.request.urlopen(request).read()
    doc = Document(html)
    cleaned = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
    soup = BeautifulSoup(cleaned, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each (split on runs of two spaces)
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = "\n".join(chunk for chunk in chunks if chunk)
    return text
def parse_article(article_url):
    """Parse an online article."""
    source = requests.get(article_url, verify=True, timeout=2)
    urlparsed = urlparse(article_url)
    hostname = "{scheme}://{netloc}".format(scheme=urlparsed.scheme,
                                            netloc=urlparsed.netloc)
    doc = Document(source.text)
    content = bleach.clean(doc.summary(),
                           tags=ALLOWED_TAGS,
                           attributes=ALLOWED_ATTRS,
                           strip=True)
    soup = BeautifulSoup(content)
    # rewrite relative image and link URLs to absolute ones
    for img in soup.findAll("img"):
        if img["src"].startswith("http"):
            continue
        img["src"] = "{root}/{src}".format(root=hostname, src=img["src"])
    for link in soup.findAll("link"):
        if link["href"].startswith("http"):
            continue
        link["href"] = "{root}/{src}".format(root=hostname, src=link["href"])
    return {
        "title": doc.short_title(),
        "content": str(soup),
        "url": article_url
    }
def parse(self, response):
    doc = Document(response.text)
    yield {
        'url': response.url,
        'short_title': doc.short_title(),
        'summary': doc.summary(html_partial=True),
    }
    for next_page in response.css('a::attr("href")'):
        yield response.follow(next_page, self.parse)
def load_alt_engine(self):
    response = requests.get(self.url)
    doc = Document(response.text)
    txt = doc.summary()
    soup = BeautifulSoup(txt, 'html.parser')
    self.text = soup.get_text()
    self.html = response.text
    self.title = doc.short_title()
def html_filter(url, text):
    try:
        # readability
        doc = Document(text)
        title = doc.short_title()
        html_sum = doc.summary(html_partial=True)
        # pandoc
        try:
            try:
                doc = pf.convert_text(
                    html_sum,
                    input_format='html-native_divs-native_spans',
                    standalone=True)
            # strange error, pandoc and pypandoc can take this without problem.
            except OSError as e:
                print(
                    'Error {} from panflute encountered when processing {}, fallback to pypandoc.'
                    .format(e, url))
                doc = pf.convert_text(pypandoc.convert_text(
                    html_sum, 'json', 'html-native_divs-native_spans'),
                                      input_format='json',
                                      standalone=True)
            except JSONDecodeError as e:
                print(
                    'Error {} from panflute encountered when processing {}, fallback to pypandoc.'
                    .format(e, url))
                doc = pf.convert_text(
                    pypandoc.convert_text(html_sum, 'html',
                                          'html-native_divs-native_spans'),
                    input_format='html-native_divs-native_spans',
                    standalone=True)
            doc = pf.run_filters((increase_header_level, Image_to_Link),
                                 doc=doc)
            temp = pf.convert_text('''# {}\n\n[Source]({})'''.format(
                title, url))
            for item in temp[::-1]:
                doc.content.insert(0, item)
            return pf.convert_text(doc,
                                   input_format='panflute',
                                   output_format='html')
        except:
            print(
                'Cannot handle error from panflute, stop using pandoc filter on {}.'
                .format(url))
            return '<h1>{}</h1><p><a href="{}">Source</a></p>{}'.format(
                title, url, html_sum)
    except Exception as e:
        print('Cannot handle error {}. Skip processing {}.'.format(e, url))
        return ''
def parse_item(self, response):
    filename = hashlib.sha1(response.url.encode()).hexdigest()
    readability_document = Document(response.body, url=response.url)
    item = BeerReviewPage()
    item['url'] = response.url
    item['filename'] = filename
    item['depth'] = response.meta['depth']
    item['link_text'] = response.meta['link_text']
    item['title'] = readability_document.short_title()
    with open('data/' + filename + '.html', 'wb') as html_file:
        html_file.write(readability_document.content())
    print('(' + filename + ') ' + item['title'] + " : " + item['url'])
    return item
def extract_article(html, title=None):
    """
    Wraps around readability.Document and returns the article's title
    and content.
    """
    doc = Document(html, negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS)
    doc_title = doc.short_title()
    # invoke the summary method to invoke readability's magic
    doc.summary(html_partial=True)
    # obtain the article as an HtmlElement tree:
    html_tree = doc.html
    # clean up the article html:
    clean_html = cleanup(html_tree, doc_title)
    # check if the outer element is a tag from negative_keywords
    if elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS):
        bad_attr = True
    else:
        bad_attr = False
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # if so, redo extraction with min_text_length set to 0
        doc = Document(html,
                       negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                       min_text_length=0)
        doc_title = doc.short_title()
        # invoke the summary method to invoke readability's magic
        doc.summary(html_partial=True)
        # obtain the article as an HtmlElement tree:
        html_tree = doc.html
        # clean up the article html:
        clean_html = cleanup(html_tree, doc_title)
    content = elem_content_to_string(clean_html)
    if title:
        # if the extracted title is not a subset of the given title, use
        # the given title (b/c we assume this is more accurate, but
        # maybe with some unnecessary boilerplate).
        if doc_title not in title or doc_title == '':
            doc_title = title
    return doc_title, content
def HTMLparser(page, blog, url):
    title = None
    content = None
    author = None
    datePublished = None
    dateModified = None

    soup = BeautifulSoup(page, 'lxml')
    doc = Document(page)
    title = doc.short_title()
    content = BeautifulSoup(doc.summary(), 'lxml').get_text()

    try:
        application_json_ld = json.loads(
            soup.find('script', {
                'type': 'application/ld+json'
            }).get_text())
    except:
        application_json_ld = None

    if application_json_ld is not None:
        if 'author' in application_json_ld:
            if isinstance(application_json_ld['author'], list):
                author = application_json_ld['author'][0]['name']
            else:
                author = application_json_ld['author']['name']
        if 'datePublished' in application_json_ld:
            datestring = application_json_ld['datePublished']
            datePublished = parse(datestring)
        if 'dateModified' in application_json_ld:
            datestring = application_json_ld['dateModified']
            dateModified = parse(datestring)

    if blog == 'steemit':
        author = soup.find('a', {'class': 'ptc'}).get_text().split(" ")[0]
        datestring = soup.find('span', {'class': 'updated'})['title'].split()[0]
        datePublished = parse(datestring)

    if len(content) < 500:
        return None

    content = content.replace('\n', '')
    return Post(meta={'id': url},
                title=title,
                content=content,
                rawContent=content,
                author=author,
                datePublished=datePublished,
                dateModified=dateModified,
                url=url)
def extract_article_info(text):
    """
    Gets simplified page from the text
    Uses readability module
    """
    doc = Document(text)

    # safe fetch title
    title = doc.short_title()
    if not title:
        title = doc.title()

    # content
    content = doc.summary(html_partial=True)
    image = get_page_image(doc.content())

    # return
    return {'title': title, 'content': content, 'image': image}
def clean_html():
    s = Stallion()
    # a = s.extract("https://www.rtbasia.com/")
    # a = s.extract("http://www.dytt8.net/")
    a = s.extract("http://v.pptv.com/show/fbGeHITqWpj7eeE.html")

    # response = requests.get('http://lady.163.com/19/0111/10/E57V9GIV00267VA9.html')
    # response = requests.get('http://www.rtbchina.com/')
    # response = requests.get('http://guba.eastmoney.com/news,000611,173895506.html')

    doc = Document(a.raw_html)
    # doc = Document(response.text)
    # print(doc.content())
    print(doc.short_title())
    # print(doc.title())
    # print(doc.summary())

    h = html2text.HTML2Text()
    h.ignore_links = True
    print(h.handle(doc.summary()))
def preliminary_parse(self):
    if not self.is_downloaded:
        raise Exception("not downloaded")
    try:
        d = Document(self.html)
        self._readability_title = d.short_title()
        self._readability_text = d.summary()
        logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
        logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
        if self._readability_title and self._readability_text:
            return
    except Exception as e:
        logging.warning("error while doing readability parse: {0}".format(str(e)))
    logging.debug("falling back to newspaper parse")
    self.newspaper_article.parse()
    logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
    logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
def readability():
    import requests
    from readability import Document
    from bs4 import BeautifulSoup

    data = dict(default_data)
    data['message'] = "Article Extraction by Readability"
    data['params'] = {}
    data['error'] = ''
    data['readability'] = {}

    if request.method == 'GET':
        data['params']['url'] = request.args.get('url')
        if not data['params']['url']:
            data['error'] = '[url] parameter not found'
            return jsonify(data)
        response = requests.get(data['params']['url'])
        doc = Document(response.text)
    elif request.method == 'POST':
        params = request.form  # postdata
        if not params:
            data['error'] = 'Missing parameters'
            return jsonify(data)
        if not params['html']:
            data['error'] = 'html parameter not found'
            return jsonify(data)
        doc = Document(params['html'])

    data['readability']['title'] = doc.title()
    data['readability']['short_title'] = doc.short_title()
    # data['readability']['content'] = doc.content()
    data['readability']['article_html'] = doc.summary(html_partial=True)

    soup = BeautifulSoup(data['readability']['article_html'])
    data['readability']['text'] = soup.get_text()

    return jsonify(data)
def complement(self):
    for entry in self.entries:
        try:
            response = requests.get(entry.url, timeout=10)
        except requests.RequestException as excp:
            logger.warn('Exception requesting article %s: %s',
                        entry.url, excp.message)
            continue
        document = Document(response.content, url=response.url)

        # Image extraction first
        document._html()  # Trigger parsing
        images = document.html.xpath(
            '//meta[@property="og:image"]/@content')
        images += document.html.xpath(
            '//meta[@name="twitter:image:src"]/@content')

        # Content extraction second
        entry.url = response.url
        entry.image = (images or [''])[0]
        entry.title = document.short_title()
        entry.content = document.summary()
        yield entry
def resolve_article(self, args, context, info):
    query = Article.get_query(context)
    id = args.get("article_id")
    title = args.get("article_content")
    article = query.filter(
        or_(ArticleModel.object_id == id,
            ArticleModel.title.like("%title%"))).first()
    response = requests.get(article.url)
    doc = Document(response.text)
    texts = pq(response.text)('body').text()
    article.updated_date = int(
        calendar.timegm(datetime.datetime.utcnow().utctimetuple()))
    article.article_view_content = str(
        render_template('body_template.html',
                        article_content=doc.summary(True),
                        title=str(doc.short_title()),
                        article=str(doc.title()),
                        read_time=str(ReadingTime().estimate(texts, True)),
                        base_url=article.main_url,
                        article_url=article.url)) \
        .replace("\"", "'").replace("\n", "").replace("\t", "").replace("$", "$")
    return article
def extract(self, item):
    """Creates a readability Document and returns an ArticleCandidate containing article title and text.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    doc = Document(deepcopy(item['spider_response'].body))
    description = doc.summary()

    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name
    article_candidate.title = doc.short_title()
    article_candidate.description = description
    article_candidate.text = self._text(item)
    article_candidate.topimage = self._topimage(item)
    article_candidate.author = self._author(item)
    article_candidate.publish_date = self._publish_date(item)
    article_candidate.language = self._language(item)

    return article_candidate
def parse_web_page(text):
    """
    Generic web page parser with readability. Used as a fallback.

    :param text: unicode text
    :return: title, article
    :raise ParserException:
    """
    try:
        from readability import Document
        from readability.readability import Unparseable
    except ImportError:
        raise ParserException('readability is not installed')

    if not text:
        raise ParserException('No decoded text available, aborting!')

    try:
        doc = Document(text)
    except Unparseable as e:
        raise ParserException(e.message)
    else:
        return doc.short_title(), doc.summary(True)
def handle_data():
    def cleancap(raw_cap):
        # strip HTML tags from the raw caption text
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_cap)
        tmp = cleantext.split('\n')
        cap = list()
        pre = ''
        for line in tmp:
            if line.replace(' ', '') and line != pre:
                if '-->' in line:
                    cap.append('')
                else:
                    pre = line
                cap.append(line)
        tmp = set()
        for idx in range(len(cap)):
            if '-->' in cap[idx] and (idx >= len(cap) - 2 or '-->' in cap[idx + 2]):
                tmp.add(idx)
                tmp.add(idx + 1)
        final = list()
        for idx in range(len(cap)):
            if idx not in tmp:
                final.append(cap[idx])
        return '\n'.join(final)

    user_level = request.form['user_level']
    title = ''
    publish_date = ''
    text = request.form['text']

    if (text.startswith('http://www.youtube.com')
            or text.startswith('http://youtube.com')
            or text.startswith('http://youtu.be')
            or text.startswith('https://www.youtube.com')
            or text.startswith('https://youtube.com')
            or text.startswith('https://youtu.be')):
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'skip_download': True,  # We just want to extract the info
            'outtmpl': 'download/target'  # file_path/target
        }
        file = ''
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([text])
            dirPath = "download"
            fileList = os.listdir(dirPath)
            if 'target.en.vtt' in fileList:
                file = cleancap(open('download/target.en.vtt').read())
            else:
                file = 'There is no English subtitle in this video!'
            for fileName in fileList:
                if os.path.isfile(os.path.join(dirPath, fileName)):
                    os.remove(os.path.join(dirPath, fileName))
        v_id = text.split('=')[-1]
        content = [v_id, file]
        type_ = 'youtube'
        r = requests.get(text)
        if r.status_code < 400:
            title = BeautifulSoup(r.text, 'html.parser').find('title').text
            publish_date = BeautifulSoup(r.text, 'html.parser').find(
                'meta', itemprop="datePublished")['content']
    elif text.startswith('http://') or text.startswith('https://'):
        response = requests.get(text, headers=headers)
        doc = Document(remove_sometag(response.text))
        title = doc.short_title()
        publish_date = getPublishDate(response.content.decode('UTF-8'))
        content = doc.summary()
        type_ = 'url'
    else:
        content = text
        type_ = 'text'

    content = clean_content(content, type_)
    new, pure_text, vocab_dict = create_article(
        title, user_level, content, type_ == 'youtube',
        set(dictWord['V'].keys()), set(dictWord['N'].keys()),
        set(dictWord['ADJ'].keys()))
    store(pure_text, vocab_dict, user_level)
    return render_template('format.html', title=title,
                           publish_date=publish_date,
                           user_level=user_level, content=new)
def __init__(self, url, full_content=None, timeout=10):
    logger.info("HtmlContentExtractor.__init__: url=%s, full_content is None=%s",
                url, (full_content is None))

    # validate
    if not isinstance(url, str):
        raise RuntimeError("url not str.")
    if len(url) == 0:
        raise RuntimeError("len(url) == 0")
    if full_content is not None:
        if not isinstance(full_content, str):
            raise RuntimeError("full_content not str.")
        if len(full_content) == 0:
            raise ContentNoDataException(url)

    # Initialize instance variables
    self.url = url
    self.title = ""
    self.full_content = full_content
    self.content = ""
    self.simplified_content = ""
    self.summary_list = ""

    # Get html document
    if self.full_content is None:
        logger.debug("requests.get: start. url=%s", url)
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as ex:
            logger.warn("requests.get: fail. exception=%s", repr(ex))
            raise ContentRequestFailException(url)
        logger.debug("requests.get: end. status_code=%s, content_type=%s, len(full_content)=%s",
                     r.status_code, r.headers["content-type"], len(r.text))

        logger.debug("request result check: start.")
        if r.status_code == 404:
            raise ContentNotFoundException(url)
        if len(r.text) == 0:
            raise ContentNoDataException(url)
        logger.debug("request result check: end.")

        logger.debug("get full_content: start.")
        self.full_content = r.text
        logger.debug("get full_content: end. len(full_content)=%s", len(self.full_content))
    else:
        logger.debug("full_content not None")

    # Analyze html document
    ## Get extracted content
    logger.debug("extract content: start.")
    doc = Document(self.full_content)
    self.content = doc.summary()
    logger.debug("extract content: end. len(content)=%s", len(self.content))

    ## Get title
    logger.debug("get title: start.")
    self.title = doc.short_title()
    logger.debug("get title: end. title=%s", self.title)

    ## Get simplified content
    logger.debug("content simplify: start.")
    markdown_content = pypandoc.convert_text(
        self.content, "markdown_github", format="html",
        extra_args=["--normalize", "--no-wrap"])
    self.simplified_content = pypandoc.convert_text(
        markdown_content, "html", format="markdown_github",
        extra_args=["--email-obfuscation=none"])
    logger.debug("content simplify: end. len(simplified_content)=%s",
                 len(self.simplified_content))

    # Get summary
    logger.debug("summarize: start.")
    auto_abstractor = AutoAbstractor()
    abstractable_doc = AbstractableTopNRank()
    abstractable_doc.set_top_n(3)
    summary_list = auto_abstractor.summarize(
        self.simplified_content, abstractable_doc)["summarize_result"]
    self.summary_list = [
        pypandoc.convert_text(summary.strip(), "plain", format="html").strip()
        for summary in summary_list
    ]
    logger.debug("summarize: end. len(summary_list)=%s", len(self.summary_list))
class TitleExtractor(object):
    def __init__(self, html):
        self._html = html
        self._title = ''
        self._doc = Document(html)

    def clean_title(self, title):
        spliters = [' - ', '–', '—', '-', '|', '::']
        for s in spliters:
            if s not in title:
                continue
            tts = title.split(s)
            if len(tts) < 2:
                continue
            title = tts[0]
            break
        return title

    def get_title_method1(self):
        self._title = self._doc.short_title()

    def get_title_method2(self):
        # handle irregular titles on special-case websites
        if not self._title:
            regex = TITLE_RE
            self._title = get_info(self._html, regex, fetch_one=True)

    def get_title_method3(self):
        g = Goose()
        article = g.extract(raw_html=self._html)
        self._title = article.title

    def get_title_method4(self):
        doc = lxml.html.fromstring(self._html)
        title = ''
        title_el = doc.xpath('//title')
        if title_el:
            title = title_el[0].text_content().strip()
        if len(title) < 7:
            tt = doc.xpath('//meta[@name="title"]')
            if tt:
                title = tt[0].get('content', '')
        if len(title) < 7:
            tt = doc.xpath(
                '//*[contains(@id, "title") or contains(@class, "title")]')
            if not tt:
                tt = doc.xpath(
                    '//*[contains(@id, "font01") or contains(@class, "font01")]')
            for t in tt:
                ti = t.text_content().strip()
                if ti in title and len(ti) * 2 > len(title):
                    title = ti
                    break
                if len(ti) > 20:
                    continue
                if len(ti) > len(title) or len(ti) > 7:
                    title = ti
        self._title = title

    def get_title(self):
        self.get_title_method1()
        if not self._title:
            self.get_title_method2()
        if not self._title:
            self.get_title_method3()
        self._title = self.clean_title(self._title)
        return self._title
import urllib.request

from readability import Document

req = urllib.request.Request(
    'https://en.wikipedia.org/wiki/%22Hello,_World!%22_program',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15'
    })

with urllib.request.urlopen(req) as f:
    urllib_content = f.read()
    # print(urllib_content.decode("utf-8"))

doc = Document(urllib_content)
print(doc.title())
print(doc.short_title())
print(doc.summary())
import requests
from readability import Document
from pprint import pprint

response = requests.get('https://laravel-news.com/announcing-building-a-chatbot-with-laravel-and-botman')
doc = Document(response.text)

# API methods:
# .title()       -- full title
# .short_title() -- cleaned up title
# .content()     -- full content
# .summary()     -- cleaned up content

data = dict()
data['title'] = doc.title()
data['short_title'] = doc.short_title()
data['content'] = doc.content()
data['summary'] = doc.summary()

pprint(data)