Example #1
def extract(page):
    url = page.content_url
    content = Content(url=url, source=NATIVE)
    logging.info("fetching %r with native extractor" % (url,))
    body = page.raw_content
    try:
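        # Parse the fetched HTML and pull the readable body and title out of it via page_parser.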
        soup = page_parser.parse(body, base_href=page.base_href, notify=logging.info)
        content.body = page_parser.get_body(soup)
        content.title = page_parser.get_title(soup)
    except StandardError as e:
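        # A parse failure will not succeed on retry, so mark the deferred task as permanently failed.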
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
Example #2
def extract(page):
	url = page.content_url
	content = Content(url=url, source=VIEWTEXT)

	viewtext_url = "http://viewtext.org/api/text?url=%(url)s&format=json&rl=false" % {'url': urllib.quote(url)}
	logging.debug("fetching: %s with viewtext extractor" % (viewtext_url,))
	response = fetch(viewtext_url, allow_truncated=False, deadline=20)
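	# An error status from viewtext aborts this attempt with a DownloadError, leaving the task eligible for retry.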
	if response.status_code >= 400:
		logging.warning("request returned status code %s\n%s" % (response.status_code, response.content))
		raise DownloadError("request returned status code %s" % (response.status_code,))

	response = json.loads(response.content)
	logging.info("got JSON response with keys: %s" % (response.keys(),))

	try:
		content.body = response['content']
		content.title = response['title']
	except KeyError as e:
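		# A response missing 'content' or 'title' will not improve on retry, so fail the task permanently.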
		raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))