def extract(page):
    """Build a Content record for *page* using the native page parser.

    Args:
        page: object exposing ``content_url``, ``raw_content`` and
            ``base_href``.

    Returns:
        The ``Content`` instance (source ``NATIVE``) with ``body`` and
        ``title`` filled in from the parsed markup.

    Raises:
        deferred.PermanentTaskFailure: if parsing, or extracting the body or
            title, raises a ``StandardError``.
    """
    url = page.content_url
    content = Content(url=url, source=NATIVE)
    logging.info("fetching %r with native extractor", url)
    body = page.raw_content
    try:
        soup = page_parser.parse(
            body, base_href=page.base_href, notify=logging.info)
        content.body = page_parser.get_body(soup)
        content.title = page_parser.get_title(soup)
    except StandardError as e:
        # A parse failure is reported as permanent: App Engine's deferred
        # library does not retry a task that raises PermanentTaskFailure.
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
    # Hand the populated record back; previously it was built and discarded.
    return content
def extract(page):
    """Build a Content record for *page* via the viewtext.org text API.

    Args:
        page: object exposing ``content_url``.

    Returns:
        The ``Content`` instance (source ``VIEWTEXT``) with ``body`` and
        ``title`` taken from the ``content`` and ``title`` keys of the JSON
        response.

    Raises:
        DownloadError: if the API responds with a status code >= 400.
        deferred.PermanentTaskFailure: if the decoded JSON lacks the
            ``content`` or ``title`` key.
    """
    url = page.content_url
    content = Content(url=url, source=VIEWTEXT)
    viewtext_url = (
        "http://viewtext.org/api/text?url=%(url)s&format=json&rl=false"
        % {'url': urllib.quote(url)}
    )
    logging.debug("fetching: %s with viewtext extractor", viewtext_url)

    response = fetch(viewtext_url, allow_truncated=False, deadline=20)
    if response.status_code >= 400:
        logging.warning(
            "request returned status code %s\n%s",
            response.status_code, response.content)
        raise DownloadError(
            "request returned status code %s" % (response.status_code,))

    # Keep the decoded JSON under its own name instead of rebinding
    # ``response``, so the HTTP response and its payload stay distinct.
    payload = json.loads(response.content)
    logging.info("got JSON response with keys: %s", payload.keys())
    try:
        content.body = payload['content']
        content.title = payload['title']
    except KeyError as e:
        # A response missing the expected keys is reported as permanent:
        # App Engine's deferred library does not retry a task that raises
        # PermanentTaskFailure.
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
    # Hand the populated record back; previously it was built and discarded.
    return content