def cached_page(site, url_path, spider_name='toc'):
    """Serve a page for *site*/*url_path* out of scrapy's HTTP cache.

    Args:
        site: base64-encoded site key (decoded, then looked up in SiteSchemas).
        url_path: base64-encoded path, appended to the site's base URL.
        spider_name: which spider's cache partition to read ('meta' or 'toc').

    Returns:
        A Flask response. For the TOC spider with an ``aid`` query arg the
        cached page is parsed and re-rendered as a link table; otherwise the
        cached body is decoded with the site's declared encoding and returned
        as-is. A plain "not found in cache" message is returned on a miss.

    Raises:
        Exception: if *spider_name* is not a supported spider.
    """
    handle_client_ip()
    site = base64.standard_b64decode(site.encode()).decode()
    url_path = base64.standard_b64decode(url_path.encode()).decode()
    url_site = SiteSchemas.get(site).get(SSK.URL)
    url = url_site + url_path
    origin_encoding = SiteSchemas.get(site).get(SSK.ENCODING, 'utf-8')
    aid = request.args.get('aid', default=None, type=int)

    # Deferred imports: keep scrapy/moltspider out of module import time.
    from moltspider.consts import Schemas
    from moltspider.parser import iter_items
    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings
    from scrapy.http.request import Request
    from scrapy.http.response.html import HtmlResponse
    from scrapy.utils.gz import gunzip
    from scrapy.downloadermiddlewares.httpcompression import ACCEPTED_ENCODINGS
    try:
        # brotli is optional: only needed when the cached body is br-encoded,
        # and the br branch below is guarded by ACCEPTED_ENCODINGS.
        import brotli
    except ImportError:
        pass
    import zlib

    settings = get_project_settings()
    storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
    body = None
    spider_req = Request(url)

    if spider_name == Spiders.META:
        from moltspider.spiders.meta import MetaSpider
        spider = MetaSpider()
        schema_name = Schemas.META_PAGE
    elif spider_name == Spiders.TOC:
        from moltspider.spiders.toc import TocSpider
        # Instantiate, consistent with MetaSpider above (was the bare class).
        spider = TocSpider()
        schema_name = Schemas.TOC_PAGE
    else:
        raise Exception('No support for spider "%s"\'s cache page' % spider_name)

    cachedresponse = storage.retrieve_response(spider, spider_req)
    if cachedresponse:
        # Start from the raw cached body, then decompress per Content-Encoding.
        # (Fix: deflate/br responses previously decompressed None, and a
        # response with no Content-Encoding was reported as a cache miss.)
        body = cachedresponse.body
        content_encoding = cachedresponse.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            if encoding == b'gzip' or encoding == b'x-gzip':
                body = gunzip(body)
            if encoding == b'deflate':
                try:
                    body = zlib.decompress(body)
                except zlib.error:
                    # ugly hack to work with raw deflate content that may
                    # be sent by microsoft servers. For more information, see:
                    # http://carsten.codimi.de/gzip.yaws/
                    # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                    # http://www.gzip.org/zlib/zlib_faq.html#faq38
                    body = zlib.decompress(body, -15)
            if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
                body = brotli.decompress(body)

    if body:
        if spider_name == Spiders.TOC and aid:
            # Re-parse the cached TOC page and render chapter links as a
            # 4-column HTML table.
            sb = []
            colspan = 4
            i = 0
            scrapy_resp = HtmlResponse(url)
            scrapy_resp = scrapy_resp.replace(body=body, encoding=origin_encoding)
            sb.append('<table width="1000px" align="center"><tr>')
            for item in iter_items(spider, scrapy_resp, [site, ], schema_name):
                if i % colspan == 0:
                    sb.append('</tr><tr>')
                # Temporary key so the %-format below can prepend the site URL.
                item['_'] = url_site
                sb.append('<td><a href="%(_)s%(url)s">%(name)s</a></td>' % item)
                del item['_']
                i += 1
            sb.append('</tr></table>')
            body = '\n'.join(sb)
            body = render_template_string(template_page, content=body)
        else:
            body = body.decode(encoding=origin_encoding)
    else:
        body = '%s (%s) not found in cache.' % (url, origin_encoding)

    resp = make_response(body)
    resp.headers['Content-Type'] = 'text/html; charset=utf-8'
    return resp