def handle_not_found(self, wbrequest, nfe):
    """Produce the parent handler's not-found response and log it for the referring page.

    The response is always the one built by the parent class's
    ``handle_not_found`` and is returned unchanged.  In addition, when the
    request is not a query and its referrer starts with
    ``wbrequest.wb_prefix``, an entry is stored through
    ``redis_client.set_embed_entry`` under the key derived from the
    referring URL:

    * entry name:  ``'MISSING '`` or ``'LIVE '`` + current timestamp +
      ``' '`` + the requested url
    * entry value: the timestamp converted by ``timestamp_to_sec`` +
      ``' text/html'``

    ``'MISSING '`` is used when the status code starts with ``'4'`` and
    ``self.skip_missing_count`` is falsy for the referring URL;
    ``'LIVE '`` is used when the status code starts with ``'2'``.  Any
    other outcome stores nothing.

    :param wbrequest: the current request (provides ``wb_url``,
        ``referrer`` and ``wb_prefix``)
    :param nfe: passed straight through to the parent implementation
        (presumably the not-found exception -- confirm against the caller)
    :return: the response built by the parent class
    """
    response = super(MementoHandler, self).handle_not_found(wbrequest, nfe)

    # Query requests are never recorded.
    if wbrequest.wb_url.is_query():
        return response

    # Only requests issued from one of our own replayed pages are recorded.
    referrer = wbrequest.referrer
    if not referrer:
        return response

    prefix = wbrequest.wb_prefix
    if not referrer.startswith(prefix):
        return response

    # The referring page, with the replay prefix stripped off.
    referring_url = WbUrl(referrer[len(prefix):])

    status = response.status_headers.get_statuscode()

    if status.startswith('4') and not self.skip_missing_count(referring_url):
        label = 'MISSING '
    elif status.startswith('2'):
        label = 'LIVE '
    else:
        # Nothing to record for any other outcome.
        return response

    page_key = redis_client.get_url_key(referring_url)

    now = timestamp_now()
    entry_name = label + now + ' ' + wbrequest.wb_url.url
    entry_value = ' '.join((str(timestamp_to_sec(now)), 'text/html'))

    redis_client.set_embed_entry(page_key, entry_name, entry_value)
    return response
def __call__(self, cdx, skip_hosts, cdx_loader, wbrequest):
    """Fetch the capture described by ``cdx`` from its source archive.

    Besides performing the fetch, the method stores the archive info on
    the request's url rewriter options and records the fetched resource
    through ``redis_client`` under the key of the page it belongs to.

    :param cdx: mapping read for the keys ``'src_url'``, ``'timestamp'``,
        ``'url'`` and ``'sec'``
    :param skip_hosts: container of archive hosts to skip; tested for
        membership and handed on to ``self._do_req``
    :param cdx_loader: not used by this method
    :param wbrequest: the current request (provides ``urlrewriter``,
        ``referrer``, ``wb_prefix`` and ``wb_url``)
    :return: ``(StatusAndHeaders, raw response stream)``
    :raises CaptureException: if the archive host is listed in
        ``skip_hosts`` or ``self._do_req`` returns ``None``
    """
    src_url = cdx['src_url']
    archive_host = urlparse.urlsplit(src_url).netloc

    if archive_host in skip_hosts:
        raise CaptureException('Skipping already failed: ' + archive_host)

    info = self.find_archive_info(archive_host)

    # Use the archive's own url template when it provides one,
    # otherwise fall back to the source url as-is.
    if info and info['unrewritten_url']:
        candidate = info['unrewritten_url'].format(timestamp=cdx['timestamp'],
                                                   url=cdx['url'])
    else:
        candidate = src_url
    try_urls = [candidate]

    rewrite_opts = wbrequest.urlrewriter.rewrite_opts
    rewrite_opts['orig_src_url'] = cdx['src_url']
    rewrite_opts['archive_info'] = info

    # Start every fetch with an empty cookie jar.
    self.session.cookies.clear()

    response = self._do_req(try_urls, archive_host, skip_hosts)
    if response is None:
        raise CaptureException('Unsuccessful response, trying another')

    # Media type only, without parameters such as the charset.
    content_type = response.headers.get('content-type', 'unknown').split(';')[0]

    # For now the referrer is ignored for html so that followed links are
    # not treated as part of the same page; frames have to be assembled
    # on the client side.
    referrer = None if 'text/html' in content_type else wbrequest.referrer

    is_embed = bool(referrer) and referrer.startswith(wbrequest.wb_prefix)
    if is_embed:
        # The page this resource belongs to, with the replay prefix removed.
        wb_url = WbUrl(referrer[len(wbrequest.wb_prefix):])
    else:
        wb_url = wbrequest.wb_url

    page_key = redis_client.get_url_key(wb_url)

    if is_embed:
        if self.is_embed_ref(wb_url.url):
            # The referrer is itself an embed reference: switch to the
            # original page it was linked from, when one is stored.
            orig_ref = redis_client.get_orig_from_link(page_key)
            if orig_ref:
                wb_url = WbUrl(orig_ref)
                page_key = redis_client.get_url_key(wb_url)
        elif self.is_embed_ref(cdx['url']):
            redis_client.set_refer_link(wbrequest.wb_url.timestamp,
                                        cdx['url'],
                                        page_key)

    request_url = wbrequest.wb_url

    # Top page: not an embed, or the request targets the page itself.
    is_top_page = (not is_embed or
                   (request_url.url == wb_url.url and
                    request_url.timestamp == wb_url.timestamp))
    if is_top_page:
        redis_client.set_embed_entry(page_key, H_TARGET_SEC, str(cdx['sec']))
        # The result of this lookup is currently unused; the call is kept
        # so the sequence of redis_client calls stays unchanged.
        redis_client.get_orig_from_link(page_key)

    # Record this resource under its page key.
    entry_name = (archive_host + ' ' +
                  request_url.timestamp + ' ' +
                  request_url.url)
    entry_value = str(cdx['sec']) + ' ' + content_type
    redis_client.set_embed_entry(page_key, entry_name, entry_value)

    statusline = str(response.status_code) + ' ' + response.reason
    status_headers = StatusAndHeaders(statusline, response.headers.items())

    return (status_headers, response.raw)