def _redirect_if_needed(self, wbrequest, cdx):
    """Run the parent redirect check and remember in-archive redirects.

    The parent class's ``_redirect_if_needed`` is called first. If it
    produces a response whose ``Location`` header starts with
    ``wbrequest.wb_prefix``, the prefix is removed, the remainder is parsed
    as a ``WbUrl``, and a refer link from that target (timestamp and url)
    to the key of the currently requested page is stored through
    ``redis_client``.

    The parent's result is always returned unchanged (falsy when the
    parent produced no redirect).

    NOTE(review): the original layout of this method was lost; this
    assumes the link is recorded only when ``Location`` lies under the
    archive prefix -- confirm against the upstream file.
    """
    redirect = super(RedirectTrackReplayView, self)._redirect_if_needed(
        wbrequest, cdx)
    if not redirect:
        return redirect

    location = redirect.status_headers.get_header('Location')
    if not location:
        return redirect

    prefix = wbrequest.wb_prefix
    if not location.startswith(prefix):
        return redirect

    # Strip the archive prefix so the remainder can be parsed as a WbUrl.
    target = WbUrl(location[len(prefix):])
    source_key = redis_client.get_url_key(wbrequest.wb_url)
    redis_client.set_refer_link(target.timestamp, target.url, source_key)
    return redirect
def __call__(self, cdx, skip_hosts, cdx_loader, wbrequest):
    """Fetch one capture from its source archive and record it in redis.

    Parameters:
        cdx: mapping read for the keys ``'src_url'``, ``'timestamp'``,
            ``'url'`` and ``'sec'``.
        skip_hosts: container of archive hosts to skip; it is tested for
            membership here and handed on to ``self._do_req``.
        cdx_loader: not used by this method.
        wbrequest: the current request; its ``urlrewriter.rewrite_opts``
            are updated with ``'orig_src_url'`` and ``'archive_info'``.

    Returns:
        A ``(StatusAndHeaders, stream)`` tuple, where ``stream`` is the
        ``raw`` attribute of the fetched response.

    Raises:
        CaptureException: if the archive host is already in
            ``skip_hosts`` or if ``self._do_req`` returns ``None``.
    """
    src_url = cdx['src_url']
    archive_host = urlparse.urlsplit(src_url).netloc

    if archive_host in skip_hosts:
        raise CaptureException('Skipping already failed: ' + archive_host)

    # Prefer the archive's "unrewritten" url template when one is configured;
    # otherwise fetch the capture's src_url as-is. (An older, disabled
    # variant also tried an ``id_``-modified form of src_url first.)
    info = self.find_archive_info(archive_host)
    template = info['unrewritten_url'] if info else None
    if template:
        fetch_urls = [template.format(timestamp=cdx['timestamp'],
                                      url=cdx['url'])]
    else:
        fetch_urls = [src_url]

    rewrite_opts = wbrequest.urlrewriter.rewrite_opts
    rewrite_opts['orig_src_url'] = src_url
    rewrite_opts['archive_info'] = info

    self.session.cookies.clear()

    response = self._do_req(fetch_urls, archive_host, skip_hosts)
    if response is None:
        # The host is not added to skip_hosts here.
        raise CaptureException('Unsuccessful response, trying another')

    # Keep only the media type, dropping any ";charset=..." parameters.
    content_type = response.headers.get('content-type', 'unknown').split(';')[0]

    # For now the referrer is ignored for html so that followed links are
    # not treated as part of the same page; frames must be assembled on
    # the client side.
    referrer = None if 'text/html' in content_type else wbrequest.referrer

    is_embed = bool(referrer) and referrer.startswith(wbrequest.wb_prefix)
    if is_embed:
        page_url = WbUrl(referrer[len(wbrequest.wb_prefix):])
    else:
        page_url = wbrequest.wb_url

    page_key = redis_client.get_url_key(page_url)

    if is_embed:
        if self.is_embed_ref(page_url.url):
            # The referrer is itself an embed reference: follow the stored
            # link back to the page it was recorded for, if there is one.
            linked_ref = redis_client.get_orig_from_link(page_key)
            if linked_ref:
                page_url = WbUrl(linked_ref)
                page_key = redis_client.get_url_key(page_url)
        elif self.is_embed_ref(cdx['url']):
            redis_client.set_refer_link(wbrequest.wb_url.timestamp,
                                        cdx['url'],
                                        page_key)

    requested = wbrequest.wb_url
    capture_sec = str(cdx['sec'])

    # top page
    is_top_page = (not is_embed or
                   (requested.url == page_url.url and
                    requested.timestamp == page_url.timestamp))
    if is_top_page:
        redis_client.set_embed_entry(page_key, H_TARGET_SEC, capture_sec)
        # The lookup result is currently unused (storing the original
        # request timestamp from it is disabled); the call itself is kept.
        redis_client.get_orig_from_link(page_key)

    # src_url is unchanged since the first parse, so archive_host is the
    # netloc that a fresh urlsplit(src_url) would give.
    entry_field = ' '.join((archive_host, requested.timestamp, requested.url))
    redis_client.set_embed_entry(page_key,
                                 entry_field,
                                 ' '.join((capture_sec, content_type)))

    statusline = ' '.join((str(response.status_code), response.reason))
    header_items = response.headers.items()
    stream = response.raw
    return (StatusAndHeaders(statusline, header_items), stream)