Example #1
0
    def _do_request(self, method, load_url, data, req_headers, params,
                    is_live):
        """
        Issue a single upstream request and return the raw response.

        The live adapter is used when ``is_live`` is set, otherwise the
        remote adapter. If ``SOCKS_PROXIES`` is set, the connection is
        obtained from the adapter for that proxy config; otherwise the
        adapter's pool manager is used.

        The request is sent without following redirects, and the response
        body is neither preloaded nor decoded.

        :param method: HTTP method to send
        :param load_url: url to request
        :param data: request body
        :param req_headers: request headers
        :param params: request params; ``_timeout`` is used as the timeout
        :param is_live: selects the live adapter when true
        :raises LiveResourceException: if the request fails for any reason
        """
        if is_live:
            adapter = DefaultAdapters.live_adapter
        else:
            adapter = DefaultAdapters.remote_adapter

        retry_policy = adapter.max_retries

        connection = (adapter.get_connection(load_url, SOCKS_PROXIES)
                      if SOCKS_PROXIES else adapter.poolmanager)

        try:
            request_args = dict(method=method,
                                url=load_url,
                                body=data,
                                headers=req_headers,
                                redirect=False,
                                assert_same_host=False,
                                preload_content=False,
                                decode_content=False,
                                retries=retry_policy,
                                timeout=params.get('_timeout'))

            return connection.urlopen(**request_args)

        except Exception as e:
            # only emit failure details when debug logging is on
            if logger.isEnabledFor(logging.DEBUG):
                import traceback
                traceback.print_exc()
                failure = 'FAILED: ' + method + ' ' + load_url + ': ' + str(e)
                logger.debug(failure)

            raise LiveResourceException(load_url)
Example #2
0
    def raise_on_self_redirect(self, params, cdx, status_code, location_url):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop

        Live captures, non-3xx responses, 304 responses and responses with
        no redirect location are never rejected.

        The requested url and the redirect target are compared
        case-insensitively, ignoring the scheme and trailing slashes.
        A redirect target starting with a single ``/`` is resolved against
        the host of ``cdx['url']``; one starting with ``//`` already carries
        its own host.

        :param params: request params, ``params['url']`` is the requested url
        :param cdx: index entry of the capture being checked
        :param status_code: response status code, as a string
        :param location_url: redirect target of the response, may be empty
        :raises LiveResourceException: if the redirect points back at the
            requested url
        """
        if cdx.get('is_live'):
            return

        if not status_code.startswith('3') or status_code == '304':
            return

        request_url = params['url'].lower()
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('//'):
            # network-path reference (//host/path): the host is already
            # part of the location, so only drop the leading slashes
            location_url = location_url[2:]
        elif location_url.startswith('/'):
            # absolute-path reference: resolve against the capture host,
            # lowercased so it can match the lowercased request url
            host = urlsplit(cdx['url']).netloc.lower()
            location_url = host + location_url

        # compare without scheme and without trailing slashes
        location_url = location_url.split('://', 1)[-1].rstrip('/')
        request_url = request_url.split('://', 1)[-1].rstrip('/')

        if request_url == location_url:
            msg = 'Self Redirect {0} -> {1}'
            msg = msg.format(request_url, location_url)
            raise LiveResourceException(msg)
Example #3
0
    def handle_request(self, wbrequest):
        """
        Handle a request for a url from the live web.

        Query requests are answered with a redirect to the latest-replay
        form of the url. Everything else is rendered via
        ``render_content``; if rendering fails for any reason, the
        traceback is printed and a ``LiveResourceException`` for the
        requested url is raised instead.
        """
        query_url = wbrequest.wb_url
        if query_url.is_query():
            redirect_to = wbrequest.urlrewriter.get_new_url(
                type=query_url.LATEST_REPLAY, timestamp='')
            return WbResponse.redir_response(redirect_to)

        try:
            return self.render_content(wbrequest)

        except Exception:
            import traceback
            print(traceback.format_exc())

            failed_url = wbrequest.wb_url.url
            raise LiveResourceException(
                msg='Could not load the url from the live web: ' + failed_url,
                url=failed_url)
Example #4
0
    def raise_on_self_redirect(self, params, cdx, status_code, location_url):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop

        Live captures, non-3xx responses, 304 responses and responses with
        no redirect location are never rejected.

        A redirect counts as a self-redirect when either:

        - the requested url and the redirect target are equal, compared
          case-insensitively and ignoring scheme and trailing slashes, or
        - the canonicalized redirect target equals the original url key
          (``params['sr-urlkey']`` if set, otherwise ``cdx['urlkey']``).

        A redirect target starting with a single ``/`` is resolved against
        the host of ``cdx['url']``; one starting with ``//`` already carries
        its own host.

        When a self-redirect is found, the original url key is stored in
        ``params['sr-urlkey']`` before raising.

        :param params: request params, ``params['url']`` is the requested url
        :param cdx: index entry of the capture being checked
        :param status_code: response status code, as a string
        :param location_url: redirect target of the response, may be empty
        :raises LiveResourceException: if a self-redirect is detected
        """
        if cdx.get('is_live'):
            return

        if not status_code.startswith('3') or status_code == '304':
            return

        request_url = params['url'].lower()
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('//'):
            # network-path reference (//host/path): the host is already
            # part of the location, so only drop the leading slashes
            location_url = location_url[2:]
        elif location_url.startswith('/'):
            # absolute-path reference: resolve against the capture host,
            # lowercased so it can match the lowercased request url
            host = urlsplit(cdx['url']).netloc.lower()
            location_url = host + location_url

        # compare without scheme and without trailing slashes
        location_url = location_url.split('://', 1)[-1].rstrip('/')
        request_url = request_url.split('://', 1)[-1].rstrip('/')

        self_redir = False
        orig_key = params.get('sr-urlkey') or cdx['urlkey']

        if request_url == location_url:
            self_redir = True

        # if new location canonicalized matches old key, also self-redirect
        elif canonicalize(location_url) == orig_key:
            self_redir = True

        if self_redir:
            msg = 'Self Redirect {0} -> {1}'
            msg = msg.format(request_url, location_url)
            # store the key so it is still available via params after
            # this capture has been rejected
            params['sr-urlkey'] = orig_key
            raise LiveResourceException(msg)
Example #5
0
    def _do_request(self, method, load_url, data, req_headers, params,
                    is_live):
        """
        Issue a single upstream request and return the raw response.

        The live adapter is used when ``is_live`` is set, otherwise the
        remote adapter. The request goes through the adapter's proxy
        manager when ``self.socks_proxy`` is set and the ``SOCKS_DISABLE``
        environment variable is not, and through the adapter's pool
        manager otherwise.

        The request is sent without following redirects, and the response
        body is neither preloaded nor decoded.

        :param method: HTTP method to send
        :param load_url: url to request
        :param data: request body
        :param req_headers: request headers
        :param params: request params; ``_timeout`` is used as the timeout
        :param is_live: selects the live adapter when true
        :raises LiveResourceException: if the request fails for any reason
        """
        if is_live:
            adapter = DefaultAdapters.live_adapter
        else:
            adapter = DefaultAdapters.remote_adapter

        retry_policy = adapter.max_retries

        # pick the proxy manager or the plain pool manager for this request
        use_socks = self.socks_proxy and not os.environ.get('SOCKS_DISABLE')
        if use_socks:
            manager = adapter.proxy_manager_for(self.socks_proxy)
        else:
            manager = adapter.poolmanager

        upstream_res = None
        try:
            request_args = dict(method=method,
                                url=load_url,
                                body=data,
                                headers=req_headers,
                                redirect=False,
                                assert_same_host=False,
                                preload_content=False,
                                decode_content=False,
                                retries=retry_policy,
                                timeout=params.get('_timeout'))

            upstream_res = manager.urlopen(**request_args)
            return upstream_res

        except Exception as e:
            # close any response that was obtained before the failure
            if upstream_res:
                no_except_close(upstream_res)

            # only emit failure details when debug logging is on
            if logger.isEnabledFor(logging.DEBUG):
                import traceback
                traceback.print_exc()
                failure = 'FAILED: ' + method + ' ' + load_url + ': ' + str(e)
                logger.debug(failure)

            raise LiveResourceException(load_url)
Example #6
0
    def load_resource(self, cdx, params):
        """
        Load the capture described by ``cdx`` from its ``load_url`` and wrap
        the upstream response as a WARC response record.

        :param cdx: index entry; ``load_url``, ``timestamp`` and ``url`` are
            read, ``timestamp`` and ``source`` may be updated from the
            upstream response headers
        :param params: request params; ``_input_req`` supplies the method,
            headers and body to send upstream
        :return: ``None`` if this loader does not apply (no ``load_url``,
            video content type) or if a memento request came back without
            a ``Memento-Datetime`` header;
            ``(None, upstream headers, upstream response)`` if the upstream
            reports ``Warcserver-Type: warc``;
            otherwise ``(warc_headers, http_headers_buff, upstream_res)``
        :raises LiveResourceException: if ``load_url`` can not be prepared
            as a valid url
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        if self.forward_proxy_prefix and not cdx.get('is_live'):
            load_url = self.forward_proxy_prefix + load_url

        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        # use a PreparedRequest to normalize the url and to derive an
        # Authorization header from any credentials embedded in it
        p = PreparedRequest()
        try:
            p.prepare_url(load_url, None)
        except Exception:
            raise LiveResourceException(load_url)
        p.prepare_headers(None)
        p.prepare_auth(None, load_url)

        auth = p.headers.get('Authorization')
        if auth:
            req_headers['Authorization'] = auth

        load_url = p.url

        # host is set to the actual host for live loading
        # ensure it is set to the load_url host
        if not cdx.get('is_live'):
            req_headers['Host'] = urlsplit(p.url).netloc

            referrer = cdx.get('set_referrer')
            if referrer:
                req_headers['Referer'] = referrer

        upstream_res = self._do_request_with_redir_check(
            method, load_url, data, req_headers, params, cdx)

        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
            # if 'memento_url' set and no Memento-Datetime header present
            # then its an error
            # the response is discarded here, so close it; otherwise the
            # unread upstream response may keep its connection open
            try:
                upstream_res.close()
            except Exception:
                # best-effort cleanup, the capture is rejected either way
                pass
            return None

        agg_type = upstream_res.headers.get('Warcserver-Type')
        if agg_type == 'warc':
            # upstream already returns a WARC record, pass it through as is
            cdx['source'] = unquote(
                upstream_res.headers.get('Warcserver-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        # rebuild the HTTP status line and header block for the record
        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        http_headers_buff = status

        orig_resp = upstream_res._original_response

        try:  #pragma: no cover
            #PY 3
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                nl = n.lower()
                if nl in self.SKIP_HEADERS:
                    continue

                if nl in self.UNREWRITE_HEADERS:
                    v = self.unrewrite_header(cdx, v)

                http_headers_buff += n + ': ' + v + '\r\n'

            http_headers_buff += '\r\n'

            try:
                # http headers could be encoded as utf-8 (though non-standard)
                # first try utf-8 encoding
                http_headers_buff = http_headers_buff.encode('utf-8')
            except:
                # then, fall back to latin-1
                http_headers_buff = http_headers_buff.encode('latin-1')

        except:  #pragma: no cover
            #PY 2
            resp_headers = orig_resp.msg.headers

            for line in resp_headers:
                n, v = line.split(':', 1)
                n = n.lower()
                v = v.strip()

                if n in self.SKIP_HEADERS:
                    continue

                new_v = v
                if n in self.UNREWRITE_HEADERS:
                    new_v = self.unrewrite_header(cdx, v)

                # keep the original raw header line unless the value changed
                if new_v != v:
                    http_headers_buff += n + ': ' + new_v + '\r\n'
                else:
                    http_headers_buff += line

            # if python2, already byte headers, so leave as is
            http_headers_buff += '\r\n'

        # best-effort lookup of the remote peer address from the raw socket
        try:
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

        if not cdx.get('is_live'):
            # non-live loads also record where and when they were fetched
            now = datetime.datetime.utcnow()
            warc_headers['WARC-Source-URI'] = cdx.get('load_url')
            warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        ct = upstream_res.headers.get('Content-Type')
        if ct:
            metadata = self.get_custom_metadata(ct, dt)
            if metadata:
                warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        if method == 'HEAD':
            content_len = 0
        else:
            content_len = upstream_res.headers.get('Content-Length', -1)

        self._set_content_len(content_len, warc_headers,
                              len(http_headers_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)