Example #1
0
    def parse_mem_value(self, m):
        """Convert one memento entry into a ``MemValue``.

        :param m: mapping providing a ``'datetime'`` entry (handed to
                  ``iso_date_to_datetime``) and a ``'uri'`` entry.
        :return: ``MemValue(timestamp, seconds, uri)`` where both time
                 values are derived from the parsed ``'datetime'`` entry.
        """
        parsed = iso_date_to_datetime(m['datetime'])

        # same moment expressed two ways: seconds value and timestamp value
        seconds = datetime_to_secs(parsed)
        stamp = datetime_to_timestamp(parsed)

        return MemValue(stamp, seconds, m['uri'])
Example #2
0
    def fetch_request(self,
                      url,
                      urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      proxies=None):
        """Fetch ``url`` live and return its rewritten content.

        The url is normalised first: an accidental ``///`` (anything other
        than ``file:///``) is replaced by ``http://`` followed by the
        segment after the first ``///``, and a scheme-relative ``//...``
        url gets an ``http:`` prefix.  Urls accepted by ``is_http`` are
        loaded through ``self.fetch_http``; everything else goes through
        ``self.fetch_local_file``.

        :param url: url (or local path) to load
        :param urlrewriter: forwarded to ``self.rewriter.rewrite_content``
        :param head_insert_func: forwarded to ``rewrite_content``
        :param urlkey: explicit url key (say for testing); computed with
                       ``canonicalize(url)`` when falsy
        :param env: optional mapping; when truthy, the generated cdx dict
                    is stored in it under ``'pywb.cdx'``
        :param req_headers: optional dict of request headers handed to
                            ``fetch_http``; a new empty dict is used when
                            ``None``
        :param timestamp: timestamp for the cdx entry; defaults to the
                          current UTC time
        :param follow_redirects: forwarded to ``fetch_http``
        :param proxies: forwarded to ``fetch_http``
        :return: the result of ``self.rewriter.rewrite_content``
        """
        # Use a fresh dict per call: a shared ``{}`` default would carry
        # any headers added downstream over into unrelated requests.
        if req_headers is None:
            req_headers = {}

        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if is_http(url):
            (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                       follow_redirects,
                                                       proxies)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if timestamp is None:
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

        # minimal cdx entry describing this live capture
        cdx = {
            'urlkey': urlkey,
            'timestamp': timestamp,
            'original': url,
            'statuscode': status_headers.get_statuscode(),
            'mimetype': status_headers.get_header('Content-Type'),
            'is_live': True,
        }

        result = (self.rewriter.rewrite_content(
            urlrewriter,
            status_headers,
            stream,
            head_insert_func=head_insert_func,
            urlkey=urlkey,
            cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result
Example #3
0
    def fetch_request(self, url, urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      proxies=None):
        """Load ``url`` live and return the rewritten response.

        Before loading, the url is repaired: an accidental ``///`` (except
        in ``file:///``) becomes ``http://`` plus the segment following the
        first ``///``, and a leading ``//`` is prefixed with ``http:``.
        Urls for which ``is_http`` is true are loaded with
        ``self.fetch_http``, all others with ``self.fetch_local_file``.

        :param url: url (or local path) to load
        :param urlrewriter: passed to ``self.rewriter.rewrite_content``
        :param head_insert_func: passed to ``rewrite_content``
        :param urlkey: explicit url key (say for testing); derived via
                       ``canonicalize(url)`` when falsy
        :param env: optional mapping; if truthy it receives the cdx dict
                    under the ``'pywb.cdx'`` key
        :param req_headers: optional request-header dict given to
                            ``fetch_http``; ``None`` means a new empty dict
        :param timestamp: cdx timestamp; current UTC time when ``None``
        :param follow_redirects: passed to ``fetch_http``
        :param proxies: passed to ``fetch_http``
        :return: whatever ``self.rewriter.rewrite_content`` returns
        """
        # A ``{}`` default would be created once and shared by every call,
        # so header changes made downstream could leak between requests.
        if req_headers is None:
            req_headers = {}

        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if is_http(url):
            (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                       follow_redirects,
                                                       proxies)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if timestamp is None:
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

        # minimal cdx entry describing this live capture
        cdx = {'urlkey': urlkey,
               'timestamp': timestamp,
               'original': url,
               'statuscode': status_headers.get_statuscode(),
               'mimetype': status_headers.get_header('Content-Type'),
               'is_live': True,
              }

        result = (self.rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers,
                                  stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=urlkey,
                                  cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result
Example #4
0
    def get_top_frame_params(self, wbrequest, mod):
        """Collect the values used to render the top frame.

        :param wbrequest: request object exposing ``wb_url``
        :param mod: modifier passed to ``wb_url.to_str`` to build the
                    embedded url
        :return: dict with ``embed_url``, ``wbrequest``, ``timestamp``,
                 ``url`` and ``banner_html`` entries
        """
        wb_url = wbrequest.wb_url
        embed_url = wb_url.to_str(mod=mod)

        # no timestamp on the request: fall back to the current UTC time
        timestamp = wb_url.timestamp
        if not timestamp:
            timestamp = datetime_to_timestamp(datetime.utcnow())

        return {
            'embed_url': embed_url,
            'wbrequest': wbrequest,
            'timestamp': timestamp,
            'url': wb_url.get_url(),
            'banner_html': self.banner_html,
        }
Example #5
0
    def get_top_frame_params(self, wbrequest, mod):
        """Build the parameter dict for the top-frame view.

        :param wbrequest: request object exposing ``wb_url``
        :param mod: modifier handed to ``wb_url.to_str`` for the embedded
                    url
        :return: dict holding ``embed_url``, ``wbrequest``, ``timestamp``,
                 ``url`` and ``banner_html``
        """
        target = wbrequest.wb_url
        framed_url = target.to_str(mod=mod)

        # use the request's own timestamp when present, otherwise "now" (UTC)
        stamp = target.timestamp
        if not stamp:
            stamp = datetime_to_timestamp(datetime.utcnow())

        return {
            'embed_url': framed_url,
            'wbrequest': wbrequest,
            'timestamp': stamp,
            'url': target.get_url(),
            'banner_html': self.banner_html,
        }
    def snapshot(self):
        """Record an HTML snapshot posted by the client.

        Reads ``coll``, ``url``, ``title``, ``addpage`` and ``prefix`` from
        the request query and the HTML from the request body.  The HTML is
        passed through ``HTMLDomUnRewriter.unrewrite_html`` and then sent
        to ``url`` with the custom ``PUTRES`` method through
        ``self.warcprox_proxies``, with recording details in the
        ``warcprox-meta`` header.  When ``addpage`` is set, the page is
        also registered through ``self.manager.add_page``.

        :return: ``{'status': <upstream status code>}`` on success, or
                 ``{'status': 'err'}`` if the upstream request or the page
                 registration raised
        :raises HTTPError: 404 when no ``url`` was supplied or the
                           collection is not writable for the user
        """
        coll = request.query.get('coll', '')
        if coll == '@anon':
            user = self.manager.get_anon_user()
        else:
            user, coll = self.path_parser.get_user_coll(coll)

        url = request.query.get('url', '')
        if not url or not self.manager.can_write_coll(user, coll):
            raise HTTPError(status=404, body='No Such Page')

        title = request.query.get('title', '')
        add_page = request.query.get('addpage', False)

        html_text = request.body.read()

        host = WbRequest.make_host_prefix(request.environ)

        prefix = request.query.get('prefix', host)

        orig_html = HTMLDomUnRewriter.unrewrite_html(host, prefix, html_text)

        dt = datetime.utcnow()

        sesh_id = self.path_parser.get_coll_path(user, coll)

        target = dict(output_dir=self.path_parser.get_archive_dir(user, coll),
                      sesh_id=sesh_id.replace('/', ':'),
                      user_id=user,
                      name_prefix=self.path_parser.get_name_prefix(user, coll),
                      json_metadata={'snapshot': 'html', 'timestamp': str(dt)},
                      writer_type='-snapshot')

        if url.startswith('https://'):
            # Downgrade the scheme only: limiting the replacement to the
            # first match keeps any 'https:' embedded later in the url
            # (e.g. inside a query string) intact.
            url = url.replace('https:', 'http:', 1)

        req_headers = {'warcprox-meta': json.dumps(target),
                       'content-type': 'text/html',
                       'user-agent': request.headers.get('user-agent')
                      }

        pagedata = {'url': url,
                    'title': title,
                    'tags': ['snapshot'],
                    'ts': datetime_to_timestamp(dt)
                   }

        try:
            resp = requests.request(method='PUTRES',
                                    url=url,
                                    data=orig_html,
                                    headers=req_headers,
                                    proxies=self.warcprox_proxies,
                                    verify=False)

            if add_page:
                self.manager.add_page(user, coll, pagedata)
        except Exception:
            # Best-effort endpoint: report the failure to the client but
            # keep the cause visible, and let KeyboardInterrupt/SystemExit
            # propagate instead of being swallowed by a bare except.
            import traceback
            traceback.print_exc()
            return {'status': 'err'}

        return {'status': resp.status_code}
    def load_resource(self, cdx, params):
        """Fetch ``cdx['load_url']`` from an upstream server and wrap the
        response as a WARC ``response`` record.

        :param cdx: dict-like index entry; ``load_url``, ``timestamp``,
                    ``memento_url`` and ``url`` are read, and
                    ``timestamp`` / ``source`` may be overwritten
        :param params: dict-like request parameters; ``content_type``,
                       ``_input_req`` and ``_timeout`` are read
        :return: ``None`` when there is no ``load_url``, when
                 ``content_type`` equals ``VideoLoader.CONTENT_TYPE``, or
                 when ``memento_url`` is set but the upstream response has
                 no ``Memento-Datetime`` header;
                 ``(None, upstream headers, upstream response)`` when the
                 upstream response carries ``WebAgg-Type: warc``;
                 otherwise ``(warc_headers, http_headers_buff,
                 upstream_res)`` where ``warc_headers`` is a
                 ``StatusAndHeaders`` and ``http_headers_buff`` is the
                 latin-1 encoded status line plus headers
        :raises LiveResourceException: if the upstream request raises
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        input_req = params['_input_req']

        # NOTE: this dict is modified below (Accept-Datetime, Authorization)
        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        # ask the upstream memento source for the capture closest to dt
        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        # Run load_url through a PreparedRequest to obtain the prepared url
        # and any Authorization header it produces -- presumably derived
        # from credentials embedded in the url; confirm against requests.
        p = PreparedRequest()
        p.prepare_url(load_url, None)
        p.prepare_headers(None)
        p.prepare_auth(None, load_url)

        auth = p.headers.get('Authorization')
        if auth:
            req_headers['Authorization'] = auth

        load_url = p.url

        try:
            # redirects are not followed and the body is neither preloaded
            # nor decoded, so the raw upstream response can be passed on
            upstream_res = self.pool.urlopen(method=method,
                                             url=load_url,
                                             body=data,
                                             headers=req_headers,
                                             redirect=False,
                                             assert_same_host=False,
                                             preload_content=False,
                                             decode_content=False,
                                             retries=self.num_retries,
                                             timeout=params.get('_timeout'))

        except Exception as e:
            # any failure to reach the upstream is reported uniformly
            raise LiveResourceException(load_url)

        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            # the upstream's capture time replaces the requested timestamp
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
            # if 'memento_url' is set and no Memento-Datetime header is
            # present, then it is an error
            return None

        agg_type = upstream_res.headers.get('WebAgg-Type')
        if agg_type == 'warc':
            # upstream response is marked as WARC: no new WARC headers are
            # built here, the upstream headers and response are returned
            cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        # presumably raises when the Location header points back at the
        # requested url -- confirm in raise_on_self_redirect
        self.raise_on_self_redirect(params, cdx,
                                    str(upstream_res.status),
                                    upstream_res.headers.get('Location'))


        # version is compared against 11 for HTTP/1.1; anything else is
        # written as HTTP/1.0
        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        # rebuild the HTTP header block as text, starting with the status line
        http_headers_buff = status

        orig_resp = upstream_res._original_response

        try:  #pragma: no cover
            # Python 3: headers available as (name, value) pairs
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                if n.lower() in self.SKIP_HEADERS:
                    continue

                http_headers_buff += n + ': ' + v + '\r\n'
        except:  #pragma: no cover
            # Python 2 fallback: msg.headers entries are appended as-is,
            # paired with getheaders() only to apply the SKIP_HEADERS check
            resp_headers = orig_resp.msg.headers
            for n, v in zip(orig_resp.getheaders(), resp_headers):
                if n in self.SKIP_HEADERS:
                    continue

                http_headers_buff += v

        # blank line terminates the header block
        http_headers_buff += '\r\n'
        http_headers_buff = http_headers_buff.encode('latin-1')

        try:
            # reach into the underlying socket for the peer address;
            # relies on private attributes, hence the broad fallback below
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        # hands over the upstream Content-Length (-1 when absent) and the
        # size of the rebuilt header block; the helper presumably derives
        # the WARC record length from them -- confirm in _set_content_len
        self._set_content_len(upstream_res.headers.get('Content-Length', -1),
                              warc_headers,
                              len(http_headers_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)
Example #8
0
    def snapshot(self):
        """Record an HTML snapshot posted by the client.

        Reads ``coll``, ``url``, ``title``, ``addpage`` and ``prefix`` from
        the request query and the UTF-8 encoded HTML from the request
        body.  The HTML is passed through
        ``HTMLDomUnRewriter.unrewrite_html``, re-encoded as UTF-8 and sent
        to ``url`` with the custom ``PUTRES`` method through
        ``self.warcprox_proxies``, with recording details in the
        ``warcprox-meta`` header.  When ``addpage`` is set, the page is
        also registered through ``self.manager.add_page``.

        :return: ``{'status': <upstream status code>}`` on success, or
                 ``{'status': 'err'}`` if the upstream request or the page
                 registration raised
        :raises HTTPError: 404 when no ``url`` was supplied or the
                           collection is not writable for the user
        """
        coll = request.query.get('coll', '')
        if coll == '@anon':
            user = self.manager.get_anon_user()
        else:
            user, coll = self.path_parser.get_user_coll(coll)

        url = request.query.get('url', '')
        if not url or not self.manager.can_write_coll(user, coll):
            raise HTTPError(status=404, body='No Such Page')

        title = request.query.get('title', '')
        add_page = request.query.get('addpage', False)

        html_text = request.body.read().decode('utf-8')

        host = WbRequest.make_host_prefix(request.environ)

        prefix = request.query.get('prefix', host)

        orig_html = HTMLDomUnRewriter.unrewrite_html(host, prefix, html_text)

        dt = datetime.utcnow()

        sesh_id = self.path_parser.get_coll_path(user, coll)

        target = dict(output_dir=self.path_parser.get_archive_dir(user, coll),
                      sesh_id=sesh_id.replace('/', ':'),
                      user_id=user,
                      name_prefix=self.path_parser.get_name_prefix(user, coll),
                      json_metadata={'snapshot': 'html', 'timestamp': str(dt)},
                      writer_type='-snapshot')

        if url.startswith('https://'):
            # Downgrade the scheme only: limiting the replacement to the
            # first match keeps any 'https:' embedded later in the url
            # (e.g. inside a query string) intact.
            url = url.replace('https:', 'http:', 1)

        req_headers = {'warcprox-meta': json.dumps(target),
                       'content-type': 'text/html',
                       'user-agent': request.headers.get('user-agent')
                      }

        pagedata = {'url': url,
                    'title': title,
                    'tags': ['snapshot'],
                    'ts': datetime_to_timestamp(dt)
                   }

        try:
            resp = requests.request(method='PUTRES',
                                    url=url,
                                    data=orig_html.encode('utf-8'),
                                    headers=req_headers,
                                    proxies=self.warcprox_proxies,
                                    verify=False)

            if add_page:
                self.manager.add_page(user, coll, pagedata)
        except Exception:
            # Best-effort endpoint: the client only gets an 'err' status,
            # so print the traceback to keep the cause diagnosable.
            import traceback
            traceback.print_exc()
            return {'status': 'err'}

        return {'status': resp.status_code}