Exemple #1
0
    def test_timegate_latest_request_timestamp(self):
        """
        TimeGate request sent without an Accept-Datetime header.

        Checks the 302 response, its Vary and link values, and that the
        redirect target carries a 14-character timestamp that is not later
        than the current time.
        """
        target = 'http://www.iana.org/_css/2013.1/screen.css'
        memento_dt = 'Mon, 27 Jan 2014 17:12:39 GMT'

        resp = self.testapp.get(
            '/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css')

        assert resp.status_int == 302
        assert resp.headers[VARY] == 'accept-datetime'

        links = self.get_links(resp)
        assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links

        timemap_link = self.make_timemap_link(target, coll='pywb-non-exact')
        assert timemap_link in links

        memento_link = self.make_memento_link(target,
                                              '20140127171239',
                                              memento_dt,
                                              coll='pywb-non-exact')
        assert memento_link in links

        assert MEMENTO_DATETIME not in resp.headers

        location = resp.headers['Location']
        assert '/pywb-non-exact/' in location

        # everything after the collection prefix starts with the timestamp
        remainder = location.split('/pywb-non-exact/')[-1]
        ts = remainder.split('/')[0]
        assert len(ts) == 14
        assert timestamp_now() >= ts
    def handle_not_found(self, wbrequest, nfe):
        """
        Delegate to the parent not-found handler, then record an embed entry
        for the referring page when the request came from inside the archive.

        An entry is written through ``redis_client.set_embed_entry`` only
        when the request is not a query, has a referrer that starts with
        ``wbrequest.wb_prefix``, and the parent response status is either
        4xx (unless ``skip_missing_count`` says to skip) or 2xx.

        The parent's response is always returned unchanged.
        """
        response = super(MementoHandler, self).handle_not_found(wbrequest, nfe)

        if wbrequest.wb_url.is_query():
            return response

        referrer = wbrequest.referrer
        if not referrer or not referrer.startswith(wbrequest.wb_prefix):
            return response

        # the referring page, with the archive prefix stripped off
        page_url = WbUrl(referrer[len(wbrequest.wb_prefix):])

        code = response.status_headers.get_statuscode()

        if code.startswith('4') and not self.skip_missing_count(page_url):
            label = 'MISSING '
        elif code.startswith('2'):
            label = 'LIVE '
        else:
            # nothing to record for any other status
            return response

        page_key = redis_client.get_url_key(page_url)

        now = timestamp_now()

        entry = label + now + ' ' + wbrequest.wb_url.url
        stored = str(timestamp_to_sec(now)) + ' ' + 'text/html'

        redis_client.set_embed_entry(page_key, entry, stored)

        return response
Exemple #3
0
    def test_timegate_latest_request_timestamp(self):
        """
        TimeGate lookup with no Accept-Datetime header: expect a 302 to a
        URL whose timestamp is 14 characters long and not in the future.
        """
        url = 'http://www.iana.org/_css/2013.1/screen.css'
        coll = 'pywb-non-exact'
        dt = 'Mon, 27 Jan 2014 17:12:39 GMT'

        resp = self.testapp.get('/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css')
        headers = resp.headers

        assert resp.status_int == 302
        assert headers[VARY] == 'accept-datetime'

        links = self.get_links(resp)
        expected_links = (
            '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"',
            self.make_timemap_link(url, coll=coll),
            self.make_memento_link(url, '20140127171239', dt, coll=coll),
        )
        for expected in expected_links:
            assert expected in links

        assert MEMENTO_DATETIME not in headers
        assert '/pywb-non-exact/' in headers['Location']

        # timestamp is the first path segment after the last collection prefix
        wburl = headers['Location'].rsplit('/pywb-non-exact/', 1)[-1]
        ts = wburl.partition('/')[0]
        assert len(ts) == 14
        assert timestamp_now() >= ts
    def get_top_frame(self, wb_url,
                      wb_prefix,
                      host_prefix,
                      env,
                      frame_mod,
                      replay_mod,
                      coll='',
                      extra_params=None):
        """
        Render the top-frame template for ``wb_url``.

        The template receives the embed url (``wb_url`` rendered with
        ``replay_mod``), a ``wbrequest`` dict describing the request, the
        timestamp (the url's own, or the current time when it has none),
        the plain url and ``self.banner_file``. Entries in ``extra_params``
        are merged last and override the defaults.

        Returns whatever ``self.render_to_string`` returns.
        """
        embed_url = wb_url.to_str(mod=replay_mod)

        # fall back to "now" for urls without an explicit timestamp
        timestamp = wb_url.timestamp or timestamp_now()

        request_info = {
            'host_prefix': host_prefix,
            'wb_prefix': wb_prefix,
            'wb_url': wb_url,
            'coll': coll,
            'options': {
                'frame_mod': frame_mod,
                'replay_mod': replay_mod,
            },
        }

        template_args = {
            'embed_url': embed_url,
            'wbrequest': request_info,
            'timestamp': timestamp,
            'url': wb_url.get_url(),
            'banner_html': self.banner_file,
        }

        if extra_params:
            template_args.update(extra_params)

        return self.render_to_string(env, **template_args)
def test_live():
    """Query a LiveIndexSource for a single url and check the one result."""
    live_source = LiveIndexSource()
    res, errs = query_single_source(live_source, {'url': 'http://example.com/'})

    # the expected timestamp is "now", captured after the query has run
    now = timestamp_now()
    expected = 'com,example)/ {0} http://example.com/'.format(now)

    assert key_ts_res(res, 'load_url') == expected
    assert errs == {}
    def load_index(self, params):
        """
        Build a single CDX entry from ``params`` and pass it to ``_do_load``.

        The entry's timestamp is ``params['closest']`` when present and
        non-empty, otherwise the current time. ``load_url`` and
        ``memento_url`` both hold the url produced by
        ``res_template(self.proxy_url, params)``.
        """
        entry = CDXObject()

        # 'key' is decoded from utf-8; presumably bytes -- confirm with caller
        entry['urlkey'] = params.get('key').decode('utf-8')
        entry['timestamp'] = params.get('closest') or timestamp_now()
        entry['url'] = params['url']
        entry['load_url'] = res_template(self.proxy_url, params)
        entry['memento_url'] = entry['load_url']

        return self._do_load(entry, params)
    def __call__(self, params):
        """
        Run an index query described by ``params``.

        ``params`` is modified in place: ``closest == 'now'`` is replaced by
        the current timestamp, and a ``content_type`` value is turned into a
        ``'=mime:<type>'`` filter.

        Returns ``(cdx_iter, errs)`` where ``cdx_iter`` has been passed
        through ``process_cdx`` and ``errs`` is a plain dict.
        """
        if params.get('closest') == 'now':
            params['closest'] = timestamp_now()

        mime = params.get('content_type')
        if mime:
            params['filter'] = '=mime:' + mime

        query = CDXQuery(params)
        raw_iter, errs = self.load_index(query.params)

        return process_cdx(raw_iter, query), dict(errs)
Exemple #8
0
    def _redirect_if_needed(self, wbrequest, cdx):
        if wbrequest.options['is_proxy']:
            return None

        if wbrequest.custom_params.get('noredir'):
            return None

        is_timegate = (wbrequest.options.get('is_timegate', False))
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate

        if not redir_needed and self.redir_to_exact:
            redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        if self.enable_range_cache and wbrequest.extract_range():
            return None

        if is_timegate and not self.redir_to_exact:
            timestamp = timestamp_now()
        else:
            timestamp = cdx['timestamp']

        new_url = (wbrequest.urlrewriter.
                   get_new_url(timestamp=timestamp,
                               url=cdx['url']))

        if wbrequest.method == 'POST':
            #   FF shows a confirm dialog, so can't use 307 effectively
            #   was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = '302 Found'
        else:
            # clear cdx line to indicate internal redirect
            statusline = '302 Internal Redirect'
            cdx = None

        status_headers = StatusAndHeaders(statusline,
                                          [('Location', new_url)])

        return self.response_class(status_headers,
                                   wbrequest=wbrequest,
                                   cdx=cdx)
Exemple #9
0
    def _redirect_if_needed(self, wbrequest, cdx):
        if wbrequest.options['is_proxy']:
            return None

        if wbrequest.custom_params.get('noredir'):
            return None

        is_timegate = (wbrequest.options.get('is_timegate', False))
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate

        if not redir_needed and self.redir_to_exact:
            redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        if self.enable_range_cache and wbrequest.extract_range():
            return None

        if is_timegate and not self.redir_to_exact:
            timestamp = timestamp_now()
        else:
            timestamp = cdx['timestamp']

        new_url = (wbrequest.urlrewriter.get_new_url(timestamp=timestamp,
                                                     url=cdx['url']))

        if wbrequest.method == 'POST':
            #   FF shows a confirm dialog, so can't use 307 effectively
            #   was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = '302 Found'
        else:
            # clear cdx line to indicate internal redirect
            statusline = '302 Internal Redirect'
            cdx = None

        status_headers = StatusAndHeaders(statusline, [('Location', new_url)])

        return self.response_class(status_headers,
                                   wbrequest=wbrequest,
                                   cdx=cdx)
Exemple #10
0
    def _get_timemap_query(self, params):
        """
        Build the ``exacturlexpand:<url> date:<from>-<to>`` query string.

        ``from`` is padded against EARLIEST_DATE and defaults to it.
        ``to`` is padded against LATEST_DATE, but defaults to the current
        timestamp (not LATEST_DATE) when absent.
        """
        start = params.get('from')
        start = pad_timestamp(start, EARLIEST_DATE) if start else EARLIEST_DATE

        end = params.get('to')
        end = pad_timestamp(end, LATEST_DATE) if end else timestamp_now()

        return 'exacturlexpand:{0} date:{1}-{2}'.format(
            params.get('url'), start, end)
    def _get_timemap_query(self, params):
        """
        Return the timemap query for ``params['url']`` over a date range.

        The range start comes from ``params['from']`` (padded, default
        EARLIEST_DATE); the range end comes from ``params['to']`` (padded
        against LATEST_DATE) or, when missing, the current timestamp.
        """
        lower = params.get('from')
        if not lower:
            lower = EARLIEST_DATE
        else:
            lower = pad_timestamp(lower, EARLIEST_DATE)

        upper = params.get('to')
        if upper:
            upper = pad_timestamp(upper, LATEST_DATE)
        else:
            # open-ended ranges stop at "now"
            upper = timestamp_now()

        template = 'exacturlexpand:{0} date:{1}-{2}'
        return template.format(params.get('url'), lower, upper)
Exemple #12
0
    def test_redirect_non_exact_latest_replay_ts(self):
        """
        Latest-replay request on the non-exact collection: the 302 target
        carries a current timestamp, and that timestamp appears in the
        replayed page.
        """
        resp = self.testapp.get('/pywb-non-exact/http://example.com/')
        assert resp.status_int == 302

        location = resp.headers['Location']
        assert location.endswith('/http://example.com')

        # extract ts, which should be current time
        prefix = location.split('/http://')[0]
        ts = prefix.rpartition('/')[2]
        assert len(ts) == 14, ts

        resp = resp.follow()
        self._assert_basic_html(resp)

        # ensure the current ts is present in the links
        body = resp.body
        assert '"{0}"'.format(ts) in body
        assert '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(ts) in body

        # ensure ts is current ts
        assert timestamp_now() >= ts, ts
Exemple #13
0
    def add_page(self, user, coll, rec, pagedata):
        """
        Store one page entry in the recording's page hash.

        Requires write access (``assert_can_write``). When ``pagedata`` has
        no timestamp it is filled in place, first from ``_get_url_ts`` and
        then from the current time. The hash field is ``'<url> <timestamp>'``
        and the value is the utf-8 encoded JSON of ``pagedata``.

        Returns an empty dict.
        """
        self.assert_can_write(user, coll)

        key = self.page_key.format(user=user, coll=coll, rec=rec)

        url = pagedata['url']

        if not pagedata.get('timestamp'):
            known_ts = self._get_url_ts(user, coll, rec, url)
            pagedata['timestamp'] = known_ts or timestamp_now()

        field = url + ' ' + pagedata['timestamp']
        payload = json.dumps(pagedata).encode('utf-8')

        self.redis.hset(key, field, payload)

        return {}
Exemple #14
0
    def test_redirect_non_exact_latest_replay_ts(self):
        """
        Requesting the latest capture from the non-exact collection must
        redirect to a url stamped with the current time, and following the
        redirect must yield a page that references that same timestamp.
        """
        redirect = self.testapp.get('/pywb-non-exact/http://example.com/')
        assert redirect.status_int == 302
        assert redirect.headers['Location'].endswith('/http://example.com')

        # extract ts, which should be current time
        before_url = redirect.headers['Location'].partition('/http://')[0]
        ts = before_url.rsplit('/', 1)[-1]
        assert len(ts) == 14, ts

        page = redirect.follow()
        self._assert_basic_html(page)

        # ensure the current ts is present in the links
        quoted_ts = '"{0}"'.format(ts)
        replay_link = '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(ts)
        assert quoted_ts in page.body
        assert replay_link in page.body

        # ensure ts is current ts
        assert timestamp_now() >= ts, ts
Exemple #15
0
    def download_coll():
        """
        Serve a whole collection as a ``.warc.gz`` attachment.

        The collection is taken from the ``coll`` query parameter. Names
        starting with ``@anon`` belong to the anonymous user; anything else
        is split into user and collection by ``path_parser``.

        Raises HTTPError 400 when ``coll`` is missing and HTTPError 404 when
        ``manager.download_all`` has nothing to return.
        """
        coll = request.query.get('coll')
        if coll is None:
            # without this guard a missing parameter crashed on None.startswith
            raise HTTPError(status=400, body='No Collection Specified')

        ts = timestamp_now()
        if coll.startswith('@anon'):
            user = manager.get_anon_user()
            filename = 'webarchive-all-{0}.warc.gz'.format(ts)
        else:
            user, coll = path_parser.get_user_coll(coll)
            # NOTE(review): ts is passed but this template has no {2}
            # placeholder, so the timestamp is not part of the name -- confirm
            # whether that is intended before changing the filename.
            filename = '{0}-{1}-all.warc.gz'.format(user, coll, ts)

        res = manager.download_all(user, coll)

        if not res:
            raise HTTPError(status=404, body='No Download Data Available')

        # download_all yields the total length and a callable producing the body
        length, func = res
        response.headers['Content-Type'] = 'text/plain'
        response.headers['Content-Disposition'] = 'attachment; filename=' + filename
        response.headers['Content-Length'] = length
        response.body = func()
        return response
Exemple #16
0
    def download_coll():
        """
        Return the collection named by the ``coll`` query parameter as a
        downloadable ``.warc.gz`` attachment.

        ``@anon...`` collections are resolved against the anonymous user;
        other values are parsed into ``(user, coll)`` by ``path_parser``.

        Raises HTTPError 400 if ``coll`` is absent, HTTPError 404 if
        ``manager.download_all`` returns nothing.
        """
        coll = request.query.get('coll')
        if coll is None:
            # a missing parameter used to fail with AttributeError below
            raise HTTPError(status=400, body='No Collection Specified')

        ts = timestamp_now()
        if coll.startswith('@anon'):
            user = manager.get_anon_user()
            filename = 'webarchive-all-{0}.warc.gz'.format(ts)
        else:
            user, coll = path_parser.get_user_coll(coll)
            # NOTE(review): the template never references ts ({2}); the
            # timestamp is silently dropped from this filename -- confirm.
            filename = '{0}-{1}-all.warc.gz'.format(user, coll, ts)

        res = manager.download_all(user, coll)

        if not res:
            raise HTTPError(status=404, body='No Download Data Available')

        # res is a (length, body-producing callable) pair
        length, func = res
        response.headers['Content-Type'] = 'text/plain'
        response.headers[
            'Content-Disposition'] = 'attachment; filename=' + filename
        response.headers['Content-Length'] = length
        response.body = func()
        return response
Exemple #17
0
    def import_pages(self, user, coll, rec, pagelist):
        """
        Bulk-store page entries in the recording's page hash.

        Requires admin access (``assert_can_admin``). Each item of
        ``pagelist`` is a dict with at least ``'url'``; a missing timestamp
        is filled in place from ``_get_url_ts`` or, failing that, the
        current time. Entries are keyed ``'<url> <timestamp>'`` and stored
        as utf-8 encoded JSON.

        An empty ``pagelist`` is a no-op: nothing is written.

        Returns an empty dict.
        """
        self.assert_can_admin(user, coll)

        key = self.page_key.format(user=user, coll=coll, rec=rec)

        pagemap = {}

        for pagedata in pagelist:
            url = pagedata['url']

            if not pagedata.get('timestamp'):
                pagedata['timestamp'] = (
                    self._get_url_ts(user, coll, rec, url) or timestamp_now())

            pagemap[url + ' ' + pagedata['timestamp']] = (
                json.dumps(pagedata).encode('utf-8'))

        # redis-py rejects hmset with an empty mapping, so skip the write
        if pagemap:
            self.redis.hmset(key, pagemap)

        return {}
Exemple #18
0
    def fetch_request(self, url, urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      skip_recording=False,
                      verify=True,
                      remote_only=True):
        """
        Fetch ``url`` live (over http or from a local file) and return the
        rewritten content.

        :param url: url to fetch; ``///`` and leading ``//`` forms are
            normalised to ``http://`` first
        :param urlrewriter: passed through to ``self.rewriter.rewrite_content``
        :param head_insert_func: passed through to ``rewrite_content``
        :param urlkey: explicit url key; computed with ``canonicalize`` if unset
        :param env: optional dict; receives the cdx under ``'pywb.cdx'``
        :param req_headers: request headers for ``fetch_http``; a new empty
            dict is used for each call when omitted
        :param timestamp: cdx timestamp; defaults to the current time
        :param follow_redirects, skip_recording, verify: passed to ``fetch_http``
        :param remote_only: when true the url is always fetched over http
        :return: the result of ``self.rewriter.rewrite_content``
        """
        # a literal {} default would be shared (and possibly mutated) across calls
        if req_headers is None:
            req_headers = {}

        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if remote_only or is_http(url):
            is_remote = True
        else:
            is_remote = False
            if not url.startswith('file:'):
                url = to_file_url(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if is_remote:
            (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                       req_headers,
                                                       follow_redirects,
                                                       skip_recording,
                                                       verify)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        if timestamp is None:
            timestamp = timestamp_now()

        cdx = {'urlkey': urlkey,
               'timestamp': timestamp,
               'url': url,
               'status': status_headers.get_statuscode(),
               'mime': status_headers.get_header('Content-Type'),
               'is_live': True,
              }

        result = (self.rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers,
                                  stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=urlkey,
                                  cdx=cdx))

        # expose the synthesized cdx to the caller's environ, if one was given
        if env:
            env['pywb.cdx'] = cdx

        return result
Exemple #19
0
    def fetch_request(self,
                      url,
                      urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      skip_recording=False,
                      verify=True,
                      remote_only=True):
        """
        Load ``url`` from the live web (or a local file) and run it through
        the content rewriter.

        ``req_headers`` defaults to a new empty dict on every call.
        ``urlkey`` defaults to ``canonicalize(url)`` and ``timestamp`` to the
        current time. When ``env`` is a non-empty dict, the cdx built for
        this fetch is stored in it under ``'pywb.cdx'``.

        Returns the result of ``self.rewriter.rewrite_content``.
        """
        # never share one dict between calls: fetch_http receives this object
        if req_headers is None:
            req_headers = {}

        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        # scheme-relative urls are fetched over http
        if url.startswith('//'):
            url = 'http:' + url

        if remote_only or is_http(url):
            is_remote = True
        else:
            is_remote = False
            if not url.startswith('file:'):
                url = to_file_url(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if is_remote:
            (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                       req_headers,
                                                       follow_redirects,
                                                       skip_recording, verify)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        if timestamp is None:
            timestamp = timestamp_now()

        cdx = {
            'urlkey': urlkey,
            'timestamp': timestamp,
            'url': url,
            'status': status_headers.get_statuscode(),
            'mime': status_headers.get_header('Content-Type'),
            'is_live': True,
        }

        result = (self.rewriter.rewrite_content(
            urlrewriter,
            status_headers,
            stream,
            head_insert_func=head_insert_func,
            urlkey=urlkey,
            cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result
Exemple #20
0
    def handle_download(self, user, coll, rec):
        """
        Stream a collection, or selected recordings of it, as one download.

        ``rec`` is ``'*'`` for every recording, otherwise a comma-separated
        list of recording ids. The output is the collection warcinfo
        followed, per selected recording, by its warcinfo and the contents
        of each of its WARC files. WARC files that fail to load are reported
        with ``print`` and skipped.

        When ``self.download_chunk_encoded`` is false the total size is
        computed up front and sent as Content-Length; otherwise the response
        is marked ``Transfer-Encoding: chunked``.

        Returns a generator of body chunks. A missing collection is reported
        through ``self._raise_error(404, ...)``.
        """
        collection = self.manager.get_collection(user, coll, rec)
        if not collection:
            self._raise_error(404, 'Collection not found', id=coll)

        now = timestamp_now()

        # download name: collection id, narrowed by the recording(s) requested
        name = collection['id']
        if rec != '*':
            rec_list = rec.split(',')
            if len(rec_list) == 1:
                name = rec
            else:
                name += '-' + rec
        else:
            rec_list = None

        filename = self.download_filename.format(title=quote(name),
                                                 timestamp=now)
        loader = BlockLoader()

        coll_info = self.create_coll_warcinfo(user, collection, filename)

        def iter_infos():
            # yields (recording, warcinfo, size) for each selected recording
            for recording in collection['recordings']:
                if rec_list and recording['id'] not in rec_list:
                    continue

                warcinfo = self.create_rec_warcinfo(user, collection,
                                                    recording, filename)

                size = len(warcinfo)
                size += recording['size']
                yield recording, warcinfo, size

        def read_all(infos):
            # collection warcinfo first, then each recording's warcinfo + data
            yield coll_info

            for recording, warcinfo, _ in infos:
                yield warcinfo

                for warc_path in self._iter_all_warcs(user, coll,
                                                      recording['id']):
                    try:
                        fh = loader.load(warc_path)
                    except Exception:
                        # best effort: skip unreadable files, but let
                        # KeyboardInterrupt / SystemExit propagate
                        print('Skipping invalid ' + warc_path)
                        continue

                    for chunk in StreamIter(fh):
                        yield chunk

        response.headers['Content-Type'] = 'application/octet-stream'
        response.headers[
            'Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

        # if not transfer-encoding, store infos and calculate total size
        if not self.download_chunk_encoded:
            infos = list(iter_infos())
            total_size = len(coll_info)
            total_size += sum(info_size for _, _, info_size in infos)

            response.headers['Content-Length'] = total_size
            return read_all(infos)

        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'

        return read_all(iter_infos())
    def write_snapshot(self,
                       user,
                       coll,
                       url,
                       title,
                       html_text,
                       referrer,
                       user_agent,
                       browser=None):
        """
        Upload ``html_text`` as a static snapshot of ``url`` and, when a
        title is given, register a page entry for it.

        The snapshot goes into the 'Static Snapshots' recording, which is
        created first if it does not exist. The html is PUT to the upstream
        url obtained from ``self.manager.content_app``.

        Returns ``{'error_message': 'Snapshot Failed'}`` when the upstream
        reply cannot be parsed or does not report success,
        ``{'snapshot': ''}`` when ``title`` is empty, otherwise
        ``{'snapshot': <page data dict>}``.
        """
        snap_title = 'Static Snapshots'
        snap_rec = self.sanitize_title(snap_title)

        if not self.manager.has_recording(user, coll, snap_rec):
            self.manager.create_recording(user, coll, snap_rec, snap_title)

        route_args = {
            'user': user,
            'coll': quote(coll),
            'rec': quote(snap_rec, safe='/*'),
            'type': 'snapshot',
        }

        upstream_url = self.manager.content_app.get_upstream_url(
            '', route_args, {'url': url})

        warc_headers = {
            'Content-Type': 'text/html; charset=utf-8',
            'WARC-User-Agent': user_agent,
            'WARC-Referer': referrer,
        }

        body = BytesIO(html_text.encode('utf-8'))
        upload = requests.put(upstream_url, data=body, headers=warc_headers)

        try:
            reply = upload.json()
            if reply['success'] != 'true':
                print(reply)
                return {'error_message': 'Snapshot Failed'}

            warc_date = reply.get('WARC-Date')

        except Exception as exc:
            print(exc)
            return {'error_message': 'Snapshot Failed'}

        # untitled snapshots are stored but not listed as pages
        if not title:
            return {'snapshot': ''}

        # prefer the date reported by the upstream writer over "now"
        timestamp = (iso_date_to_timestamp(warc_date) if warc_date
                     else timestamp_now())

        page_data = {
            'url': url,
            'title': title,
            'timestamp': timestamp,
            'tags': ['snapshot'],
        }
        if browser:
            page_data['browser'] = browser

        self.manager.add_page(user, coll, snap_rec, page_data)

        return {'snapshot': page_data}