Example #1
0
    def merge_request_data(self, other, options):
        """Fold data from a paired request entry into this entry.

        ``self`` must be a non-request entry and ``other`` a request entry;
        any other combination is rejected. When the request carries a
        ``_post_query`` whose ``append_query()`` result (with every
        ``WB_wombat_`` occurrence removed) differs from this entry's url,
        both entries receive a url key computed from that extended url and
        this entry records the query's ``method`` and ``query``. A
        ``referer`` request header, when present, is stored as ``_referer``.

        :param other: candidate request entry; must expose ``record`` and
            a dict-style ``get``
        :param options: option mapping; ``surt_ordered`` (default ``True``)
            is passed through to ``canonicalize``
        :return: ``True`` when ``other`` was merged, ``False`` when the pair
            is not a (non-request, request) combination
        """
        surt_ordered = options.get('surt_ordered', True)

        if other.record.rec_type != 'request':
            return False

        # two requests, not correct
        if self.record.rec_type == 'request':
            return False

        # merge POST/PUT body query
        # The request entry may carry no '_post_query' at all, so it is only
        # dereferenced once it is known to be present; previously a missing
        # query raised AttributeError before the truthiness check ran.
        post_query = other.get('_post_query')
        if post_query:
            url = self['url']
            new_url = post_query.append_query(url)
            new_url = new_url.replace('WB_wombat_', '')
            if new_url != url:
                self['urlkey'] = canonicalize(new_url, surt_ordered)
                other['urlkey'] = self['urlkey']

                self['method'] = post_query.method
                self['requestBody'] = post_query.query

        referer = other.record.http_headers.get_header('referer')
        if referer:
            self['_referer'] = referer

        return True
Example #2
0
    def rewrite_record(self,
                       headers,
                       content,
                       ts,
                       url='http://example.com/',
                       prefix='http://localhost:8080/prefix/',
                       warc_headers=None,
                       request_url=None,
                       is_live=None):
        """Create a response record for ``url`` and run the content rewriter.

        A cdx entry describing ``url`` at ``ts`` accompanies the record.
        The entry is flagged ``is_fuzzy`` whenever ``request_url`` is not
        equal to ``url``, which includes the default ``request_url=None``.

        :param headers: forwarded to ``_create_response_record``
        :param content: forwarded to ``_create_response_record``
        :param ts: timestamp used for the replay url and the cdx entry
        :param url: url of the record and of the cdx entry
        :param prefix: prefix handed to ``UrlRewriter``
        :param warc_headers: forwarded to ``_create_response_record``
        :param request_url: url used for the replay url when truthy
        :param is_live: stored verbatim as the entry's ``is_live``
        :return: the result of ``self.content_rewriter``
        """
        record = self._create_response_record(url, headers, content,
                                              warc_headers)

        # The replay url uses the requested url when one is supplied and
        # falls back to the record url otherwise.
        replay_target = request_url or url
        url_rewriter = UrlRewriter(WbUrl(ts + '/' + replay_target), prefix)

        entry = CDXObject()
        entry['url'] = url
        entry['timestamp'] = ts
        entry['urlkey'] = canonicalize(url)
        if request_url != url:
            entry['is_fuzzy'] = '1'
        entry['is_live'] = is_live

        return self.content_rewriter(record, url_rewriter, None, cdx=entry)
    def rewrite_record(self, headers, content, ts, url='http://example.com/',
                       prefix='http://localhost:8080/prefix/', warc_headers=None,
                       request_url=None, is_live=None, use_js_proxy=True, environ=None):
        """Create a response record for ``url`` and run one of the rewriters.

        ``use_js_proxy`` selects ``self.js_proxy_content_rewriter`` over
        ``self.content_rewriter``. The accompanying cdx entry is flagged
        ``is_fuzzy`` whenever ``request_url`` is not equal to ``url``, which
        includes the default ``request_url=None``. The head-insert callback
        handed to the rewriter always produces an empty string.

        :param headers: forwarded to ``_create_response_record``
        :param content: forwarded to ``_create_response_record``
        :param ts: timestamp used for the replay url and the cdx entry
        :param url: url of the record and of the cdx entry
        :param prefix: prefix handed to ``UrlRewriter``
        :param warc_headers: forwarded to ``_create_response_record``
        :param request_url: url used for the replay url when truthy
        :param is_live: stored verbatim as the entry's ``is_live``
        :param use_js_proxy: pick the js-proxy rewriter when truthy
        :param environ: forwarded to the rewriter as ``environ``
        :return: the result of the selected rewriter
        """
        record = self._create_response_record(url, headers, content, warc_headers)

        # Requested url wins for the replay url; the record url is the fallback.
        replay_target = request_url or url
        url_rewriter = UrlRewriter(WbUrl(ts + '/' + replay_target), prefix)

        entry = CDXObject()
        entry['url'] = url
        entry['timestamp'] = ts
        entry['urlkey'] = canonicalize(url)
        if request_url != url:
            entry['is_fuzzy'] = '1'
        entry['is_live'] = is_live

        def insert_func(rule, cdx):
            # No head insert is wanted here.
            return ''

        rewriter = (self.js_proxy_content_rewriter if use_js_proxy
                    else self.content_rewriter)

        return rewriter(record, url_rewriter, cookie_rewriter=None,
                        head_insert_func=insert_func,
                        cdx=entry,
                        environ=environ)
Example #4
0
    def rewrite_record(self, headers, content, ts, url='http://example.com/',
                       prefix='http://localhost:8080/prefix/', warc_headers=None,
                       request_url=None, is_live=None, use_js_proxy=True, environ=None):
        """Create a response record for ``url`` and run one of the rewriters.

        ``use_js_proxy`` selects ``self.js_proxy_content_rewriter`` over
        ``self.content_rewriter``. The accompanying cdx entry is flagged
        ``is_fuzzy`` whenever ``request_url`` is not equal to ``url``, which
        includes the default ``request_url=None``. The head-insert callback
        handed to the rewriter always produces an empty string.

        :param headers: forwarded to ``_create_response_record``
        :param content: forwarded to ``_create_response_record``
        :param ts: timestamp used for the replay url and the cdx entry
        :param url: url of the record and of the cdx entry
        :param prefix: prefix handed to ``UrlRewriter``
        :param warc_headers: forwarded to ``_create_response_record``
        :param request_url: url used for the replay url when truthy
        :param is_live: stored verbatim as the entry's ``is_live``
        :param use_js_proxy: pick the js-proxy rewriter when truthy
        :param environ: forwarded to the rewriter as ``environ``
        :return: the result of the selected rewriter
        """
        record = self._create_response_record(url, headers, content, warc_headers)

        # Requested url wins for the replay url; the record url is the fallback.
        replay_target = request_url or url
        url_rewriter = UrlRewriter(WbUrl(ts + '/' + replay_target), prefix)

        entry = CDXObject()
        entry['url'] = url
        entry['timestamp'] = ts
        entry['urlkey'] = canonicalize(url)
        if request_url != url:
            entry['is_fuzzy'] = '1'
        entry['is_live'] = is_live

        def insert_func(rule, cdx):
            # No head insert is wanted here.
            return ''

        rewriter = (self.js_proxy_content_rewriter if use_js_proxy
                    else self.content_rewriter)

        return rewriter(record, url_rewriter, cookie_rewriter=None,
                        head_insert_func=insert_func,
                        cdx=entry,
                        environ=environ)
Example #5
0
    def handle_timegate(self, params, timestamp):
        """Issue a HEAD request to the timegate and describe the result as cdx.

        The timegate url is built from ``self.timegate_url`` using the
        requested url and ``timestamp``. When the response is truthy and
        carries a ``Memento-Datetime`` header, the url, timestamp and load
        url are replaced by what ``_extract_location`` derives from the
        ``Location`` header (status >= 300) or the ``Content-Location``
        header (otherwise).

        :param params: request parameters; ``params['url']`` is required
        :param timestamp: requested timestamp
        :return: an iterator over a single ``CDXObject``
        :raises NotFoundException: if building headers or the HEAD request
            fails, or the memento response has a status of 400 or above
        """
        url = params['url']
        load_url = self.timegate_url.format(url=url, timestamp=timestamp)

        res = None
        try:
            headers = self._get_headers(params)
            res = self.sesh.head(load_url, headers=headers)
        except Exception:
            no_except_close(res)
            raise NotFoundException(url)

        # NOTE(review): this relies on the truthiness of the response
        # object; confirm how the session's response type defines it for
        # error statuses.
        if res and res.headers.get('Memento-Datetime'):
            status = res.status_code
            if status >= 400:
                no_except_close(res)
                raise NotFoundException(url)

            # Redirects name the memento in Location; direct answers use
            # Content-Location.
            header_name = 'Location' if status >= 300 else 'Content-Location'
            url, timestamp, load_url = self._extract_location(
                url, res.headers.get(header_name))

        cdx = CDXObject()
        cdx['urlkey'] = canonicalize(url)
        cdx['timestamp'] = timestamp
        cdx['url'] = url
        cdx['load_url'] = load_url

        if 'Referer' in headers:
            cdx['set_referrer'] = headers['Referer']

        return iter([cdx])
Example #6
0
    def create_record_iter(self, arcv_iter):
        """Yield one index entry per indexable record read from ``arcv_iter``.

        Behaviour is driven by ``self.options``: ``append_post``,
        ``include_all``, ``block_size`` (default 16384), ``surt_ordered``
        (default ``True``) and ``minimal``. Each yielded entry has had its
        record read to the end, its record info set from
        ``arcv_iter.member_info`` and its ``record`` attribute attached.

        :param arcv_iter: archive iterator providing ``iter_records``,
            ``read_to_end`` and ``member_info``
        :raises Exception: on first iteration, if both ``append_post`` and
            ``minimal`` are set
        """
        opts = self.options
        append_post = opts.get('append_post')
        include_all = opts.get('include_all')
        block_size = opts.get('block_size', 16384)
        surt_ordered = opts.get('surt_ordered', True)
        minimal = opts.get('minimal')

        if append_post and minimal:
            raise Exception('Sorry, minimal index option and ' +
                            'append POST options can not be used together')

        for record in arcv_iter.iter_records(block_size):
            # Records without a status code are dropped unless everything
            # is included or a minimal index is being built.
            if not (include_all or minimal):
                if record.status_headers.get_statuscode() == '-':
                    continue

            if record.format == 'warc':
                if not include_all:
                    # request/warcinfo records are only kept for POST merging
                    if (record.rec_type in ('request', 'warcinfo')
                            and not append_post):
                        continue
                    if record.content_type == 'application/warc-fields':
                        continue

                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
                entry = self.parse_arc_record(record)
            else:
                entry = None

            if not entry:
                continue

            if entry.get('url') and not entry.get('urlkey'):
                entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

            compute_digest = False

            if (entry.get('digest', '-') == '-' and record.rec_type
                    not in ('revisit', 'request', 'warcinfo')):
                # no digest recorded: have it computed while reading
                compute_digest = True

            elif not minimal and record.rec_type == 'request' and append_post:
                status_headers = record.status_headers
                entry['_post_query'] = extract_post_query(
                    status_headers.protocol,
                    entry.get('mime'),
                    status_headers.get_header('Content-Length'),
                    record.stream)

            arcv_iter.read_to_end(record, compute_digest)
            entry.set_rec_info(*arcv_iter.member_info)
            entry.record = record

            yield entry
Example #7
0
 def get_params(self, url, actual_url, mime='text/html'):
     """Assemble the query parameter dict for ``url``.

     :param url: requested url; also canonicalized into ``key``
     :param actual_url: stored under ``cdx_url``
     :param mime: stored under ``mime``
     :return: dict with ``url``, ``cdx_url``, ``key`` and ``mime``
     """
     return {'url': url,
             'cdx_url': actual_url,
             'key': canonicalize(url),
             'mime': mime}
Example #8
0
def create_record_iter(arcv_iter, options):
    """Yield one index entry per indexable record read from ``arcv_iter``.

    Options consulted: ``append_post``, ``include_all``, ``block_size``
    (default 16384) and ``surt_ordered`` (default ``True``). Each yielded
    entry has had its record read to the end, its record info set from
    ``arcv_iter.member_info`` and its ``record`` attribute attached.

    :param arcv_iter: archive iterator providing ``iter_records``,
        ``read_to_end`` and ``member_info``
    :param options: mapping of indexing options
    """
    append_post = options.get('append_post')
    include_all = options.get('include_all')
    block_size = options.get('block_size', 16384)

    for record in arcv_iter.iter_records(block_size):
        # Records without a status code are dropped unless all are wanted.
        if not include_all:
            if record.status_headers.get_statuscode() == '-':
                continue

        if record.format == 'warc':
            if not include_all:
                # request/warcinfo records are only kept for POST merging
                if (record.rec_type in ('request', 'warcinfo')
                        and not append_post):
                    continue
                if record.content_type == 'application/warc-fields':
                    continue

            entry = parse_warc_record(record)
        elif record.format == 'arc':
            entry = parse_arc_record(record)
        else:
            entry = None

        if not entry:
            continue

        if entry.url and not entry.key:
            entry.key = canonicalize(entry.url,
                                     options.get('surt_ordered', True))

        compute_digest = False

        if (entry.digest == '-'
                and record.rec_type not in ('revisit', 'request', 'warcinfo')):
            # no digest recorded: have it computed while reading
            compute_digest = True

        elif record.rec_type == 'request' and options.get('append_post'):
            status_headers = record.status_headers
            entry.post_query = extract_post_query(
                status_headers.protocol,
                entry.mime,
                status_headers.get_header('Content-Length'),
                record.stream)

        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
Example #9
0
    def fetch_request(self,
                      url,
                      urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      proxies=None):
        """Fetch ``url`` live (or from a local file) and rewrite its content.

        :param url: url to fetch; a ``///`` rewrite artefact (other than
            ``file:///``) and scheme-relative ``//`` urls are repaired first
        :param urlrewriter: rewriter passed to ``rewrite_content``
        :param head_insert_func: optional head-insert callback
        :param urlkey: explicit url key; computed from ``url`` when falsy
        :param env: optional mapping that receives the cdx under
            ``'pywb.cdx'``; also forwarded to ``fetch_http``
        :param req_headers: request headers forwarded to ``fetch_http``;
            a fresh empty dict is used when omitted
        :param timestamp: cdx timestamp; the current UTC time when ``None``
        :param follow_redirects: forwarded to ``fetch_http``
        :param proxies: forwarded to ``fetch_http``
        :return: the result of ``self.rewriter.rewrite_content``
        """
        # A literal ``{}`` default would be one dict shared by every call;
        # create a new one per call so nothing leaks between requests.
        if req_headers is None:
            req_headers = {}

        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if is_http(url):
            (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                       follow_redirects,
                                                       proxies)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if timestamp is None:
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

        cdx = {
            'urlkey': urlkey,
            'timestamp': timestamp,
            'original': url,
            'statuscode': status_headers.get_statuscode(),
            'mimetype': status_headers.get_header('Content-Type'),
            'is_live': True,
        }

        result = (self.rewriter.rewrite_content(
            urlrewriter,
            status_headers,
            stream,
            head_insert_func=head_insert_func,
            urlkey=urlkey,
            cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result
Example #10
0
 def __init__(self, url):
     """Derive domain and path information from the canonical form of ``url``.

     The canonical form is cut at its first ``)/``: the part before it is
     the comma-separated host key, the part after it is the path. Hosts
     with more than two labels are reduced to their first two labels for
     ``domain``. ``pathLen`` is the number of ``/``-separated path
     segments.

     :param url: url to analyse; stored unchanged as ``self.url``
     """
     self.url = url
     # Cut at the first ')/' only: a later ')/' inside the path must stay
     # part of the path instead of truncating it, and a missing separator
     # yields an empty path rather than an IndexError.
     host, _, path = canonicalize(url).partition(')/')
     labels = host.split(',')
     if len(labels) > 2:
         self.domain = ','.join(labels[:2])
     else:
         self.domain = host
     self.path = path
     self.pathLen = len(path.split('/'))
Example #11
0
    def get_expected(self, url, mime='text/html', filters=None):
        """Build the expected single-entry result list for ``url``.

        :param url: url stored in the entry and canonicalized for ``urlkey``
        :param mime: value of the entry's ``mime`` field
        :param filters: value of the entry's ``filter`` field; the set
            ``{'urlkey:'}`` is used when falsy
        :return: list containing one dict
        """
        if not filters:
            filters = {'urlkey:'}

        expected = {'filter': filters,
                    'is_fuzzy': '1',
                    'urlkey': canonicalize(url),
                    'source': 'source',
                    'source-coll': 'source',
                    'url': url,
                    'mime': mime}

        return [expected]
    def render_content(self, wbrequest):
        """Replay ``wbrequest`` by POSTing the reconstructed request upstream.

        Requests with the ``vi_`` modifier are delegated to
        ``_get_video_info``. Otherwise the original request is rebuilt,
        posted to the formatted ``self.upstream_url``, and the returned
        record is parsed and passed through the content rewriter.

        :param wbrequest: the incoming replay request
        :return: the response built by ``_make_response``
        :raises requests.HTTPError: via ``raise_for_status`` when the
            upstream answers with an error status
        """
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

        urlkey = canonicalize(wbrequest.wb_url.url)
        url = wbrequest.wb_url.url

        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # requests only accepts str/bytes header values, so the length has
        # to be sent as text rather than as an int.
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wbrequest.wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wbrequest.wb_url.timestamp

        upstream_url = self.upstream_url.format(url=quote(url),
                                                closest=closest,
                                                **wbrequest.matchdict)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True,
                          allow_redirects=False)

        r.raise_for_status()

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
                                               record.status_headers,
                                               record.stream,
                                               head_insert_func,
                                               urlkey,
                                               cdx)

        # The rewrite result is forwarded as-is; its parts are not needed
        # individually here.
        return self._make_response(wbrequest, *result)
Example #13
0
def create_record_iter(arcv_iter, options):
    """Yield one index entry per indexable record read from ``arcv_iter``.

    Options consulted: ``append_post``, ``include_all``, ``block_size``
    (default 16384) and ``surt_ordered`` (default ``True``). Each yielded
    entry has had its record read to the end, its record info set from
    ``arcv_iter.member_info`` and its ``record`` attribute attached.

    :param arcv_iter: archive iterator providing ``iter_records``,
        ``read_to_end`` and ``member_info``
    :param options: mapping of indexing options
    """
    append_post = options.get('append_post')
    include_all = options.get('include_all')
    block_size = options.get('block_size', 16384)

    for record in arcv_iter.iter_records(block_size):
        # Records without a status code are dropped unless all are wanted.
        if not include_all:
            if record.status_headers.get_statuscode() == '-':
                continue

        if record.format == 'warc':
            if not include_all:
                # request/warcinfo records are only kept for POST merging
                if (record.rec_type in ('request', 'warcinfo')
                        and not append_post):
                    continue
                if record.content_type == 'application/warc-fields':
                    continue

            entry = parse_warc_record(record)
        elif record.format == 'arc':
            entry = parse_arc_record(record)
        else:
            entry = None

        if not entry:
            continue

        if entry.url and not entry.key:
            entry.key = canonicalize(entry.url,
                                     options.get('surt_ordered', True))

        compute_digest = False

        if (entry.digest == '-'
                and record.rec_type not in ('revisit', 'request', 'warcinfo')):
            # no digest recorded: have it computed while reading
            compute_digest = True

        elif record.rec_type == 'request' and options.get('append_post'):
            status_headers = record.status_headers
            entry.post_query = extract_post_query(
                status_headers.protocol,
                entry.mime,
                status_headers.get_header('Content-Length'),
                record.stream)

        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
Example #14
0
    def fetch_request(self, url, urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers=None,
                      timestamp=None,
                      follow_redirects=False,
                      proxies=None):
        """Fetch ``url`` live (or from a local file) and rewrite its content.

        :param url: url to fetch; a ``///`` rewrite artefact (other than
            ``file:///``) and scheme-relative ``//`` urls are repaired first
        :param urlrewriter: rewriter passed to ``rewrite_content``
        :param head_insert_func: optional head-insert callback
        :param urlkey: explicit url key; computed from ``url`` when falsy
        :param env: optional mapping that receives the cdx under
            ``'pywb.cdx'``; also forwarded to ``fetch_http``
        :param req_headers: request headers forwarded to ``fetch_http``;
            a fresh empty dict is used when omitted
        :param timestamp: cdx timestamp; the current UTC time when ``None``
        :param follow_redirects: forwarded to ``fetch_http``
        :param proxies: forwarded to ``fetch_http``
        :return: the result of ``self.rewriter.rewrite_content``
        """
        # A literal ``{}`` default would be one dict shared by every call;
        # create a new one per call so nothing leaks between requests.
        if req_headers is None:
            req_headers = {}

        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if is_http(url):
            (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                       follow_redirects,
                                                       proxies)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if timestamp is None:
            timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

        cdx = {'urlkey': urlkey,
               'timestamp': timestamp,
               'original': url,
               'statuscode': status_headers.get_statuscode(),
               'mimetype': status_headers.get_header('Content-Type'),
               'is_live': True,
              }

        result = (self.rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers,
                                  stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=urlkey,
                                  cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result
Example #15
0
 def canonize_grounding(instance):
     """Yield ``instance`` with lower-cased and canonicalized grounding urls.

     Instances without a ``grounding_urls`` key are yielded untouched.
     Otherwise ``grounding_urls`` is replaced by its lower-cased form and
     ``grounding_canonical_urls`` is added. If that processing fails the
     error is logged and nothing is yielded.

     :param instance: mapping that may contain ``grounding_urls``
     """
     if "grounding_urls" not in instance:
         yield instance
         return

     try:
         lowered = [url.lower() for url in instance["grounding_urls"]]
         instance["grounding_urls"] = lowered
         instance["grounding_canonical_urls"] = [
             canonicalize(url) for url in lowered
         ]
     except Exception as e:
         # A failing instance is dropped from the output entirely.
         logging.error("Could not canonize grounding URLs: " + str(e))
         return

     yield instance
 def convert_to_cdx(self, item, urlkey, url):
     """Translate one result ``item`` into a ``CDXObject`` for ``url``.

     :param item: result element read through ``gettext``
     :param urlkey: accepted but not used; the key is recomputed from ``url``
     :param url: url stored in the entry and canonicalized for ``urlkey``
     :return: the populated ``CDXObject``
     """
     fields = (
         ('urlkey', canonicalize(url)),
         # only the first 14 characters of the timestamp are kept
         ('timestamp', gettext(item, 'tstamp')[:14]),
         ('url', url),
         ('mime', gettext(item, 'primaryType') + '/' + gettext(item, 'subType')),
         ('status', '-'),
         ('digest', gettext(item, 'digest')),
         # the item's contentLength is deliberately not used
         ('length', '-'),
         ('offset', gettext(item, 'arcoffset')),
         ('filename', gettext(item, 'arcname') + '.arc.gz'),
     )

     cdx = CDXObject()
     for name, value in fields:
         cdx[name] = value
     return cdx
Example #17
0
    def load_index(self, params):
        """Return a one-entry index for non-exact lookups.

        :param params: query parameters; reads ``matchType`` (default
            ``'exact'``), ``cdx_url``, ``mime`` and ``filter``
        :return: an empty iterator for exact matches, otherwise an
            iterator over a single dict built from ``params``
        """
        match_type = params.get('matchType', 'exact')
        if match_type == 'exact':
            # return nothing for exact match to force fuzzy
            return iter([])

        entry = {
            'urlkey': canonicalize(params.get('cdx_url')),
            'mime': params.get('mime'),
            'filter': params.get('filter'),
            'url': params.get('cdx_url'),
        }
        return iter([entry])
Example #18
0
    def memento_to_cdx(self, url, mem_iter, limit, skip_exclude=True):
        """Yield cdx dicts for up to ``limit`` items of ``mem_iter``.

        Each item is either a single-element sequence holding a memento or
        a five-element sequence ``(mem, next, prev, first, last)``. One
        dict is yielded per source url of the memento.

        :param url: requested url; canonicalized once for ``urlkey``
        :param mem_iter: iterable of memento sequences
        :param limit: maximum number of items taken from ``mem_iter``
        :param skip_exclude: when true, source urls starting with
            ``EXCLUDE_LIST`` are skipped; otherwise they are emitted with
            ``excluded`` set
        """
        key = canonicalize(url)
        if url.endswith('/'):
            key += '/'

        for mems, _ in itertools.izip(mem_iter, xrange(0, limit)):
            has_neighbours = len(mems) > 1
            if has_neighbours:
                mem, next_, prev_, first_, last_ = mems
            else:
                mem = mems[0]

            if isinstance(mem.url, list):
                mem_list = mem.url
                count = len(mem_list)
                if count > 1:
                    mem_list = self.sort_archives(mem_list)
            else:
                mem_list = [mem.url]
                count = 1

            # NOTE(review): the flag is reset per memento, not per source
            # url, so it stays set for later urls of the same memento --
            # confirm this is intended.
            excluded = False

            for mem_url in mem_list:
                mem_url = mem_url.encode('utf-8')
                # handle scheme relative urls
                if mem_url.startswith('//'):
                    mem_url = 'http:' + mem_url

                if mem_url.startswith(EXCLUDE_LIST):
                    if skip_exclude:
                        continue
                    excluded = True

                cdx = {
                    'urlkey': key,
                    'timestamp': mem.ts,
                    'url': url,
                    'src_url': mem_url,
                    'sec': mem.sec,
                    'src_host': urlsplit(mem_url).netloc,
                    'excluded': excluded,
                    'dupes': count,
                }

                if has_neighbours:
                    cdx['first'] = first_.ts if first_ else ''
                    cdx['last'] = last_.ts if last_ else ''
                    cdx['next'] = next_.ts if next_ else ''
                    cdx['prev'] = prev_.ts if prev_ else ''
                yield cdx
    def memento_to_cdx(self, url, mem_iter, limit, skip_exclude=True):
        """Yield cdx dicts for up to ``limit`` items of ``mem_iter``.

        Each item is either a single-element sequence holding a memento or
        a five-element sequence ``(mem, next, prev, first, last)``. One
        dict is yielded per source url of the memento.

        :param url: requested url; canonicalized once for ``urlkey``
        :param mem_iter: iterable of memento sequences
        :param limit: maximum number of items taken from ``mem_iter``
        :param skip_exclude: when true, source urls starting with
            ``EXCLUDE_LIST`` are skipped; otherwise they are emitted with
            ``excluded`` set
        """
        key = canonicalize(url)
        if url.endswith("/"):
            key += "/"

        for mems, _ in itertools.izip(mem_iter, xrange(0, limit)):
            has_neighbours = len(mems) > 1
            if has_neighbours:
                mem, next_, prev_, first_, last_ = mems
            else:
                mem = mems[0]

            if isinstance(mem.url, list):
                mem_list = mem.url
                count = len(mem_list)
                if count > 1:
                    mem_list = self.sort_archives(mem_list)
            else:
                mem_list = [mem.url]
                count = 1

            # NOTE(review): the flag is reset per memento, not per source
            # url, so it stays set for later urls of the same memento --
            # confirm this is intended.
            excluded = False

            for mem_url in mem_list:
                mem_url = mem_url.encode("utf-8")
                # handle scheme relative urls
                if mem_url.startswith("//"):
                    mem_url = "http:" + mem_url

                if mem_url.startswith(EXCLUDE_LIST):
                    if skip_exclude:
                        continue
                    excluded = True

                cdx = {
                    "urlkey": key,
                    "timestamp": mem.ts,
                    "url": url,
                    "src_url": mem_url,
                    "sec": mem.sec,
                    "src_host": urlsplit(mem_url).netloc,
                    "excluded": excluded,
                    "dupes": count,
                }

                if has_neighbours:
                    cdx["first"] = first_.ts if first_ else ""
                    cdx["last"] = last_.ts if last_ else ""
                    cdx["next"] = next_.ts if next_ else ""
                    cdx["prev"] = prev_.ts if prev_ else ""
                yield cdx
Example #20
0
 def convert_to_cdx(self, item, urlkey, url):
     """Translate one result ``item`` into a ``CDXObject`` for ``url``.

     :param item: result element read through ``gettext``
     :param urlkey: accepted but not used; the key is recomputed from ``url``
     :param url: url stored in the entry and canonicalized for ``urlkey``
     :return: the populated ``CDXObject``
     """
     fields = (
         ('urlkey', canonicalize(url)),
         # only the first 14 characters of the timestamp are kept
         ('timestamp', gettext(item, 'tstamp')[:14]),
         ('url', url),
         ('mime', gettext(item, 'primaryType') + '/' + gettext(item, 'subType')),
         ('status', '-'),
         ('digest', gettext(item, 'digest')),
         # the item's contentLength is deliberately not used
         ('length', '-'),
         ('offset', gettext(item, 'arcoffset')),
         ('filename', gettext(item, 'arcname') + '.arc.gz'),
     )

     cdx = CDXObject()
     for name, value in fields:
         cdx[name] = value
     return cdx
Example #21
0
    def convert_line(self, line, url):
        """Turn a tab-separated index line into a ``CDXObject`` for ``url``.

        :param line: text of the form ``timestamp<TAB>mime<TAB>filename``
        :param url: url stored as ``original`` and canonicalized for
            ``urlkey``
        :return: the populated ``CDXObject``
        :raises ValueError: if ``line`` does not split into exactly three
            tab-separated parts
        """
        timestamp, mime, filename = line.split('\t')

        fields = (
            ('urlkey', canonicalize(url)),
            ('timestamp', timestamp),
            ('original', url),
            ('mimetype', mime),
            # the remaining fields are fixed placeholder values
            ('statuscode', '200'),
            ('digest', '-'),
            ('length', '-1'),
            ('offset', '0'),
            ('filename', filename),
        )

        cdx = CDXObject()
        for name, value in fields:
            cdx[name] = value
        return cdx
Example #22
0
    def load_cdx(self, **params):
        """Query the opensearch endpoint and return the matching cdx entries.

        :param params: query parameters; reads ``closest``, ``output``
            (default ``'text'``), ``url`` and, for json output, ``fl``
        :return: iterator over the sorted entries, rendered as text lines
            for ``output='text'`` or via ``to_json`` for ``output='json'``
        :raises WbException: if the HTTP request or reading its body fails
        :raises NotFoundException: if the response contains no items
        """
        closest = params.get('closest')

        self.check_url(params)

        if closest:
            query = self._get_closest_query(params)
        else:
            query = self._get_timemap_query(params)

        query = quote_plus(query) + self.CLOSEST_QUERY_FIXED
        full_url = self.opensearch_query + '?query=' + query
        print('QUERY', full_url)

        output = params.get('output', 'text')
        url = params.get('url')
        urlkey = canonicalize(url)

        try:
            response = requests.get(full_url, stream=True)
            buff = response.raw.read()
            response.raw.close()
        except Exception as e:
            import traceback
            # print_exc() reports the exception being handled; its first
            # positional argument is a frame limit, so the exception must
            # not be passed to it.
            traceback.print_exc()
            raise WbException(e)

        results = etree.fromstring(buff)

        items = results.find('channel').findall('item')

        cdx_list = [self.convert_to_cdx(item, urlkey, url) for item in items]

        if not cdx_list:
            raise NotFoundException('url {0} not found'.format(url))

        # Without an explicit target, sort relative to the earliest date.
        if closest:
            cdx_list = cdx_sort_closest(closest, cdx_list, limit=10000)
        else:
            cdx_list = cdx_sort_closest(EARLIEST_DATE, cdx_list, limit=10000)

        if output == 'text':
            cdx_list = [str(cdx) + '\n' for cdx in cdx_list]
        elif output == 'json':
            fields = params.get('fl', '').split(',')
            cdx_list = [cdx.to_json(fields) for cdx in cdx_list]

        return iter(cdx_list)
Example #23
0
    def load_cdx(self, **params):
        """Query the opensearch endpoint and return the matching cdx entries.

        :param params: query parameters; reads ``closest``, ``output``
            (default ``'text'``), ``url`` and, for json output, ``fl``
        :return: iterator over the sorted entries, rendered as text lines
            for ``output='text'`` or via ``to_json`` for ``output='json'``
        :raises WbException: if the HTTP request or reading its body fails
        :raises NotFoundException: if the response contains no items
        """
        closest = params.get('closest')

        self.check_url(params)

        if closest:
            query = self._get_closest_query(params)
        else:
            query = self._get_timemap_query(params)

        query = quote_plus(query) + self.CLOSEST_QUERY_FIXED
        full_url = self.opensearch_query + '?query=' + query
        print('QUERY', full_url)

        output = params.get('output', 'text')
        url = params.get('url')
        urlkey = canonicalize(url)

        try:
            response = requests.get(full_url, stream=True)
            buff = response.raw.read()
            response.raw.close()
        except Exception as e:
            import traceback
            # print_exc() reports the exception being handled; its first
            # positional argument is a frame limit, so the exception must
            # not be passed to it.
            traceback.print_exc()
            raise WbException(e)

        results = etree.fromstring(buff)

        items = results.find('channel').findall('item')

        cdx_list = [self.convert_to_cdx(item, urlkey, url) for item in items]

        if not cdx_list:
            raise NotFoundException('url {0} not found'.format(url))

        # Without an explicit target, sort relative to the earliest date.
        if closest:
            cdx_list = cdx_sort_closest(closest, cdx_list, limit=10000)
        else:
            cdx_list = cdx_sort_closest(EARLIEST_DATE, cdx_list, limit=10000)

        if output == 'text':
            cdx_list = [str(cdx) + '\n' for cdx in cdx_list]
        elif output == 'json':
            fields = params.get('fl', '').split(',')
            cdx_list = [cdx.to_json(fields) for cdx in cdx_list]

        return iter(cdx_list)
Example #24
0
    def get_html(self, url, closest_datetime_str=None):
        """Fetch html for ``url`` from the records listed in the meta index.

        Only records whose ``status`` is ``"200"`` are considered. They are
        ordered by ``sort_by_closest`` when there is more than one record,
        a reference time is given and that call reports success; otherwise
        they are ordered by ``filename``, descending.

        :param url: url to look up; canonicalized for the index lookup
        :param closest_datetime_str: optional reference time string
        :return: ``None`` when the index has no records for ``url``,
            otherwise the result of ``get_first_or_none``
        """
        raw_metas = self.meta_index.get(canonicalize(url))
        if not raw_metas:
            return None

        parsed = [json.loads(raw) for raw in raw_metas]
        metas = [meta for meta in parsed if meta['status'] == "200"]

        sorted_by_time = (len(metas) > 1 and closest_datetime_str and
                          sort_by_closest(metas,
                                          parse_date(closest_datetime_str)))
        if not sorted_by_time:
            # fall back to ordering by filename, descending
            metas = sorted(metas, key=lambda m: m['filename'], reverse=True)

        return get_first_or_none(
            map(CommonCrawlS3.fetch_html_from_s3_file, metas))
Example #25
0
    def to_key(self, url_or_surt, exact_match=False):
        """Convert a url or SURT into an acl key.

        Input matched by ``SURT_RX`` is used unchanged; anything else is
        canonicalized. The exact match suffix is appended on request.

        :param str url_or_surt: The url or surt to be converted to an acl key
        :param bool exact_match: Should the exact match suffix be added to key
        :rtype: str
        """
        already_surt = self.SURT_RX.search(url_or_surt)
        key = url_or_surt if already_surt else canonicalize(url_or_surt)

        if not exact_match:
            return key

        return key + AccessChecker.EXACT_SUFFIX
Example #26
0
    def convert_to_cdxj(self):
        """Rewrite every file from ``iter_cdx_files`` as a ``...j`` file.

        Each input line that does not start with ``' CDX'`` is parsed, has
        its url key recomputed from its original url, and is written
        through a ``CDXJ`` writer. Output goes to a ``.tmp`` file first,
        which is then moved to the final name; the input file is removed
        afterwards.
        """
        cdxj_writer = CDXJ()
        for filename in self.iter_cdx_files():
            outfile = filename + 'j'
            tmp_name = outfile + '.tmp'

            print('Converting {0} -> {1}'.format(filename, outfile))

            with open(tmp_name, 'w+b') as out, open(filename) as fh:
                for line in fh:
                    # lines starting with ' CDX' are not data lines
                    if not line.startswith(' CDX'):
                        cdx = CDXObject(line)
                        cdx[URLKEY] = canonicalize(cdx[ORIGINAL])
                        cdxj_writer.write_cdx_line(out, cdx, cdx['filename'])

            shutil.move(tmp_name, outfile)
            os.remove(filename)
Example #27
0
 def convert_to_cdx(self, item, urlkey, url):
     """Translate one result ``item`` into a ``CDXObject``.

     The ``url`` attribute of the last ``source`` element found under
     ``item`` replaces the ``url`` argument, when such an element exists.

     :param item: result element; iterated and read through ``gettext``
     :param urlkey: accepted but not used; the key is recomputed from the
         final url
     :param url: fallback url when ``item`` has no ``source`` element
     :return: the populated ``CDXObject``
     """
     cdx = CDXObject()
     cdx['timestamp'] = gettext(item, 'tstamp')[:14]

     source_urls = [elem.attrib['url'] for elem in item.iter()
                    if elem.tag == "source"]
     if source_urls:
         url = source_urls[-1]

     cdx['url'] = url
     cdx['urlkey'] = canonicalize(url)

     primary_type = gettext(item, 'primaryType')
     sub_type = gettext(item, 'subType')
     cdx['mime'] = primary_type + '/' + sub_type
     cdx['status'] = '-'
     cdx['digest'] = gettext(item, 'digest')
     # the item's contentLength is deliberately not used
     cdx['length'] = '-'
     cdx['offset'] = gettext(item, 'arcoffset')
     cdx['filename'] = gettext(item, 'arcname') + '.arc.gz'

     return cdx
Example #28
0
def transform_cdx(cdx_path):
    """Print every line of a space-separated CDX file in CDXJ form.

    Each output line is ``<canonicalized key> <field 1> <json>``, where
    the JSON object holds the url, mime, status, digest, offset and
    filename fields taken by position from the input line.  The length
    is always written as ``'0'``.

    If building an output line raises ``ValueError``, ``Header`` is
    printed for that line instead.

    :param str cdx_path: path of the CDX file to read
    """
    with open(cdx_path, mode="r") as cdxfile:
        for line in cdxfile:
            record_dict = defaultdict()
            record_list = line.split(' ')

            # build dict
            record_dict['url'] = record_list[2]
            record_dict['mime'] = record_list[3]
            record_dict['status'] = record_list[4]
            record_dict['digest'] = record_list[5]
            record_dict['length'] = '0'
            record_dict['offset'] = record_list[7]
            record_dict['filename'] = record_list[8].replace('\n', '')
            try:
                # single-argument print(...) behaves the same on
                # Python 2 and Python 3
                print("{} {} {}".format(
                    canonicalize(record_list[0], surt_ordered=True),
                    record_list[1], json.dumps(record_dict)))
            except ValueError:
                print("Header")
Example #29
0
    def merge_request_data(self, other, options):
        """Merge data from a request entry ``other`` into this entry.

        :param other: entry expected to wrap a ``request`` record
        :param dict options: ``surt_ordered`` (default ``True``) is passed
            to ``canonicalize`` when a POST query is merged
        :return: ``False`` if ``other`` is not a request or this entry is
            itself a request, ``True`` otherwise
        :rtype: bool
        """
        use_surt = options.get('surt_ordered', True)

        # only a (non-request, request) pair can be merged
        other_is_request = other.record.rec_type == 'request'
        if not other_is_request or self.record.rec_type == 'request':
            return False

        # a POST/PUT body query becomes part of the key of both entries
        if hasattr(other, 'post_query'):
            merged_url = append_post_query(self.url, other.post_query)
            self.key = canonicalize(merged_url, use_surt)
            other.key = self.key

        referer_value = other.record.status_headers.get_header('referer')
        if referer_value:
            self.referer = referer_value

        return True
Example #30
0
    def merge_request_data(self, other, options):
        """Merge data from a request entry ``other`` into this entry.

        :param other: entry expected to wrap a ``request`` record
        :param dict options: ``surt_ordered`` (default ``True``) is passed
            to ``canonicalize`` when a POST query is merged
        :return: ``False`` if ``other`` is not a request or this entry is
            itself a request, ``True`` otherwise
        :rtype: bool
        """
        use_surt = options.get('surt_ordered', True)

        # only a (non-request, request) pair can be merged
        other_is_request = other.record.rec_type == 'request'
        if not other_is_request or self.record.rec_type == 'request':
            return False

        # a POST/PUT body query becomes part of the key of both entries
        if hasattr(other, 'post_query'):
            merged_url = append_post_query(self.url, other.post_query)
            self.key = canonicalize(merged_url, use_surt)
            other.key = self.key

        referer_value = other.record.status_headers.get_header('referer')
        if referer_value:
            self.referer = referer_value

        return True
Example #31
0
    def links_to_cdxobject(self, link_header, def_name):
        """Yield one CDXObject per memento parsed from ``link_header``.

        The header is parsed with ``MementoUtils.parse_links``.  Every
        yielded entry shares the url and url key of the ``original`` link
        and carries the memento's timestamp, ``rel`` value (``''`` if
        absent), memento url and the replay url to load it from.
        """
        parsed = MementoUtils.parse_links(link_header, def_name)

        original_url = parsed['original']['url']
        urlkey = canonicalize(original_url)

        for memento in parsed['mementos']:
            timestamp = http_date_to_timestamp(memento['datetime'])

            entry = CDXObject()
            entry['urlkey'] = urlkey
            entry['timestamp'] = timestamp
            entry['url'] = original_url
            entry['mem_rel'] = memento.get('rel', '')
            entry['memento_url'] = memento['url']
            entry['load_url'] = self._get_replay_url(entry['timestamp'],
                                                     original_url)
            yield entry
Example #32
0
    def merge_request_data(self, other, options):
        """Merge data from a request entry ``other`` into this entry.

        :param other: entry expected to wrap a ``request`` record
        :param dict options: ``surt_ordered`` (default ``True``) is passed
            to ``canonicalize`` when a POST query is merged
        :return: ``False`` if ``other`` is not a request or this entry is
            itself a request, ``True`` otherwise
        :rtype: bool
        """
        use_surt = options.get('surt_ordered', True)

        # only a (non-request, request) pair can be merged
        other_is_request = other.record.rec_type == 'request'
        if not other_is_request or self.record.rec_type == 'request':
            return False

        # a POST/PUT body query becomes part of the key of both entries
        body_query = other.get('_post_query')
        if body_query:
            merged_url = append_post_query(self['url'], body_query)
            self['urlkey'] = canonicalize(merged_url, use_surt)
            other['urlkey'] = self['urlkey']

        referer_value = other.record.status_headers.get_header('referer')
        if referer_value:
            self['_referer'] = referer_value

        return True
Example #33
0
    def merge_request_data(self, other, options):
        """Merge data from a request entry ``other`` into this entry.

        :param other: entry expected to wrap a ``request`` record
        :param dict options: ``surt_ordered`` (default ``True``) is passed
            to ``canonicalize`` when a POST query is merged
        :return: ``False`` if ``other`` is not a request or this entry is
            itself a request, ``True`` otherwise
        :rtype: bool
        """
        use_surt = options.get('surt_ordered', True)

        # only a (non-request, request) pair can be merged
        other_is_request = other.record.rec_type == 'request'
        if not other_is_request or self.record.rec_type == 'request':
            return False

        # a POST/PUT body query becomes part of the key of both entries
        body_query = other.get('_post_query')
        if body_query:
            merged_url = append_post_query(self['url'], body_query)
            self['urlkey'] = canonicalize(merged_url, use_surt)
            other['urlkey'] = self['urlkey']

        referer_value = other.record.status_headers.get_header('referer')
        if referer_value:
            self['_referer'] = referer_value

        return True
Example #34
0
    def raise_on_self_redirect(self, params, cdx, status_code, location_url):
        """
        Reject a capture whose 3xx response redirects to the requested url,
        so that replaying it can not cause a redirect loop.

        Live captures, non-3xx and 304 responses, and responses without
        a location are ignored.  Urls are compared lowercased, without
        scheme and without trailing slashes.  On a self-redirect the
        capture's url key is stored in ``params['sr-urlkey']`` and
        ``LiveResourceException`` is raised.
        """
        if cdx.get('is_live'):
            return

        is_redirect = status_code.startswith('3') and status_code != '304'
        if not is_redirect:
            return

        request_url = params['url'].lower()
        if not location_url:
            return

        target = location_url.lower()
        if target.startswith('/'):
            # host-relative location: prefix with the capture's host
            target = urlsplit(cdx['url']).netloc + target

        target = target.split('://', 1)[-1].rstrip('/')
        requested = request_url.split('://', 1)[-1].rstrip('/')

        if requested == target:
            is_self_redirect = True
        else:
            # if new location canonicalized matches old key, also self-redirect
            prev_key = params.get('sr-urlkey')
            is_self_redirect = (bool(prev_key) and
                                canonicalize(target) == prev_key)

        if is_self_redirect:
            message = 'Self Redirect {0} -> {1}'.format(requested, target)
            params['sr-urlkey'] = cdx['urlkey']
            raise LiveResourceException(message)
Example #35
0
    def raise_on_self_redirect(self, params, cdx, status_code, location_url):
        """
        Reject a capture whose 3xx response redirects to the requested url,
        so that replaying it can not cause a redirect loop.

        Live captures, non-3xx and 304 responses, and responses without
        a location are ignored.  Urls are compared lowercased, without
        scheme and without trailing slashes.  A location whose
        canonicalized form equals the original key (``params['sr-urlkey']``
        if set, else the capture's url key) also counts as a self-redirect.
        On a self-redirect that key is stored in ``params['sr-urlkey']``
        and ``LiveResourceException`` is raised.
        """
        if cdx.get('is_live'):
            return

        is_redirect = status_code.startswith('3') and status_code != '304'
        if not is_redirect:
            return

        request_url = params['url'].lower()
        if not location_url:
            return

        target = location_url.lower()
        if target.startswith('/'):
            # host-relative location: prefix with the capture's host
            target = urlsplit(cdx['url']).netloc + target

        target = target.split('://', 1)[-1].rstrip('/')
        requested = request_url.split('://', 1)[-1].rstrip('/')

        orig_key = params.get('sr-urlkey') or cdx['urlkey']

        # same url, or new location canonicalized matches the old key
        is_self_redirect = (requested == target or
                            canonicalize(target) == orig_key)

        if is_self_redirect:
            message = 'Self Redirect {0} -> {1}'.format(requested, target)
            params['sr-urlkey'] = orig_key
            raise LiveResourceException(message)
Example #36
0
    def merge_request_data(self, other, options):
        """Merge data from a request entry ``other`` into this entry.

        If ``other`` carries a ``_post_query`` and appending it changes
        this entry's url, both entries get the url key of the combined
        url.  The request's ``referer`` header, if any, is stored as
        ``_referer``.

        :param other: entry expected to wrap a ``request`` record
        :param dict options: ``surt_ordered`` (default ``True``) is passed
            to ``canonicalize``
        :return: ``False`` if ``other`` is not a request or this entry is
            itself a request, ``True`` otherwise
        :rtype: bool
        """
        surt_ordered = options.get('surt_ordered', True)

        if other.record.rec_type != 'request':
            return False

        # two requests, not correct
        if self.record.rec_type == 'request':
            return False

        # merge POST/PUT body query
        post_query = other.get('_post_query')
        # check for a missing query *before* calling into it, so a request
        # entry without '_post_query' no longer fails on None
        if post_query:
            url = self['url']
            new_url = post_query.append_query(url)
            new_url = new_url.replace('WB_wombat_', '')
            if new_url != url:
                self['urlkey'] = canonicalize(new_url, surt_ordered)
                other['urlkey'] = self['urlkey']

        referer = other.record.http_headers.get_header('referer')
        if referer:
            self['_referer'] = referer

        return True
Example #37
0
    def create_record_iter(self, arcv_iter):
        """Yield an index entry for each indexable record of ``arcv_iter``.

        Behaviour is driven by ``self.options``: ``append_post``,
        ``include_all``, ``block_size`` (default 16384), ``surt_ordered``
        (default ``True``) and ``minimal``.  Records are parsed into
        entries, given a url key if they lack one, and their payload is
        read through ``self.handle_payload`` before the entry is yielded.

        :raises Exception: on first iteration, if both ``append_post``
            and ``minimal`` are set
        """
        opts = self.options
        append_post = opts.get('append_post')
        include_all = opts.get('include_all')
        block_size = opts.get('block_size', 16384)
        surt_ordered = opts.get('surt_ordered', True)
        minimal = opts.get('minimal')

        if append_post and minimal:
            raise Exception('Sorry, minimal index option and ' +
                            'append POST options can not be used together')

        for record in arcv_iter.iter_records(block_size):
            # records without a status code are only kept on request
            if (not include_all and not minimal and
                    record.status_headers.get_statuscode() == '-'):
                continue

            if record.format == 'warc':
                is_meta = record.rec_type in ('request', 'warcinfo')
                if is_meta and not include_all and not append_post:
                    continue

                if (not include_all and
                        record.content_type == 'application/warc-fields'):
                    continue

                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
                entry = self.parse_arc_record(record)
            else:
                entry = None

            if not entry:
                continue

            if entry.get('url') and not entry.get('urlkey'):
                entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

            compute_digest = (
                entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo'))

            if (not compute_digest and not minimal and
                    record.rec_type == 'request' and append_post):
                http_method = record.status_headers.protocol
                content_length = record.status_headers.get_header(
                    'Content-Length')

                entry['_post_query'] = extract_post_query(http_method,
                                                          entry.get('mime'),
                                                          content_length,
                                                          record.stream)

            entry.record = record

            self.begin_payload(compute_digest, entry)
            arcv_iter.read_to_end(record, self.handle_payload)

            entry.set_rec_info(*arcv_iter.member_info)
            self.end_payload(entry)

            yield entry
Example #38
0
    def create_record_iter(self, raw_iter):
        """Yield an index entry for each indexable record of ``raw_iter``.

        Behaviour is driven by ``self.options``: ``append_post``,
        ``include_all``, ``surt_ordered`` (default ``True``) and
        ``minimal``.  Records are parsed into entries, given a url key if
        they lack one, and their payload is fed to ``self.handle_payload``
        in ``BUFF_SIZE`` chunks before the entry is yielded.

        :raises Exception: on first iteration, if both ``append_post``
            and ``minimal`` are set
        """
        opts = self.options
        append_post = opts.get('append_post')
        include_all = opts.get('include_all')
        surt_ordered = opts.get('surt_ordered', True)
        minimal = opts.get('minimal')

        if append_post and minimal:
            raise Exception('Sorry, minimal index option and ' +
                            'append POST options can not be used together')

        for record in raw_iter:
            # records without a status code are only kept on request
            if (not include_all and not minimal and
                    record.http_headers.get_statuscode() == '-'):
                continue

            if record.rec_type == 'arc_header':
                continue

            if record.format == 'warc':
                is_meta = record.rec_type in ('request', 'warcinfo')
                if is_meta and not include_all and not append_post:
                    continue

                if (not include_all and
                        record.content_type == 'application/warc-fields'):
                    continue

                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
                entry = self.parse_arc_record(record)
            else:
                entry = None

            if not entry:
                continue

            if entry.get('url') and not entry.get('urlkey'):
                entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

            compute_digest = (
                entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo'))

            if (not compute_digest and not minimal and
                    record.rec_type == 'request' and append_post):
                http_method = record.http_headers.protocol
                content_length = record.http_headers.get_header(
                    'Content-Length')

                entry['_post_query'] = MethodQueryCanonicalizer(
                    http_method,
                    entry.get('_content_type'),
                    content_length,
                    record.raw_stream)

            entry.record = record

            self.begin_payload(compute_digest, entry)

            # drain the record body chunk by chunk
            chunk = record.raw_stream.read(BUFF_SIZE)
            while chunk:
                self.handle_payload(chunk)
                chunk = record.raw_stream.read(BUFF_SIZE)

            raw_iter.read_to_end(record)

            entry.set_rec_info(*raw_iter.member_info)
            self.end_payload(entry)

            yield entry
Example #39
0
def test_memento_to_cdx(url, mem):
    """Yield ``<urlkey> <timestamp> <url> <target>`` for each pair in ``mem``.

    ``mem`` is an iterable of ``(timestamp, target)`` pairs; the url key
    is the canonicalized form of ``url``.
    """
    urlkey = canonicalize(url)
    for timestamp, target_url in mem:
        yield ' '.join((urlkey, timestamp, url, target_url))
Example #40
0
 def get_url_key_p(ts, url):
     """Return ``<ts>/<canonicalized url>`` with a trailing slash.

     The url is canonicalized with ``canonicalize(url, False)``; a ``/``
     is appended only when ``url`` itself does not already end with one.
     """
     base = ts + '/' + canonicalize(url, False)
     return base if url.endswith('/') else base + '/'
Example #41
0
    def process_record(self, record, flow):
        """Rewrite a fetched record for the proxied ``flow``.

        Without a content rewriter the record's headers and raw stream
        are returned unmodified.  Otherwise the content is rewritten, the
        ``Content-Security-Policy`` header is removed, and the response
        is either passed through (positive ``content-length``), marked as
        chunked (HTTP/1.1) or buffered.

        :param record: parsed record providing ``http_headers`` and
            ``raw_stream``
        :param flow: proxy flow providing the request, response headers
            and ``extra_data``
        :return: tuple of ``(status_headers, body)``
        """
        headers = flow.response.headers
        url = flow.request.req_url
        scheme = flow.request.req_scheme

        if not self.content_rewriter:
            return record.http_headers, StreamIO(record.raw_stream)

        cookie_rewriter = None

        template_params = flow.extra_data

        environ = {
            'pywb_proxy_magic': self.proxy_magic,
            'webrec.template_params': template_params
        }

        wb_url = WbUrl(url)
        wb_prefix = ''
        host_prefix = scheme + '://' + self.proxy_magic
        urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

        if flow.request.headers.get('X-Requested-With',
                                    '').lower() == 'xmlhttprequest':
            urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, wb_prefix, host_prefix, url, environ, False))

        urlkey = canonicalize(wb_url.url)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url
        if headers.get('Webagg-Source-Coll') == 'live':
            cdx['is_live'] = 'true'

        result = self.content_rewriter.rewrite_content(
            urlrewriter, record.http_headers, record.raw_stream,
            head_insert_func, urlkey, cdx, cookie_rewriter, environ)

        status_headers, gen, _ = result

        status_headers.remove_header('Content-Security-Policy')

        # check for content-length; a missing or non-numeric header is
        # treated as "unknown length" rather than hiding unrelated errors
        res = status_headers.get_header('content-length')
        try:
            content_length = int(res)
        except (TypeError, ValueError):
            content_length = 0

        if content_length > 0:
            return status_headers, IterIdent(gen)

        # need to either chunk or buffer to get content-length
        if flow.request.http_version == 'HTTP/1.1':
            status_headers.remove_header('content-length')
            status_headers.headers.append(('Transfer-Encoding', 'chunked'))
            # NOTE(review): the body itself is not chunk-encoded here;
            # presumably that happens downstream -- confirm
        else:
            gen = buffer_iter(status_headers, gen)

        return status_headers, IterIdent(gen)
 def get_url_key_p(ts, url):
     """Return ``<ts>/<canonicalized url>`` with a trailing slash.

     The url is canonicalized with ``canonicalize(url, False)``; a ``/``
     is appended only when ``url`` itself does not already end with one.
     """
     base = ts + "/" + canonicalize(url, False)
     return base if url.endswith("/") else base + "/"
Example #43
0
    def create_record_iter(self, raw_iter):
        """Yield an index entry for each indexable record of ``raw_iter``.

        Behaviour is driven by ``self.options``: ``append_post``,
        ``include_all``, ``surt_ordered`` (default ``True``) and
        ``minimal``.  Records are parsed into entries, given a url key if
        they lack one, and their payload is fed to ``self.handle_payload``
        in ``BUFF_SIZE`` chunks before the entry is yielded.

        :raises Exception: on first iteration, if both ``append_post``
            and ``minimal`` are set
        """
        get_option = self.options.get
        append_post = get_option('append_post')
        include_all = get_option('include_all')
        surt_ordered = get_option('surt_ordered', True)
        minimal = get_option('minimal')

        if append_post and minimal:
            raise Exception('Sorry, minimal index option and ' +
                            'append POST options can not be used together')

        for record in raw_iter:
            # records without a status code are only kept on request
            if (not include_all and not minimal and
                    record.http_headers.get_statuscode() == '-'):
                continue

            if record.rec_type == 'arc_header':
                continue

            if record.format == 'warc':
                is_meta = record.rec_type in ('request', 'warcinfo')
                if is_meta and not include_all and not append_post:
                    continue

                if (not include_all and
                        record.content_type == 'application/warc-fields'):
                    continue

                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
                entry = self.parse_arc_record(record)
            else:
                entry = None

            if not entry:
                continue

            if entry.get('url') and not entry.get('urlkey'):
                entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

            needs_digest = entry.get('digest', '-') == '-'
            compute_digest = (
                needs_digest and
                record.rec_type not in ('revisit', 'request', 'warcinfo'))

            if (not compute_digest and not minimal and
                    record.rec_type == 'request' and append_post):
                http_method = record.http_headers.protocol
                content_length = record.http_headers.get_header(
                    'Content-Length')

                entry['_post_query'] = MethodQueryCanonicalizer(
                    http_method, entry.get('_content_type'),
                    content_length, record.raw_stream)

            entry.record = record

            self.begin_payload(compute_digest, entry)

            # drain the record body chunk by chunk
            chunk = record.raw_stream.read(BUFF_SIZE)
            while chunk:
                self.handle_payload(chunk)
                chunk = record.raw_stream.read(BUFF_SIZE)

            raw_iter.read_to_end(record)

            entry.set_rec_info(*raw_iter.member_info)
            self.end_payload(entry)

            yield entry
Example #44
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the capture for ``wb_url`` upstream and return it rewritten.

        A custom response, if one is produced, is returned directly as
        text.  Otherwise the request is sent upstream, the returned
        record is parsed and its content rewritten for replay.

        :param str wb_url: archival url string, parsed into a ``WbUrl``
        :param dict kwargs: routing arguments; ``type`` is read here and
            the dict is passed on to the helper methods
        :param dict environ: WSGI environ; ``HTTP_RANGE`` may be removed
        :return: a ``WbResponse`` with the rewritten headers and body
        :raises UpstreamException: if the upstream status is >= 400
        """
        wb_url = WbUrl(wb_url)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        resp = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix, kwargs)
        if resp is not None:
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'

            return WbResponse.text_response(resp, content_type=content_type)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer(environ)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                       self.content_rewriter)

        inputreq.include_post_query(wb_url.url)

        mod_url = None
        use_206 = False
        rangeres = None

        readd_range = False
        async_record_url = None

        if kwargs.get('type') in ('record', 'patch'):
            rangeres = inputreq.extract_range()

            if rangeres:
                mod_url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wb_url.url = mod_url
                    inputreq.url = mod_url

                    del environ['HTTP_RANGE']
                    readd_range = True
                else:
                    async_record_url = mod_url

        # passed to _do_req as its skip flag whenever an async record
        # request will be issued for the range-less url
        skip = async_record_url is not None

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip)

        if r.status_code >= 400:
            error = None
            # reading the error body is best-effort, but must not swallow
            # KeyboardInterrupt / SystemExit
            try:
                error = r.raw.read()
                r.raw.close()
            except Exception:
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        if async_record_url:
            environ.pop('HTTP_RANGE', '')
            gevent.spawn(self._do_async_req,
                         inputreq,
                         async_record_url,
                         wb_url,
                         kwargs,
                         False)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url

        self._add_custom_params(cdx, r.headers, kwargs)

        if readd_range:
            # the 0- range was stripped from the request above, so add a
            # matching range covering the whole content to the response
            content_length = (record.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                record.status_headers.add_range(0, content_length,
                                                   content_length)
            except (ValueError, TypeError):
                pass

        if self.is_ajax(environ):
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                    create_insert_func(wb_url,
                                                       full_prefix,
                                                       host_prefix,
                                                       top_url,
                                                       environ,
                                                       self.framed_replay))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                               record.status_headers,
                                               record.stream,
                                               head_insert_func,
                                               urlkey,
                                               cdx,
                                               cookie_rewriter,
                                               environ)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        return WbResponse(status_headers, gen)
Example #45
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the capture for ``wb_url`` upstream and return it rewritten.

        Possible outcomes, in order: a formatted custom response; a 307
        redirect to the url with path ``/`` when the url has no path; a
        307 redirect to the exact capture when ``redirect_to_exact`` is
        set and the url or timestamp differ; otherwise the rewritten
        capture itself.

        :param str wb_url: archival url string (``#`` is percent-encoded
            before it is parsed into a ``WbUrl``)
        :param dict kwargs: routing arguments, passed on to the helper
            methods
        :param dict environ: WSGI environ; ``pywb_proxy_magic`` is set
            when running in proxy mode
        :return: a ``WbResponse``
        :raises UpstreamException: if the upstream status is >= 400
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)
        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url,
                                               full_prefix, host_prefix,
                                               kwargs)

        if response:
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix)

            framed_replay = self.framed_replay

        # a url without any path is redirected to the same url with '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            scheme, netloc, path, query, frag = url_parts
            path = '/'
            url = urlunsplit((scheme, netloc, path, query, frag))
            resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                             '307 Temporary Redirect')

            if self.enable_memento:
                resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

            return resp

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            # reading the error body is best-effort, but must not swallow
            # KeyboardInterrupt / SystemExit
            try:
                error = r.raw.read()
                r.raw.close()
            except Exception:
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # the cdx entry is taken from the 'Warcserver-Cdx' response header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs)

        # when a range was added to the record, switch the url modifier
        # to 'id_'
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        is_ajax = self.is_ajax(environ)

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                    create_insert_func(wb_url,
                                                       full_prefix,
                                                       host_prefix,
                                                       top_url,
                                                       environ,
                                                       framed_replay,
                                                       config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # make sure the status line has a second, space-separated part
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy, cdx.get('source-coll'))

            set_content_loc = True

        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                                                       url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
Example #46
0
    def render_content(self, wb_url, kwargs, environ):
        """Load the capture for ``wb_url`` upstream and build the replay response.

        The request goes through several stages, any of which may return
        early: proxy preflight handling, ``Prefer``-based redirects,
        timemap / query / custom responses, the upstream fetch via
        ``self._do_req``, trailing-slash and exact-timestamp redirects, and
        finally content rewriting of the loaded record.

        :param wb_url: the requested archival url as a string; ``#`` is
            percent-encoded before it is parsed into a ``WbUrl``
        :param kwargs: per-request options, read with ``.get()``
            (keys used here: ``output``, ``no_timegate_check``, ``coll``,
            ``metadata``, ``cache``)
        :param environ: the request environ mapping; the
            ``HTTP_X_WOMBAT_HISTORY_PAGE`` key is popped from it
        :return: a response object -- either a redirect, a formatted
            timemap/query/custom response, a JSON title response for
            history-page requests, or a ``WbResponse`` wrapping the
            rewritten content
        :raises NotFoundException: if the upstream response status is 404
        :raises UpstreamException: for any other upstream status >= 400
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # a history-page request overrides the url to load and is always
        # treated as an ajax request
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy response
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                # when set, the custom (frame) response is held back until
                # the cdx for the capture has been loaded below
                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a url with an empty path is redirected to '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                # best effort: the error body is only used for the
                # exception details, so a failed read is not fatal
                pass
            finally:
                no_except_close(r.raw)

            if error:
                # tolerate non-UTF-8 error bodies so that a decode failure
                # does not mask the upstream error raised below
                error = error.decode('utf-8', errors='replace')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                # the record body is not served on this path, so close the
                # upstream stream like the other early returns do
                no_except_close(r.raw)

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # a range response is served with the 'id_' modifier
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            # no head insert for ajax requests
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            # title lookup order: rewritten content, then the
            # X-Wombat-History-Title header, then the page url itself
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # ensure the status line has a second, space-separated part
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response