Example #1
0
def make_timemap(wbrequest, cdx_lines):
    """Generate the entries of a link-format timemap, one string at a time.

    Yields, in order: the timemap "self" link (carrying the ``from`` date
    of the first capture), the "original" link, the "timegate" link, and
    then one memento link per cdx record. Every entry except the last one
    ends with ``',\\n'``.

    :param wbrequest: request object; ``wb_prefix``, ``wb_url.url`` and
        ``wb_url.to_str()`` are read from it
    :param cdx_lines: iterator of cdx records, each providing a
        ``'timestamp'`` key
    :return: generator of link strings; it is empty when ``cdx_lines``
        has no records
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url

    # get first memento as it'll be used for 'from' field
    # builtin next() works on Python 2.6+ and 3, while the .next() method
    # exists only on Python 2 iterators
    first_cdx = next(cdx_lines, None)
    if first_cdx is None:
        # end the generator explicitly: a StopIteration escaping a generator
        # body becomes a RuntimeError under PEP 479
        return

    from_date = timestamp_to_http_date(first_cdx['timestamp'])

    # timemap link
    timemap = ('<{0}>; rel="self"; ' +
               'type="application/link-format"; from="{1}",\n')
    yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)

    # original link
    yield '<{0}>; rel="original",\n'.format(url)

    # timegate link
    yield '<{0}>; rel="timegate",\n'.format(prefix + url)

    # memento links: hold each record back until its successor is seen, so
    # that only the final entry (even when it is also the first one) is
    # written without the ',\n' separator
    pending_cdx = first_cdx
    pending_args = {'datetime': from_date}

    for cdx in cdx_lines:
        yield make_timemap_memento_link(pending_cdx, prefix, **pending_args)
        pending_cdx = cdx
        pending_args = {}

    # last memento link
    yield make_timemap_memento_link(pending_cdx, prefix, end='',
                                    **pending_args)
Example #2
0
    def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'):
        """Format one timemap entry describing a single cdx record.

        The link target is the record's ``load_url`` when that is set;
        otherwise a ``file://filename:offset:length`` locator is assembled
        from the record.

        :param cdx: mapping supporting ``get()``; ``'timestamp'`` is also
            indexed when ``datetime`` is not supplied
        :param datetime: pre-formatted date string; derived from the record
            timestamp when falsy
        :param rel: value of the ``rel`` attribute
        :param end: text appended after the entry
        :return: the formatted entry
        """
        target = cdx.get('load_url') or 'file://{0}:{1}:{2}'.format(
            cdx.get('filename'), cdx.get('offset'), cdx.get('length'))

        template = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end

        # only convert the timestamp when the caller did not pass a date
        http_date = datetime or timestamp_to_http_date(cdx['timestamp'])

        return template.format(target, rel, http_date, cdx.get('source', ''))
Example #3
0
def make_timemap(wbrequest, cdx_lines):
    """Generate the entries of a link-format timemap, one string at a time.

    With at least one cdx record the output is: the timemap "self" link
    (with the ``from`` date of the first capture), the "original" link, the
    "timegate" link and one memento link per record. Without any record the
    output is the "original" link, the "timegate" link and a terminating
    "self" link that has no ``from`` attribute. In both cases only the final
    entry is written without the ``',\\n'`` separator.

    :param wbrequest: request object; ``wb_prefix``, ``wb_url`` and
        ``options['replay_mod']`` are read from it
    :param cdx_lines: iterator of cdx records, each providing a
        ``'timestamp'`` key
    :return: generator of link strings
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    mod = wbrequest.options.get('replay_mod', '')

    # get first memento as it'll be used for 'from' field
    # (the try covers only the call that signals exhaustion)
    try:
        first_cdx = six.next(cdx_lines)
    except StopIteration:
        first_cdx = None
    else:
        from_date = timestamp_to_http_date(first_cdx['timestamp'])

    if first_cdx:
        # timemap link
        timemap = ('<{0}>; rel="self"; ' +
                   'type="application/link-format"; from="{1}",\n')
        yield timemap.format(prefix + wbrequest.wb_url.to_str(),
                             from_date)

    # original link
    original = '<{0}>; rel="original",\n'
    yield original.format(url)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    timegate_url = WbUrl.to_wburl_str(url=url,
                                      mod=mod,
                                      type=WbUrl.LATEST_REPLAY)

    yield timegate.format(prefix + timegate_url)

    if not first_cdx:
        # terminating timemap link, no from
        timemap = ('<{0}>; rel="self"; type="application/link-format"')
        yield timemap.format(prefix + wbrequest.wb_url.to_str())
        return

    # memento links: hold each record back until its successor is seen, so
    # that the last entry never carries the ',\n' separator, even when the
    # first memento is the only one
    pending_cdx = first_cdx
    pending_args = {'datetime': from_date}

    for cdx in cdx_lines:
        yield make_timemap_memento_link(pending_cdx, prefix, mod=mod,
                                        **pending_args)
        pending_cdx = cdx
        pending_args = {}

    # last memento link
    yield make_timemap_memento_link(pending_cdx, prefix, end='', mod=mod,
                                    **pending_args)
Example #4
0
def make_timemap(wbrequest, cdx_lines):
    """Yield a link-format timemap for the requested url, entry by entry.

    When ``cdx_lines`` produces records, the entries are: the "self" link
    including the ``from`` date of the first record, the "original" link,
    the "timegate" link, then a memento link for every record. When it
    produces nothing, the entries are the "original" link, the "timegate"
    link and a closing "self" link without ``from``. The final entry is
    always emitted without the trailing ``',\\n'`` separator.

    :param wbrequest: request object; ``wb_prefix``, ``wb_url`` and
        ``options['replay_mod']`` are read from it
    :param cdx_lines: iterator of cdx records, each providing a
        ``'timestamp'`` key
    :return: generator of link strings
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    mod = wbrequest.options.get('replay_mod', '')

    # get first memento as it'll be used for 'from' field
    # (only the next() call is guarded; the date conversion runs afterwards)
    try:
        first_cdx = six.next(cdx_lines)
    except StopIteration:
        first_cdx = None
    else:
        from_date = timestamp_to_http_date(first_cdx['timestamp'])

    if first_cdx:
        # timemap link
        timemap = ('<{0}>; rel="self"; ' +
                   'type="application/link-format"; from="{1}",\n')
        yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)

    # original link
    original = '<{0}>; rel="original",\n'
    yield original.format(url)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    timegate_url = WbUrl.to_wburl_str(url=url,
                                      mod=mod,
                                      type=WbUrl.LATEST_REPLAY)

    yield timegate.format(prefix + timegate_url)

    if not first_cdx:
        # terminating timemap link, no from
        timemap = ('<{0}>; rel="self"; type="application/link-format"')
        yield timemap.format(prefix + wbrequest.wb_url.to_str())
        return

    # memento links are emitted one step behind the iterator: a record is
    # written with the default separator once a successor exists, and the
    # record left over at the end is written with end=''. This keeps a
    # single-memento timemap from finishing with a dangling ',\n'.
    held_cdx = first_cdx
    held_kwargs = {'datetime': from_date}

    for cdx in cdx_lines:
        yield make_timemap_memento_link(held_cdx,
                                        prefix,
                                        mod=mod,
                                        **held_kwargs)
        held_cdx = cdx
        held_kwargs = {}

    # last memento link
    yield make_timemap_memento_link(held_cdx,
                                    prefix,
                                    end='',
                                    mod=mod,
                                    **held_kwargs)
Example #5
0
def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
    """Format a single memento link entry for a cdx record.

    :param cdx: record indexed by ``'original'`` and ``'timestamp'``
    :param prefix: string placed in front of the replay url
    :param datetime: pre-formatted date string; derived from the record
        timestamp when falsy
    :param rel: value of the ``rel`` attribute
    :param end: text appended after the entry
    :return: the formatted entry
    """
    template = '<{0}>; rel="{1}"; datetime="{2}"' + end

    # replay url for this exact capture, with an empty modifier
    replay_path = WbUrl.to_wburl_str(url=cdx['original'],
                                     mod='',
                                     timestamp=cdx['timestamp'],
                                     type=WbUrl.REPLAY)

    http_date = datetime or timestamp_to_http_date(cdx['timestamp'])

    return template.format(prefix + replay_path, rel, http_date)
Example #6
0
def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
    """Build the link entry that points at one archived capture.

    :param cdx: record indexed by ``'original'`` and ``'timestamp'``
    :param prefix: string prepended to the generated replay url
    :param datetime: date string to embed; computed from the record's
        timestamp when not supplied (or falsy)
    :param rel: relation name written into the entry
    :param end: suffix appended to the entry
    :return: the entry as a string
    """
    entry_format = '<{0}>; rel="{1}"; datetime="{2}"' + end

    capture_url = prefix + WbUrl.to_wburl_str(url=cdx['original'],
                                              mod='',
                                              timestamp=cdx['timestamp'],
                                              type=WbUrl.REPLAY)

    # fall back to the capture's own timestamp for the datetime attribute
    if datetime:
        capture_date = datetime
    else:
        capture_date = timestamp_to_http_date(cdx['timestamp'])

    return entry_format.format(capture_url, rel, capture_date)
Example #7
0
    def _do_req(self, urls, host, cdx, env, skip_hosts):
        """Request the candidate urls and return the first usable response.

        Each url is fetched with GET, without following redirects, with a
        streamed body and with certificate verification turned off. A
        response counts as usable unless its status is 400 or above and it
        has no ``memento-datetime`` header; in that case nothing further is
        tried and None is returned.

        :param urls: iterable of candidate urls
        :param host: value appended to ``skip_hosts`` when the failing
            status is 403 or 500 and above
        :param cdx: record whose ``'timestamp'`` supplies Accept-Datetime
        :param env: mapping read for ``'HTTP_USER_AGENT'``
        :param skip_hosts: list, mutated in place
        :return: the response object, or None when no usable response
            was obtained
        """
        response = None

        headers = {}
        user_agent = env.get('HTTP_USER_AGENT', '')

        # disable gzip, as mosaic won't support it!
        # TODO: maybe ungzip later
        if any(exclude in user_agent for exclude in NO_GZIP_UAS):
            headers['Accept-Encoding'] = 'identity'

        # needed to avoid interstitial in openwayback
        headers['Accept-Datetime'] = timestamp_to_http_date(cdx['timestamp'])

        headers['User-Agent'] = self.user_agent.format(user_agent)

        for url in urls:
            if self.reverse_proxy_prefix:
                url = self.reverse_proxy_prefix + url

            response = self.session.request(method='GET',
                                            url=url,
                                            allow_redirects=False,
                                            headers=headers,
                                            stream=True,
                                            verify=False)

            if response is None:
                continue

            mem_date_time = response.headers.get('memento-datetime')

            if (response.status_code >= 400 and not mem_date_time):
                if response.status_code == 403 or response.status_code >= 500:
                    # skip host
                    skip_hosts.append(host)

                # the body was requested with stream=True and is never
                # read, so release the connection before dropping it
                # NOTE(review): assumes self.session is a requests Session
                response.close()

                # try again with diff memento
                return None

            # success
            return response

        return response
Example #8
0
    def _do_req(self, urls, host, cdx, env, skip_hosts):
        """Fetch a memento from a remote host, trying the given urls.

        Every request is a streamed GET that does not follow redirects and
        does not verify certificates. The first response is returned as-is
        unless its status is 400 or above and the ``memento-datetime``
        header is absent; then the response is closed and None is returned
        without trying further urls.

        :param urls: iterable of candidate urls
        :param host: value added to ``skip_hosts`` on a 403 or a status of
            500 and above
        :param cdx: record whose ``'timestamp'`` is sent as Accept-Datetime
        :param env: mapping read for ``'HTTP_USER_AGENT'``
        :param skip_hosts: list, mutated in place
        :return: the response object, or None when no usable response
            was obtained
        """
        response = None

        client_agent = env.get('HTTP_USER_AGENT', '')

        # needed to avoid interstitial in openwayback
        headers = {
            'Accept-Datetime': timestamp_to_http_date(cdx['timestamp']),
            'User-Agent': self.user_agent.format(client_agent),
        }

        # disable gzip, as mosaic won't support it!
        # TODO: maybe ungzip later
        if any(exclude in client_agent for exclude in NO_GZIP_UAS):
            headers['Accept-Encoding'] = 'identity'

        for url in urls:
            if self.reverse_proxy_prefix:
                url = self.reverse_proxy_prefix + url

            response = self.session.request(method='GET',
                                            url=url,
                                            allow_redirects=False,
                                            headers=headers,
                                            stream=True,
                                            verify=False)

            if response is None:
                continue

            status = response.status_code
            has_memento_date = response.headers.get('memento-datetime')

            if status < 400 or has_memento_date:
                # success
                return response

            if status == 403 or status >= 500:
                # skip host
                skip_hosts.append(host)

            # a streamed response holds its connection until the body is
            # consumed or the response is closed; it is discarded here, so
            # close it explicitly
            # NOTE(review): assumes self.session is a requests Session
            response.close()

            # try again with diff memento
            return None

        return response
Example #9
0
    def make_timemap(cdx_iter):
        """Yield one timemap memento link per cdx record.

        Every entry uses the default separator of
        ``MementoUtils.make_timemap_memento_link`` except the final one,
        which is terminated with ``'\\n'``. This also holds when there is
        only a single record.

        :param cdx_iter: iterator of cdx records, each providing a
            ``'timestamp'`` key
        :return: generator of link strings; empty when ``cdx_iter`` has no
            records
        """
        # get first memento; its date is converted up front and passed along
        try:
            first_cdx = six.next(cdx_iter)
        except StopIteration:
            return

        from_date = timestamp_to_http_date(first_cdx['timestamp'])

        # emit one step behind the iterator so the record left over at the
        # end, whichever it is, receives the terminating '\n'
        pending_cdx = first_cdx
        pending_args = {'datetime': from_date}

        for cdx in cdx_iter:
            yield MementoUtils.make_timemap_memento_link(pending_cdx,
                                                         **pending_args)
            pending_cdx = cdx
            pending_args = {}

        # last memento link
        yield MementoUtils.make_timemap_memento_link(pending_cdx, end='\n',
                                                     **pending_args)
Example #10
0
def make_timemap(wbrequest, cdx_lines):
    """Yield a link-format timemap for the requested url, entry by entry.

    The entries are, in order: the timemap "self" link (with the ``from``
    date of the first capture), the "original" link, the "timegate" link,
    and a memento link for each cdx record. Only the final entry is written
    without the ``',\\n'`` separator.

    :param wbrequest: request object; ``wb_prefix``, ``wb_url.url`` and
        ``wb_url.to_str()`` are read from it
    :param cdx_lines: iterator of cdx records, each providing a
        ``'timestamp'`` key
    :return: generator of link strings; it is empty when ``cdx_lines``
        has no records
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url

    # get first memento as it'll be used for 'from' field
    # the builtin next() is used because the .next() method only exists on
    # Python 2 iterators
    first_cdx = next(cdx_lines, None)
    if first_cdx is None:
        # nothing to list: finish the generator instead of letting
        # StopIteration escape (a RuntimeError under PEP 479)
        return

    from_date = timestamp_to_http_date(first_cdx['timestamp'])

    # timemap link
    timemap = ('<{0}>; rel="self"; ' +
               'type="application/link-format"; from="{1}",\n')
    yield timemap.format(prefix + wbrequest.wb_url.to_str(),
                         from_date)

    # original link
    original = '<{0}>; rel="original",\n'
    yield original.format(url)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    yield timegate.format(prefix + url)

    # memento links are emitted one step behind the iterator, so the record
    # left over at the end (possibly the first and only one) is the one
    # written with end=''
    held_cdx = first_cdx
    held_kwargs = {'datetime': from_date}

    for cdx in cdx_lines:
        yield make_memento_link(held_cdx, prefix, **held_kwargs)
        held_cdx = cdx
        held_kwargs = {}

    # last memento link
    yield make_memento_link(held_cdx, prefix, end='', **held_kwargs)
Example #11
0
    def _init_derived(self, params):
        wbrequest = params.get('wbrequest')
        is_redirect = params.get('memento_is_redir', False)
        cdx = params.get('cdx')

        if not wbrequest or not wbrequest.wb_url:
            return

        mod = wbrequest.options.get('replay_mod', '')

        #is_top_frame = wbrequest.wb_url.is_top_frame
        is_top_frame = wbrequest.options.get('is_top_frame', False)

        is_timegate = (wbrequest.options.get('is_timegate', False) and
                       not is_top_frame)

        if is_timegate:
            self.status_headers.replace_header('Vary', 'accept-datetime')

        # Determine if memento:
        is_memento = False
        is_original = False

        # if no cdx included, not a memento, unless top-frame special
        if not cdx:
            # special case: include the headers but except Memento-Datetime
            # since this is really an intermediate resource
            if is_top_frame:
                is_memento = True

        # otherwise, if in proxy mode, then always a memento
        elif wbrequest.options['is_proxy']:
            is_memento = True
            is_original = True

        # otherwise only if timestamp replay (and not a timegate)
        #elif not is_timegate:
        #    is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)
        elif not is_redirect:
            is_memento = (wbrequest.wb_url.is_replay())

        link = []
        req_url = wbrequest.wb_url.url

        if is_memento or is_timegate:
            url = req_url
            if cdx:
                ts = cdx['timestamp']
                url = cdx['url']
            # for top frame
            elif wbrequest.wb_url.timestamp:
                ts = wbrequest.wb_url.timestamp
            else:
                ts = None

            if ts:
                http_date = timestamp_to_http_date(ts)

                if is_memento:
                    self.status_headers.replace_header('Memento-Datetime',
                                                       http_date)

                canon_link = wbrequest.urlrewriter.get_new_url(mod=mod,
                                                               timestamp=ts,
                                                               url=url)

                # set in replay_views -- Must set content location
                #if is_memento and is_timegate:
                #    self.status_headers.headers.append(('Content-Location',
                #                                        canon_link))

                # don't set memento link for very long urls...
                if len(canon_link) < 512:
                    link.append(self.make_memento_link(canon_link,
                                                       'memento',
                                                       http_date))

        if is_original and is_timegate:
            link.append(self.make_link(req_url, 'original timegate'))
        else:
            link.append(self.make_link(req_url, 'original'))

        # for now, include timemap only in non-proxy mode
        if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
            link.append(self.make_timemap_link(wbrequest))

        if is_memento and not is_timegate:
            timegate = wbrequest.urlrewriter.get_new_url(mod=mod, timestamp='')
            link.append(self.make_link(timegate, 'timegate'))

        link = ', '.join(link)

        self.status_headers.replace_header('Link', link)
Example #12
0
    def _init_derived(self, params):
        wbrequest = params.get('wbrequest')
        cdx = params.get('cdx')

        if not wbrequest or not wbrequest.wb_url:
            return

        is_top_frame = wbrequest.wb_url.is_top_frame

        is_timegate = (wbrequest.options.get('is_timegate', False) and
                       not is_top_frame)

        if is_timegate:
            self.status_headers.headers.append(('Vary', 'accept-datetime'))

        # Determine if memento:
        is_memento = False

        # if no cdx included, not a memento, unless top-frame special
        if not cdx:
            # special case: include the headers but except Memento-Datetime
            # since this is really an intermediate resource
            if is_top_frame:
                is_memento = True

        # otherwise, if in proxy mode, then always a memento
        elif wbrequest.options['is_proxy']:
            is_memento = True

        # otherwise only for replay
        else:
            is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)

        link = []

        if is_memento and cdx:
            http_date = timestamp_to_http_date(cdx['timestamp'])
            self.status_headers.headers.append(('Memento-Datetime', http_date))

        elif is_memento and is_top_frame and wbrequest.wb_url.timestamp:
            # top frame special case
            canon_link = wbrequest.urlrewriter.get_new_url(mod='')
            link.append(self.make_link(canon_link, 'memento'))

        req_url = wbrequest.wb_url.url

        if is_memento and is_timegate:
            link.append(self.make_link(req_url, 'original timegate'))
        else:
            link.append(self.make_link(req_url, 'original'))

        # for now, include timemap only in non-proxy mode
        if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
            link.append(self.make_timemap_link(wbrequest))

        if is_memento and not is_timegate:
            timegate = wbrequest.urlrewriter.get_new_url(mod='', timestamp='')
            link.append(self.make_link(timegate, 'timegate'))

        link = ', '.join(link)

        self.status_headers.headers.append(('Link', link))
Example #13
0
    def _init_derived(self, params):
        wbrequest = params.get('wbrequest')
        cdx = params.get('cdx')

        if not wbrequest or not wbrequest.wb_url:
            return

        mod = wbrequest.options.get('replay_mod', '')

        #is_top_frame = wbrequest.wb_url.is_top_frame
        is_top_frame = wbrequest.options.get('is_top_frame')

        is_timegate = (wbrequest.options.get('is_timegate', False)
                       and not is_top_frame)

        if is_timegate:
            self.status_headers.headers.append(('Vary', 'accept-datetime'))

        # Determine if memento:
        is_memento = False

        # if no cdx included, not a memento, unless top-frame special
        if not cdx:
            # special case: include the headers but except Memento-Datetime
            # since this is really an intermediate resource
            if is_top_frame:
                is_memento = True

        # otherwise, if in proxy mode, then always a memento
        elif wbrequest.options['is_proxy']:
            is_memento = True

        # otherwise only if timestamp replay (and not a timegate)
        elif not is_timegate:
            is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY)

        link = []
        req_url = wbrequest.wb_url.url

        if is_memento or is_timegate:
            url = req_url
            if cdx:
                ts = cdx['timestamp']
                url = cdx['url']
            # for top frame
            elif wbrequest.wb_url.timestamp:
                ts = wbrequest.wb_url.timestamp
            else:
                ts = None

            if ts:
                http_date = timestamp_to_http_date(ts)

                if is_memento:
                    self.status_headers.headers.append(
                        ('Memento-Datetime', http_date))

                canon_link = wbrequest.urlrewriter.get_new_url(mod=mod,
                                                               timestamp=ts,
                                                               url=url)

                link.append(
                    self.make_memento_link(canon_link, 'memento', http_date))

        if is_memento and is_timegate:
            link.append(self.make_link(req_url, 'original timegate'))
        else:
            link.append(self.make_link(req_url, 'original'))

        # for now, include timemap only in non-proxy mode
        if not wbrequest.options['is_proxy'] and (is_memento or is_timegate):
            link.append(self.make_timemap_link(wbrequest))

        if is_memento and not is_timegate:
            timegate = wbrequest.urlrewriter.get_new_url(mod=mod, timestamp='')
            link.append(self.make_link(timegate, 'timegate'))

        link = ', '.join(link)

        self.status_headers.headers.append(('Link', link))