Example #1
0
    def load_resource(self, cdx, params):
        """Load the WARC record(s) referenced by an index entry.

        Returns the entry's ``_cached_result`` when it is truthy, and
        ``None`` when the entry has no ``filename`` or no ``offset``.
        Otherwise the headers and payload records are obtained from
        ``self.resolve_loader.load_headers_and_payload``.

        :param cdx: index entry, read via ``.get()``; a ``_formatter``
            attribute is set on it.
        :param params: request parameters; every ``param.*`` item is
            copied into follow-up index queries.
        :return: ``(warc_headers, http_headers_buff, payload_stream)``,
            where ``http_headers_buff`` is ``None`` unless the HTTP headers
            were parsed for the self-redirect check.
        :raises: anything raised by ``self.raise_on_self_redirect``; both
            record streams are closed before the exception propagates.
        """
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        # only the first ':'-separated component of 'source' is used
        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # forward the caller's 'param.*' settings to the nested query
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, _errs = self.cdx_source(local_params)
            for entry in cdx_iter:
                entry._formatter = formatter
                yield entry

        def close_quietly(stream):
            # best-effort close: a failure here must not mask the
            # exception that is already propagating
            try:
                stream.close()
            except Exception:
                pass

        failed_files = []
        headers, payload = (self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query))

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')
            # status may not be set for 'revisit'
            if not status or status.startswith('3'):
                http_headers = self.headers_parser.parse(payload.raw_stream)
                try:
                    self.raise_on_self_redirect(
                        params, cdx, http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except Exception:
                    # the caller never receives the streams on this path,
                    # so release them here instead of leaking them
                    close_quietly(headers.raw_stream)
                    close_quietly(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

        warc_headers = payload.rec_headers

        if headers != payload:
            # headers and payload came from different records: keep the
            # payload record's URI/date as the "refers-to" values, then
            # take the target URI/date from the headers record
            warc_headers.replace_header(
                'WARC-Refers-To-Target-URI',
                payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Refers-To-Date',
                payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header(
                'WARC-Target-URI',
                headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Date', headers.rec_headers.get_header('WARC-Date'))

            # only the payload stream is handed back to the caller
            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.raw_stream)
Example #2
0
    def load_child_source(self, name, source, params):
        """Query one child index source and label its results.

        ``params`` is updated in place with ``_name`` and ``_formatter``
        before ``source.load_index(params)`` is called.  A ``WbException``
        from the lookup yields an empty iterator plus a single
        ``(name, repr(exception))`` error entry.

        Unless ``params['nosource']`` equals ``'true'``, every returned
        entry that has a ``url`` gets its ``source`` prefixed with the
        source label and its ``source-coll`` set via ``self._get_coll``.

        :return: ``(cdx_iter, err_list)`` tuple.
        """
        try:
            params['_name'] = name
            params['_formatter'] = ParamFormatter(params, name)
            loaded = source.load_index(params)
            # a source may hand back a bare iterator or (iterator, errors)
            records, errors = (loaded if isinstance(loaded, tuple)
                               else (loaded, []))
        except WbException as exc:
            records = iter([])
            errors = [(name, repr(exc))]

        if params.get('nosource') == 'true':
            return records, errors

        # optional per-source collection suffix: '<name>:<src_coll>'
        label = name
        src_coll = params.get('param.' + name + '.src_coll')
        if src_coll:
            label = name + ':' + src_coll

        def tag(entry):
            # entries without a url are passed through untouched
            if entry.get('url'):
                entry['source'] = (label + ':' + entry['source']
                                   if entry.get('source') else label)
                entry['source-coll'] = self._get_coll(label)
            return entry

        return (tag(entry) for entry in records), errors
Example #3
0
 def _get_params(self, environ):
     """Build the request parameter dict from the environ's query string.

     The parsed query pairs are collected into a dict, and a
     ``ParamFormatter`` created from that dict and
     ``self.rec_source_name`` is stored under ``'_formatter'``.

     :param environ: mapping that may contain ``'QUERY_STRING'``.
     :return: dict of query parameters plus the ``'_formatter'`` entry.
     """
     # QUERY_STRING may be absent from the environ; fall back to '' so
     # parse_qsl always receives a string rather than None
     query = environ.get('QUERY_STRING') or ''
     params = dict(parse_qsl(query))
     params['_formatter'] = ParamFormatter(params,
                                           name=self.rec_source_name)
     return params
Example #4
0
    def load_resource(self, cdx, params):
        """Load the WARC record(s) referenced by an index entry.

        Returns the entry's ``_cached_result`` when it is truthy, and
        ``None`` when the entry has no ``filename`` or no ``offset``.
        Otherwise the headers and payload records are obtained from
        ``self.resolve_loader.load_headers_and_payload``.

        :param cdx: index entry, read via ``.get()``; a ``_formatter``
            attribute is set on it.
        :param params: request parameters; every ``param.*`` item is
            copied into follow-up index queries.
        :return: ``(warc_headers, http_headers_buff, payload_stream)``,
            where ``http_headers_buff`` is ``None`` unless the HTTP headers
            were parsed for the self-redirect check.
        :raises LiveResourceException: re-raised from
            ``self.raise_on_self_redirect`` after both record streams
            have been closed.
        """
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        # only the first ':'-separated component of 'source' is used
        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # forward the caller's 'param.*' settings to the nested query
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, _errs = self.cdx_source(local_params)
            for entry in cdx_iter:
                entry._formatter = formatter
                yield entry

        failed_files = []
        headers, payload = (self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query))

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')

            # if status is not set, or is not 2xx, 4xx or 5xx,
            # go through the self-redirect check just in case
            if not status or not status.startswith(('2', '4', '5')):
                http_headers = self.headers_parser.parse(payload.raw_stream)
                # stream position right after the header parse; used below
                # as the length of the header block as originally stored
                try:
                    orig_size = payload.raw_stream.tell()
                except Exception:
                    # stream cannot report a position: skip the
                    # Content-Length adjustment (0 disables it)
                    orig_size = 0

                try:
                    self.raise_on_self_redirect(
                        params, cdx, http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    # the caller never receives the streams on this path
                    no_except_close(headers.raw_stream)
                    no_except_close(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

                # if the new http_headers_buff has a different length,
                # attempt to adjust Content-Length on the WARC record
                if orig_size and len(http_headers_buff) != orig_size:
                    orig_cl = payload.rec_headers.get_header('Content-Length')
                    if orig_cl:
                        new_cl = int(orig_cl) + (len(http_headers_buff) -
                                                 orig_size)
                        payload.rec_headers.replace_header(
                            'Content-Length', str(new_cl))

        warc_headers = payload.rec_headers

        if headers != payload:
            # headers and payload came from different records: keep the
            # payload record's URI/date as the "refers-to" values, then
            # take the target URI/date from the headers record
            warc_headers.replace_header(
                'WARC-Refers-To-Target-URI',
                payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Refers-To-Date',
                payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header(
                'WARC-Target-URI',
                headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Date', headers.rec_headers.get_header('WARC-Date'))
            # only the payload stream is handed back to the caller
            no_except_close(headers.raw_stream)

        return (warc_headers, http_headers_buff, payload.raw_stream)