Ejemplo n.º 1
0
class ArcWarcRecordLoader(object):
    """Parse a single ARC or WARC record out of a file-like stream.

    The record headers are parsed first, which also determines the archive
    format. The stream is then limited to the declared record length and,
    for http(s) request/response/revisit records, the HTTP status line and
    headers are parsed as well.
    """

    # status-line values handed to the WARC record-header parser
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    # status-line values handed to the HTTP response parser
    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    # status-line values handed to the HTTP request parser
    HTTP_VERBS = [
        'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', 'OPTIONS', 'CONNECT',
        'PATCH'
    ]

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """Create the record, HTTP response and HTTP request parsers.

        :param verify_http: forwarded as the second argument to both HTTP
            ``StatusAndHeadersParser`` instances.
        :param arc2warc: when true, ARC records are parsed with
            ``ARC2WARCHeadersParser``; otherwise ``ARCHeadersParser``.
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS,
                                                      verify_http)

    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.

        :param stream: file-like object positioned at the start of a record.
        :param statusline: already-read first line of the record, if any.
        :param known_format: 'warc' or 'arc' to restrict format detection.
        :param no_record_parse: when true, the HTTP headers are not parsed
            and the returned record has ``http_headers`` set to None.
        :raises ArchiveLoadFailed: if the record headers cannot be parsed.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc records are reported to callers as plain 'warc'
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # unparsable or negative length: treat the record as empty
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            http_headers = None

        # empty record (error or otherwise): empty status line, no headers
        elif length == 0:
            http_headers = StatusAndHeaders('', [])

        # response record or non-empty revisit: parse HTTP status and headers!
        # a missing target uri (None) can never be an http(s) record, so it
        # falls through to the generic branch instead of raising
        elif (rec_type in ('response', 'revisit') and uri
              and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif (rec_type == 'request' and uri
              and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            http_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)

    def _detect_type_load_headers(self,
                                  stream,
                                  statusline=None,
                                  known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.

        :return: ``(format, rec_headers)`` where format is 'warc' or the
            value of ``self.arc_parser.get_rec_type()``.
        :raises ArchiveLoadFailed: if no permitted format parses.
        """

        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # reuse the line the WARC parser already consumed
                statusline = se.statusline

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
Ejemplo n.º 2
0
class ArcWarcRecordLoader(object):
    """Parse a single ARC or WARC record out of a file-like stream.

    The record headers are parsed first, which also determines the archive
    format. The stream is then limited to the declared record length and,
    unless disabled, HTTP headers are loaded for http(s) records.
    """

    # status-line values handed to the WARC record-header parser
    WARC_TYPES = ['WARC/1.1', 'WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    # status-line values handed to the HTTP response parser
    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    # status-line values handed to the HTTP request parser
    HTTP_VERBS = [
        'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', 'OPTIONS', 'CONNECT',
        'PATCH'
    ]

    # record types for which load_http_headers() attempts HTTP parsing
    HTTP_RECORDS = ('response', 'request', 'revisit')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """Create the record, HTTP response and HTTP request parsers.

        :param verify_http: forwarded as the second argument to both HTTP
            ``StatusAndHeadersParser`` instances.
        :param arc2warc: when true, ARC records are parsed with
            ``ARC2WARCHeadersParser``; otherwise ``ARCHeadersParser``.
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS,
                                                      verify_http)

    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.

        :param stream: file-like object positioned at the start of a record.
        :param statusline: already-read first line of the record, if any.
        :param known_format: 'warc' or 'arc' to restrict format detection.
        :param no_record_parse: when true, HTTP headers are not parsed.
        :param ensure_http_headers: when true and no HTTP headers were
            loaded, substitute the result of default_http_headers().
        :raises ArchiveLoadFailed: if the record headers cannot be parsed.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = self._ensure_target_uri_format(rec_headers)
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc records are reported to callers as plain 'warc'
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # unparsable or negative length: treat the record as empty
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        http_headers = None

        # load http headers if parsing
        if not no_record_parse:
            http_headers = self.load_http_headers(rec_type, uri, stream,
                                                  length)

        # generate valid http headers (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)

    def load_http_headers(self, rec_type, uri, stream, length):
        """Parse HTTP headers from ``stream`` when the record can carry them.

        :return: the parsed headers, or None when the record is empty, is
            not one of HTTP_RECORDS, has no http(s) target uri, or is a
            revisit whose stream ends before any headers.
        """
        # only if length == 0 don't parse
        # try parsing if length is unknown (length is None) or length > 0
        if length == 0:
            return None

        # only certain record types can have http headers
        if rec_type not in self.HTTP_RECORDS:
            return None

        # only http:/https: uris can have http headers; a record with no
        # target uri at all (None) is treated the same way instead of
        # raising AttributeError
        if not uri or not uri.startswith(self.HTTP_SCHEMES):
            return None

        # request record: parse request
        if rec_type == 'request':
            return self.http_req_parser.parse(stream)

        elif rec_type == 'revisit':
            try:
                return self.http_parser.parse(stream)
            except EOFError:
                # empty revisit with no http headers, is ok!
                return None

        # response record or non-empty revisit: parse HTTP status and headers!
        else:
            return self.http_parser.parse(stream)

    def default_http_headers(self, length, content_type=None):
        """Build a '200 OK' HTTP/1.0 header set from the record metadata.

        Content-Type is included only when ``content_type`` is truthy and
        Content-Length only when ``length`` is a non-negative number.
        """
        headers = []
        if content_type:
            headers.append(('Content-Type', content_type))

        if length is not None and length >= 0:
            headers.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')

    def _detect_type_load_headers(self,
                                  stream,
                                  statusline=None,
                                  known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.

        :return: ``(format, rec_headers)`` where format is 'warc' or the
            value of ``self.arc_parser.get_rec_type()``.
        :raises ArchiveLoadFailed: if no permitted format parses.
        """

        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # reuse the line the WARC parser already consumed
                statusline = se.statusline

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))

    def _ensure_target_uri_format(self, rec_headers):
        """Checks the value for the WARC-Target-URI header field to see if it starts
        with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>' are present,
        corrects and updates the field returning the corrected value for the field
        otherwise just returns the fields value.

        Spaces in the value are additionally replaced with '%20' (see the
        customization note below), and the header is updated to match.

        :param StatusAndHeaders rec_headers: The parsed WARC headers
        :return: The value for the WARC-Target-URI field
        :rtype: str | None
        """
        uri = rec_headers.get_header('WARC-Target-URI')
        if uri is not None and uri.startswith('<') and uri.endswith('>'):
            uri = uri[1:-1]
            rec_headers.replace_header('WARC-Target-URI', uri)

        # BEGIN PERMA CUSTOMIZATION
        # https://github.com/webrecorder/warcio/blob/c64c4394805e13256695f51af072c95389397ee9/warcio/recordloader.py#L217
        # https://github.com/webrecorder/warcio/pull/80
        # don't pass WARC-Target-URI with spaces to the cdxline indexers, which don't expect that
        # cause of at least some of the errors in https://github.com/harvard-lil/perma/issues/2605
        if uri is not None and " " in uri:
            logger.warning(
                "Replacing spaces in invalid WARC-Target-URI: {}".format(uri))
            uri = uri.replace(" ", "%20")
            rec_headers.replace_header('WARC-Target-URI', uri)
        # END PERMA CUSTOMIZATION

        return uri
Ejemplo n.º 3
0
    def __init__(self, env):
        """Keep the WSGI environ and parse a status line plus headers from
        its ``wsgi.input`` stream into ``self.status_headers``.
        """
        self.env = env

        # empty status list with verification turned off
        headers_parser = StatusAndHeadersParser([], verify=False)
        wsgi_input = self.env['wsgi.input']

        self.status_headers = headers_parser.parse(wsgi_input)
Ejemplo n.º 4
0
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Load the archived record a CDX entry points at and hand back its
    WARC headers, re-serialized HTTP headers (when parsed) and payload
    stream.
    """

    def __init__(self, paths, cdx_source):
        """Build resolvers for ``paths`` and remember ``cdx_source`` for
        follow-up index queries issued while resolving a record.
        """
        self.paths = paths
        self.resolvers = self.make_resolvers(self.paths)
        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)
        self.headers_parser = StatusAndHeadersParser([], verify=False)
        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Return ``(warc_headers, http_headers_buff, payload_stream)``
        for ``cdx``.

        A truthy ``_cached_result`` on the entry is returned as-is, and
        None is returned when the entry has no filename or offset.
        ``http_headers_buff`` stays None unless the HTTP headers were
        parsed for the self-redirect check.
        """
        cached = cdx.get('_cached_result')
        if cached:
            return cached

        has_location = cdx.get('filename') and cdx.get('offset') is not None
        if not has_location:
            return None

        source_name = cdx.get('source', '').split(':')[0]
        param_formatter = ParamFormatter(params, source_name)
        cdx._formatter = param_formatter

        def local_index_query(local_params):
            # carry the caller's 'param.*' values over into the sub-query
            for key, value in six.iteritems(params):
                if key.startswith('param.'):
                    local_params[key] = value

            results, _errs = self.cdx_source(local_params)
            for entry in results:
                entry._formatter = param_formatter
                yield entry

        failed_files = []
        loaded = self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query)
        headers, payload = loaded

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')
            # status may not be set for 'revisit'
            if not status or status.startswith('3'):
                parsed = self.headers_parser.parse(payload.raw_stream)

                try:
                    self.raise_on_self_redirect(
                        params, cdx, parsed.get_statuscode(),
                        parsed.get_header('Location'))
                except LiveResourceException:
                    # release both streams before propagating
                    headers.raw_stream.close()
                    payload.raw_stream.close()
                    raise

                http_headers_buff = parsed.to_bytes()

        warc_headers = payload.rec_headers

        if headers != payload:
            # headers come from a different record than the payload: record
            # where the payload came from, then present the headers record's
            # uri and date
            payload_uri = payload.rec_headers.get_header('WARC-Target-URI')
            warc_headers.replace_header('WARC-Refers-To-Target-URI',
                                        payload_uri)

            payload_date = payload.rec_headers.get_header('WARC-Date')
            warc_headers.replace_header('WARC-Refers-To-Date', payload_date)

            headers_uri = headers.rec_headers.get_header('WARC-Target-URI')
            warc_headers.replace_header('WARC-Target-URI', headers_uri)

            headers_date = headers.rec_headers.get_header('WARC-Date')
            warc_headers.replace_header('WARC-Date', headers_date)

            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'
Ejemplo n.º 5
0
class BaseWARCWriter(object):
    """Base class for building WARC records and serializing them.

    Subclasses must implement ``write_record`` and ``_do_write_req_resp``;
    this class provides record construction, digest computation and the
    low-level record serialization in ``_write_warc_record``.
    """

    # default WARC Content-Type per record type, used when none is supplied
    WARC_RECORDS = {
        'warcinfo': 'application/warc-fields',
        'response': 'application/http; msgtype=response',
        'revisit': 'application/http; msgtype=response',
        'request': 'application/http; msgtype=request',
        'metadata': 'application/warc-fields',
    }

    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'

    WARC_VERSION = 'WARC/1.0'

    def __init__(self, gzip=True, *args, **kwargs):
        """Configure the writer.

        :param gzip: when true, ``_write_warc_record`` wraps its output in
            a ``GzippingWrapper``.
        :param kwargs: ``warc_version`` overrides WARC_VERSION;
            ``header_filter`` is passed to ``http_headers.to_bytes()``.
        """
        self.gzip = gzip
        self.hostname = gethostname()

        self.parser = StatusAndHeadersParser([], verify=False)

        self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)
        self.header_filter = kwargs.get('header_filter')

    @classmethod
    def _iter_stream(cls, stream):
        """Yield ``stream`` in BUFF_SIZE reads until a read returns nothing."""
        while True:
            buf = stream.read(BUFF_SIZE)
            if not buf:
                return

            yield buf

    def ensure_digest(self, record, block=True, payload=True):
        """Add WARC-Block-Digest / WARC-Payload-Digest headers if missing.

        The record stream is read to the end to compute the digests. If
        the stream supports tell/seek it is rewound afterwards; otherwise
        its content is spooled into a temp file that replaces
        ``record.raw_stream`` (the original is kept as
        ``record._orig_stream``) and ``record.payload_length`` is set to
        the number of bytes spooled.
        """
        if block and record.rec_headers.get_header('WARC-Block-Digest'):
            block = False

        if payload and record.rec_headers.get_header('WARC-Payload-Digest'):
            payload = False

        block_digester = self._create_digester() if block else None
        payload_digester = self._create_digester() if payload else None

        if not block_digester and not payload_digester:
            return

        temp_file = None
        try:
            pos = record.raw_stream.tell()
            record.raw_stream.seek(pos)
        except Exception:
            # stream is not seekable: buffer it so it can be re-read later.
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate.
            pos = 0
            temp_file = self._create_temp_file()

        # the block digest also covers the serialized http headers
        if block_digester and record.http_headers and record.http_headers.headers_buff:
            block_digester.update(record.http_headers.headers_buff)

        for buf in self._iter_stream(record.raw_stream):
            if block_digester:
                block_digester.update(buf)

            if payload_digester:
                payload_digester.update(buf)

            if temp_file:
                temp_file.write(buf)

        if temp_file:
            record.payload_length = temp_file.tell()
            temp_file.seek(0)
            record._orig_stream = record.raw_stream
            record.raw_stream = temp_file
        else:
            record.raw_stream.seek(pos)

        if block_digester:
            record.rec_headers.add_header('WARC-Block-Digest',
                                          str(block_digester))

        if payload_digester:
            record.rec_headers.add_header('WARC-Payload-Digest',
                                          str(payload_digester))

    def _create_digester(self):
        """Return a new sha1 ``Digester``."""
        return Digester('sha1')

    def write_request_response_pair(self, req, resp, params=None):
        """Align the request with its response and write both.

        The request takes the response's WARC-Target-URI and WARC-Date, and
        gets a WARC-Concurrent-To header when the response has a record id.
        """
        url = resp.rec_headers.get_header('WARC-Target-URI')
        dt = resp.rec_headers.get_header('WARC-Date')

        req.rec_headers.replace_header('WARC-Target-URI', url)
        req.rec_headers.replace_header('WARC-Date', dt)

        resp_id = resp.rec_headers.get_header('WARC-Record-ID')
        if resp_id:
            req.rec_headers.add_header('WARC-Concurrent-To', resp_id)

        self._do_write_req_resp(req, resp, params)

    def write_record(self, record, params=None):  #pragma: no cover
        """Write a single record; must be implemented by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def _do_write_req_resp(self, req, resp, params):  #pragma: no cover
        """Write a request/response pair; must be implemented by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def create_warcinfo_record(self, filename, info):
        """Build a 'warcinfo' record whose payload lists ``info``.

        Each entry with a truthy value becomes a ``name: value`` line,
        latin-1 encoded; WARC-Filename is added only when ``filename`` is
        truthy.
        """
        warc_headers = StatusAndHeaders(self.warc_version, [])
        warc_headers.add_header('WARC-Type', 'warcinfo')
        warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
        if filename:
            warc_headers.add_header('WARC-Filename', filename)
        warc_headers.add_header('WARC-Date', self._make_warc_date())

        warcinfo = BytesIO()
        for name, value in six.iteritems(info):
            if not value:
                continue

            line = name + ': ' + str(value) + '\r\n'
            warcinfo.write(line.encode('latin-1'))

        length = warcinfo.tell()
        warcinfo.seek(0)

        return self.create_warc_record('',
                                       'warcinfo',
                                       warc_headers=warc_headers,
                                       payload=warcinfo,
                                       length=length)

    def create_revisit_record(self,
                              uri,
                              digest,
                              refers_to_uri,
                              refers_to_date,
                              http_headers=None):
        """Build a 'revisit' record for ``uri``.

        The record carries REVISIT_PROFILE, the refers-to uri and date, and
        ``digest`` as its WARC-Payload-Digest.
        """
        record = self.create_warc_record(uri,
                                         'revisit',
                                         http_headers=http_headers)

        record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE)

        record.rec_headers.add_header('WARC-Refers-To-Target-URI',
                                      refers_to_uri)
        record.rec_headers.add_header('WARC-Refers-To-Date', refers_to_date)

        record.rec_headers.add_header('WARC-Payload-Digest', digest)

        return record

    def create_record_from_stream(self, record_stream, length):
        """Parse WARC headers from ``record_stream`` and build a record
        whose payload is the rest of that stream.
        """
        warc_headers = self.parser.parse(record_stream)

        return self.create_warc_record('',
                                       warc_headers.get_header('WARC-Type'),
                                       payload=record_stream,
                                       length=length,
                                       warc_headers=warc_headers)

    def create_warc_record(self,
                           uri,
                           record_type,
                           payload=None,
                           length=0,
                           warc_content_type='',
                           warc_headers_dict=None,
                           warc_headers=None,
                           http_headers=None):
        """Build an ``ArcWarcRecord`` of ``record_type``.

        :param uri: target uri, used only when ``warc_headers`` is not given.
        :param payload: stream with the record content; when falsy an empty
            ``BytesIO`` is used and ``length`` is reset to 0.
        :param length: payload length; reduced by the bytes consumed when
            http headers are parsed from ``payload`` here.
        :param warc_content_type: WARC Content-Type; falls back to the
            header in ``warc_headers`` and then to WARC_RECORDS.
        :param warc_headers_dict: extra WARC headers used when
            ``warc_headers`` is not given (None means no extra headers).
        :param warc_headers: pre-built WARC headers, used as-is.
        :param http_headers: pre-parsed http headers; when absent they are
            parsed from ``payload`` for response/request/revisit records.
        """
        # None instead of a shared mutable {} default
        if warc_headers_dict is None:
            warc_headers_dict = {}

        if payload and not http_headers and record_type in ('response',
                                                            'request',
                                                            'revisit'):
            http_headers = self.parser.parse(payload)
            length -= payload.tell()

        if not payload:
            payload = BytesIO()
            length = 0

        if not warc_headers:
            warc_headers = self._init_warc_headers(uri, record_type,
                                                   warc_headers_dict)

        # compute Content-Type
        if not warc_content_type:
            warc_content_type = warc_headers.get_header('Content-Type')

            if not warc_content_type:
                warc_content_type = self.WARC_RECORDS.get(record_type)

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               http_headers, warc_content_type, length)

        record.payload_length = length

        if record_type not in ('warcinfo', 'revisit'):
            self.ensure_digest(record, block=False, payload=True)

        return record

    def _init_warc_headers(self, uri, record_type, warc_headers_dict):
        """Create WARC headers from ``warc_headers_dict``, forcing WARC-Type
        and filling in WARC-Record-ID and WARC-Date when missing.
        """
        warc_headers = StatusAndHeaders(self.warc_version,
                                        list(warc_headers_dict.items()))
        warc_headers.replace_header('WARC-Type', record_type)
        if not warc_headers.get_header('WARC-Record-ID'):
            warc_headers.add_header('WARC-Record-ID', self._make_warc_id())

        if uri:
            warc_headers.replace_header('WARC-Target-URI', uri)

        if not warc_headers.get_header('WARC-Date'):
            warc_headers.add_header('WARC-Date', self._make_warc_date())

        return warc_headers

    def _set_header_buff(self, record):
        """Serialize the record's http headers (through ``header_filter``)
        and cache the bytes on ``record.http_headers.headers_buff``.
        """
        headers_buff = record.http_headers.to_bytes(self.header_filter)
        record.http_headers.headers_buff = headers_buff

    def _write_warc_record(self, out, record, adjust_cl=True):
        """Serialize ``record`` to ``out`` and flush.

        Writes the WARC headers, the http headers (if any), the payload
        (skipped for 'revisit' records) and a trailing blank-line pair.
        ``adjust_cl`` is accepted for interface compatibility and is not
        used here.
        """
        if self.gzip:
            out = GzippingWrapper(out)

        if record.http_headers:
            self._set_header_buff(record)

        # ensure digests are set
        if record.rec_type != 'warcinfo':
            self.ensure_digest(record, block=True, payload=False)

        # ensure proper content type
        record.rec_headers.replace_header('Content-Type', record.content_type)

        # revisit records are written without their payload
        http_headers_only = record.rec_type == 'revisit'

        # compute Content-Length
        if record.http_headers and record.payload_length >= 0:
            actual_len = len(record.http_headers.headers_buff)

            if not http_headers_only:
                actual_len += record.payload_length

            record.length = actual_len

        record.rec_headers.replace_header('Content-Length', str(record.length))

        # write record headers
        out.write(record.rec_headers.to_bytes())

        # write headers buffer, if any
        if record.http_headers:
            out.write(record.http_headers.headers_buff)

        if not http_headers_only:
            try:
                for buf in self._iter_stream(record.raw_stream):
                    out.write(buf)
            finally:
                # drop the temp file created by ensure_digest, if any
                if hasattr(record, '_orig_stream'):
                    record.raw_stream.close()
                    record.raw_stream = record._orig_stream

        # add two lines
        out.write(b'\r\n\r\n')

        out.flush()

    @classmethod
    def _make_warc_id(cls):
        """Return a new WARC record id."""
        return StatusAndHeadersParser.make_warc_id()

    @classmethod
    def _make_warc_date(cls):
        """Return the current UTC time formatted by datetime_to_iso_date."""
        return datetime_to_iso_date(datetime.datetime.utcnow())

    @classmethod
    def _create_temp_file(cls):
        """Return a temp file that stays in memory up to 512 KiB."""
        return tempfile.SpooledTemporaryFile(max_size=512 * 1024)
Ejemplo n.º 6
0
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Load the archived record a CDX entry points at and return its WARC
    headers, re-serialized HTTP headers (when parsed) and payload stream.
    """

    def __init__(self, paths, cdx_source):
        """Build resolvers for ``paths`` and keep ``cdx_source`` for the
        follow-up index queries issued while resolving a record.
        """
        self.paths = paths

        self.resolvers = self.make_resolvers(self.paths)

        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)

        self.headers_parser = StatusAndHeadersParser([], verify=False)

        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Return ``(warc_headers, http_headers_buff, payload_stream)``
        for ``cdx``.

        A truthy ``_cached_result`` on the entry is returned as-is, and
        None is returned when the entry has no filename or offset.
        ``http_headers_buff`` is None unless the HTTP headers were parsed
        for the self-redirect check.

        :raises LiveResourceException: re-raised from
            ``raise_on_self_redirect`` after both record streams are closed.
        """
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # carry the caller's 'param.*' values over into the sub-query
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, _ = self.cdx_source(local_params)
            for found in cdx_iter:
                found._formatter = formatter
                yield found

        failed_files = []
        headers, payload = (self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query))

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')

            # if status is not set, or is not 2xx, 4xx or 5xx,
            # go through the self-redirect check just in case
            if not status or not status.startswith(('2', '4', '5')):
                http_headers = self.headers_parser.parse(payload.raw_stream)
                try:
                    # stream position after the headers were consumed
                    orig_size = payload.raw_stream.tell()
                except Exception:
                    # position unavailable: skip the Content-Length fix-up.
                    # Exception (not a bare except) so KeyboardInterrupt and
                    # SystemExit still propagate.
                    orig_size = 0

                try:
                    self.raise_on_self_redirect(
                        params, cdx, http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    no_except_close(headers.raw_stream)
                    no_except_close(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

                # if new http_headers_buff is different length,
                # attempt to adjust content-length on the WARC record
                if orig_size and len(http_headers_buff) != orig_size:
                    orig_cl = payload.rec_headers.get_header('Content-Length')
                    if orig_cl:
                        new_cl = int(orig_cl) + (len(http_headers_buff) -
                                                 orig_size)
                        payload.rec_headers.replace_header(
                            'Content-Length', str(new_cl))

        warc_headers = payload.rec_headers

        if headers != payload:
            # headers come from a different record than the payload: record
            # where the payload came from, then present the headers record's
            # uri and date
            warc_headers.replace_header(
                'WARC-Refers-To-Target-URI',
                payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Refers-To-Date',
                payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header(
                'WARC-Target-URI',
                headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Date', headers.rec_headers.get_header('WARC-Date'))
            no_except_close(headers.raw_stream)

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'
Ejemplo n.º 7
0
    def __init__(self, env):
        """Store the WSGI environ, then parse the status line and headers
        read from ``env['wsgi.input']`` into ``self.status_headers``.
        """
        self.env = env

        # parser takes an empty status list and does no verification
        self.status_headers = StatusAndHeadersParser(
            [], verify=False).parse(self.env['wsgi.input'])
Ejemplo n.º 8
0
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Resolve a CDX entry to its archived record, returning the WARC
    headers, the re-serialized HTTP headers (when parsed) and the payload
    stream.
    """

    def __init__(self, paths, cdx_source):
        """Set up path resolvers, the resolving loader, an HTTP header
        parser and the index source used for follow-up queries.
        """
        self.paths = paths
        self.resolvers = self.make_resolvers(self.paths)
        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)
        self.headers_parser = StatusAndHeadersParser([], verify=False)
        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Return ``(warc_headers, http_headers_buff, payload_stream)``.

        Short-circuits with the entry's truthy ``_cached_result``, or with
        None when the entry lacks a filename or an offset.
        """
        cached_result = cdx.get('_cached_result')
        if cached_result:
            return cached_result

        if not (cdx.get('filename') and cdx.get('offset') is not None):
            return None

        source_id = cdx.get('source', '').split(':')[0]
        fmt = ParamFormatter(params, source_id)
        cdx._formatter = fmt

        def local_index_query(local_params):
            # only 'param.*' entries are forwarded to the nested query
            for name, val in six.iteritems(params):
                if name.startswith('param.'):
                    local_params[name] = val

            matches, _unused_errs = self.cdx_source(local_params)
            for match in matches:
                match._formatter = fmt
                yield match

        failed_files = []
        headers, payload = self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query)

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')
            # status may not be set for 'revisit'
            if not status or status.startswith('3'):
                http_headers = self.headers_parser.parse(payload.raw_stream)

                status_code = None
                location = None
                try:
                    status_code = http_headers.get_statuscode()
                    location = http_headers.get_header('Location')
                    self.raise_on_self_redirect(params, cdx,
                                                status_code, location)
                except LiveResourceException:
                    # close both streams before letting the error escape
                    headers.raw_stream.close()
                    payload.raw_stream.close()
                    raise

                http_headers_buff = http_headers.to_bytes()

        warc_headers = payload.rec_headers

        if headers != payload:
            # (header to set, record to read from, header to read), applied
            # strictly in this order so the payload's own uri/date are
            # copied to the refers-to fields before being overwritten
            rewrites = (
                ('WARC-Refers-To-Target-URI', payload, 'WARC-Target-URI'),
                ('WARC-Refers-To-Date', payload, 'WARC-Date'),
                ('WARC-Target-URI', headers, 'WARC-Target-URI'),
                ('WARC-Date', headers, 'WARC-Date'),
            )
            for target_name, source_record, source_name in rewrites:
                warc_headers.replace_header(
                    target_name,
                    source_record.rec_headers.get_header(source_name))

            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'
Ejemplo n.º 9
0
class ArcWarcRecordLoader(object):
    """Load a single ARC or WARC record from a file-like stream.

    The record headers are parsed first (WARC is attempted before ARC unless
    a format is forced), the stream is then limited to the declared record
    length and, for HTTP-bearing record types, the HTTP status line and
    headers are parsed from the start of the record body.
    """

    # status-line prefixes accepted by the WARC record header parser
    WARC_TYPES = ['WARC/1.1', 'WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    # status-line prefixes accepted by the HTTP response parser
    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    # status-line prefixes accepted by the HTTP request parser
    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    # record types whose body may begin with HTTP headers
    HTTP_RECORDS = ('response', 'request', 'revisit')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """Create the header parsers used by this loader.

        :param verify_http: passed through to the HTTP response and request
            ``StatusAndHeadersParser`` instances.
        :param arc2warc: if true, ARC records are parsed with
            ``ARC2WARCHeadersParser``, otherwise with ``ARCHeadersParser``.
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False,
                            check_digests=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.

        :param stream: file-like object positioned at the start of a record
        :param statusline: already-read first line of the record, if any
        :param known_format: 'warc' or 'arc' to force a format, else None
        :param no_record_parse: if true, do not parse HTTP headers
        :param ensure_http_headers: if true and no HTTP headers were parsed,
            generate default ones via default_http_headers()
        :param check_digests: passed to DigestChecker; if truthy, the stream
            is also wrapped for digest verification when digests are present
        :return: the parsed ArcWarcRecord
        :raises ArchiveLoadFailed: if the record headers can not be parsed
        """
        (the_format, rec_headers) = (self.
                                     _detect_type_load_headers(stream,
                                                               statusline,
                                                               known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = self._ensure_target_uri_format(rec_headers)
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # converted ARC record: subtract rec_headers.total_len from
                # the declared length and report the record as 'warc'
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition: unparseable or negative length is treated as empty
        if is_err:
            length = 0

        is_verifying = False
        digest_checker = DigestChecker(check_digests)

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)
            if check_digests:
                stream, is_verifying = self.wrap_digest_verifying_stream(stream, rec_type,
                                                                         rec_headers, digest_checker,
                                                                         length=length)

        http_headers = None
        payload_length = -1

        # load http headers if parsing
        if not no_record_parse:
            start = stream.tell()
            http_headers = self.load_http_headers(rec_type, uri, stream, length)
            if length and http_headers:
                # payload is what remains after the consumed http headers
                payload_length = length - (stream.tell() - start)

        # generate valid default http headers (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        if is_verifying:
            stream.begin_payload()

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, http_headers,
                             content_type, length, payload_length=payload_length, digest_checker=digest_checker)

    def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None):
        """Wrap stream in a DigestVerifyingReader if the record has digests.

        :return: ``(stream, True)`` with the wrapped stream when a
            WARC-Payload-Digest or WARC-Block-Digest header is present,
            otherwise ``(stream, False)`` with the stream unchanged.
        """
        payload_digest = rec_headers.get_header('WARC-Payload-Digest')
        block_digest = rec_headers.get_header('WARC-Block-Digest')
        segment_number = rec_headers.get_header('WARC-Segment-Number')

        if not payload_digest and not block_digest:
            return stream, False

        stream = DigestVerifyingReader(stream, length, digest_checker,
                                       record_type=rec_type,
                                       payload_digest=payload_digest,
                                       block_digest=block_digest,
                                       segment_number=segment_number)
        return stream, True

    def load_http_headers(self, rec_type, uri, stream, length):
        """Parse HTTP headers from the start of the record body, if any.

        :param rec_type: the record type (eg. 'response', 'request')
        :param uri: the record target uri, may be None if the header is absent
        :param stream: stream positioned at the start of the record body
        :param length: record body length, or None if unknown
        :return: the parsed headers, or None if the record has no HTTP headers
        """
        # only if length == 0 don't parse
        # try parsing if length is unknown (length is None) or length > 0
        if length == 0:
            return None

        # only certain record types can have http headers
        if rec_type not in self.HTTP_RECORDS:
            return None

        # only http:/https: uris can have http headers;
        # a missing uri (None) can not be an http uri either
        if not uri or not uri.startswith(self.HTTP_SCHEMES):
            return None

        # request record: parse request
        if rec_type == 'request':
            return self.http_req_parser.parse(stream)

        elif rec_type == 'revisit':
            try:
                return self.http_parser.parse(stream)
            except EOFError:
                # empty revisit with no http headers, is ok!
                return None

        # response record: parse HTTP status and headers!
        else:
            return self.http_parser.parse(stream)

    def default_http_headers(self, length, content_type=None):
        """Build '200 OK' HTTP/1.0 headers from the record metadata.

        Content-Type is included when content_type is truthy and
        Content-Length when length is a non-negative number.
        """
        headers = []
        if content_type:
            headers.append(('Content-Type', content_type))

        if length is not None and length >= 0:
            headers.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.

        :return: ``(format, rec_headers)`` where format is 'warc' or the
            value of ``self.arc_parser.get_rec_type()``
        :raises ArchiveLoadFailed: if the record can not be parsed
        """

        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # reuse the already-consumed first line for the ARC attempt
                statusline = se.statusline

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))

    def _ensure_target_uri_format(self, rec_headers):
        """Checks the value for the WARC-Target-URI header field to see if it starts
        with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>' are present,
        corrects and updates the field returning the corrected value for the field
        otherwise just returns the fields value. Also checks for the presence of
        spaces and percent-encodes them if present, for more reliable parsing
        downstream.

        :param StatusAndHeaders rec_headers: The parsed WARC headers
        :return: The value for the WARC-Target-URI field
        :rtype: str | None
        """
        uri = rec_headers.get_header('WARC-Target-URI')

        if uri is not None and uri.startswith('<') and uri.endswith('>'):
            uri = uri[1:-1]
            rec_headers.replace_header('WARC-Target-URI', uri)

        if uri is not None and " " in uri:
            logger.warning("Replacing spaces in invalid WARC-Target-URI: %s", uri)
            uri = uri.replace(" ", "%20")
            rec_headers.replace_header('WARC-Target-URI', uri)

        return uri
Ejemplo n.º 10
0
class ArcWarcRecordLoader(object):
    """Load a single ARC or WARC record from a file-like stream.

    The record headers are parsed first (WARC is attempted before ARC unless
    a format is forced), the stream is then limited to the declared record
    length and, for HTTP-bearing record types, the HTTP status line and
    headers are parsed from the start of the record body.
    """

    # status-line prefixes accepted by the WARC record header parser
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    # status-line prefixes accepted by the HTTP response parser
    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    # status-line prefixes accepted by the HTTP request parser
    HTTP_VERBS = [
        'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', 'OPTIONS', 'CONNECT',
        'PATCH'
    ]

    # record types whose body may begin with HTTP headers
    HTTP_RECORDS = ('response', 'request', 'revisit')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """Create the header parsers used by this loader.

        :param verify_http: passed through to the HTTP response and request
            ``StatusAndHeadersParser`` instances.
        :param arc2warc: if true, ARC records are parsed with
            ``ARC2WARCHeadersParser``, otherwise with ``ARCHeadersParser``.
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS,
                                                      verify_http)

    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.

        :param stream: file-like object positioned at the start of a record
        :param statusline: already-read first line of the record, if any
        :param known_format: 'warc' or 'arc' to force a format, else None
        :param no_record_parse: if true, do not parse HTTP headers
        :param ensure_http_headers: if true and no HTTP headers were parsed,
            generate default ones via default_http_headers()
        :return: the parsed ArcWarcRecord
        :raises ArchiveLoadFailed: if the record headers can not be parsed
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # converted ARC record: subtract rec_headers.total_len from
                # the declared length and report the record as 'warc'
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition: unparseable or negative length is treated as empty
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        http_headers = None

        # load http headers if parsing
        if not no_record_parse:
            http_headers = self.load_http_headers(rec_type, uri, stream,
                                                  length)

        # generate valid default http headers (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)

    def load_http_headers(self, rec_type, uri, stream, length):
        """Parse HTTP headers from the start of the record body, if any.

        :param rec_type: the record type (eg. 'response', 'request')
        :param uri: the record target uri, may be None if the header is absent
        :param stream: stream positioned at the start of the record body
        :param length: record body length, or None if unknown
        :return: the parsed headers, or None if the record has no HTTP headers
        """
        # only if length == 0 don't parse
        # try parsing if length is unknown (length is None) or length > 0
        if length == 0:
            return None

        # only certain record types can have http headers
        if rec_type not in self.HTTP_RECORDS:
            return None

        # only http:/https: uris can have http headers;
        # a missing uri (None) can not be an http uri either
        if not uri or not uri.startswith(self.HTTP_SCHEMES):
            return None

        # request record: parse request
        if rec_type == 'request':
            return self.http_req_parser.parse(stream)

        elif rec_type == 'revisit':
            try:
                return self.http_parser.parse(stream)
            except EOFError:
                # empty revisit with no http headers, is ok!
                return None

        # response record: parse HTTP status and headers!
        else:
            return self.http_parser.parse(stream)

    def default_http_headers(self, length, content_type=None):
        """Build '200 OK' HTTP/1.0 headers from the record metadata.

        Content-Type is included when content_type is truthy and
        Content-Length when length is a non-negative number.
        """
        headers = []
        if content_type:
            headers.append(('Content-Type', content_type))

        if length is not None and length >= 0:
            headers.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')

    def _detect_type_load_headers(self,
                                  stream,
                                  statusline=None,
                                  known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.

        :return: ``(format, rec_headers)`` where format is 'warc' or the
            value of ``self.arc_parser.get_rec_type()``
        :raises ArchiveLoadFailed: if the record can not be parsed
        """

        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # reuse the already-consumed first line for the ARC attempt
                statusline = se.statusline

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))