Beispiel #1
0
class CCWARCWriter:
    '''
    Writes DNS answers and HTTP request/response pairs to WARC files.

    Files are opened lazily on the first write and are named
    <prefix>[-<subprefix>]-<serial>-<hostname>.warc[.gz]. After each
    request/response pair the current file is closed once its size exceeds
    max_size, so the next write opens a new file with the next serial.

    create_default_info() must be called before the first write: open()
    reads self.info, which is only set by that method.
    '''

    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        '''
        prefix:  # first filename component; also recorded as isPartOf
        max_size:  # size in bytes after which maybe_close() closes the file
        subprefix:  # optional second filename component
        gzip:  # passed to WARCWriter; also appends '.gz' to the filename
        get_serial:  # optional callable(filename) returning the serial string
        '''
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        # getattr: __del__ can run on an instance whose __init__ never ran
        # (or failed early), in which case 'writer' does not exist yet.
        if getattr(self, 'writer', None) is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            warcheader_version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        Build the warcinfo fields, remember them in self.info and return them.

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = ('cocrawler/' + version +
                            ' cocrawler_warcheader_version/' + warcheader_version)
        info['hostname'] = self.hostname
        info['ip'] = ip
        # optional fields are only emitted when they have a truthy value
        for key, value in (('description', description),
                           ('creator', creator),
                           ('operator', operator)):
            if value:
                info[key] = value
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def _next_filename(self):
        '''
        Return the name of the next output file, consuming one serial.
        '''
        filename = self.prefix
        if self.subprefix:
            # don't let yaml leave this as an int
            filename += '-' + str(self.subprefix)
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        return filename

    def open(self):
        '''
        Open the next output file and write its warcinfo record.
        '''
        self.filename = self._next_filename()
        self.f = open(self.filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        '''
        Return the serial string for filename: from the external callable if
        one was given, otherwise a zero-padded 6-digit counter starting at 0.
        '''
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        Close the current file if it is larger than max_size.

        TODO: always close/reopen if subprefix is not None; to minimize open filehandles?
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            # writer is None is the "no file open" marker checked by the writers
            self.writer = None

    def write_dns(self, dns, ttl, url):
        '''
        Write one 'resource' record of type text/dns for url.hostname.

        dns:  # iterable of answers; each is indexed with ['host']
        ttl:  # converted with int()
        url:  # object with a .hostname attribute
        '''
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(ttl)
        host = url.hostname

        if self.writer is None:
            self.open()

        lines = [timestamp_now()]
        for r in dns:
            try:
                lines.append('\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])))
            except Exception as e:
                # best effort: skip the bad answer but keep the rest.
                # The message needs placeholders, otherwise logging fails to
                # format it and the diagnostic is lost.
                LOGGER.info(
                    'problem converting dns reply for warcing: host=%s reply=%r error=%r',
                    host, r, e)
        payload = ''.join(line + '\r\n' for line in lines).encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        '''
        Return resp_headers adjusted to describe the body actually stored.

        resp_headers is an iterable of (name, value) pairs of bytes. Headers
        that no longer describe the stored body are kept under an
        'X-Crawler-' prefix:
          * Content-Length that is not the number body_len (a corrected
            Content-Length is added after it),
          * Content-Encoding when decompressed is true,
          * Transfer-Encoding: chunked.
        Everything else, including an already-correct Content-Length, is
        passed through unchanged.
        '''
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            if hl == b'content-length':
                if v.isdigit() and int(v) == body_len:
                    ret.append((h, v))
                else:
                    ret.append((prefix + h, v))
                    # bytes, like every other value in this list
                    ret.append((b'Content-Length',
                                str(body_len).encode('ascii')))
            elif hl == b'content-encoding' and decompressed:
                ret.append((prefix + h, v))
            elif hl == b'transfer-encoding' and v.lower() == b'chunked':
                # aiohttp always undoes chunking
                ret.append((prefix + h, v))
            else:
                ret.append((h, v))
        return ret

    def write_request_response_pair(self,
                                    url,
                                    ip,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None,
                                    decompressed=False):
        '''
        Write a request record and a response record for url, then close the
        file if it has grown past max_size.

        ip:  # WARC-IP-Address, skipped when None
        is_truncated:  # falsy, or a WARC-Truncated reason
        payload:  # response body as bytes
        digest:  # WARC-Payload-Digest, skipped when None
        decompressed:  # True if payload is no longer content-encoded
        '''
        if self.writer is None:
            self.open()

        # NOTE(review): the request line, the request URI below and the
        # '200 OK' status are fixed placeholders, not taken from the real
        # exchange. It looks like warcio's write_request_response_pair copies
        # the response's target URI onto the request -- confirm.
        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers,
                                                    len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK',
                                             fake_resp_headers,
                                             protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                # %s instead of '+': is_truncated may not be a str (e.g. True)
                LOGGER.error('Invalid is_truncation of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
Beispiel #2
0
class HarParser(object):
    """Convert the entries of a HAR log into WARC request/response records.

    ``reader`` may be a path to a HAR file, a file-like object with
    ``read()``, or an already-parsed dict. ``writer`` may be a
    ``BaseWARCWriter`` instance, a path to the output file, or a file-like
    object with ``write()``. An output file opened from a path is owned by
    this object and closed at the end of ``parse()``.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, reader, writer, gzip=True):
        if isinstance(reader, str):
            with codecs.open(reader, encoding='utf-8') as fh:
                self.har = json.loads(fh.read())
        elif hasattr(reader, 'read'):
            self.har = json.loads(reader.read())
        elif isinstance(reader, dict):
            self.har = reader
        else:
            raise Exception('reader is in an unknown format')

        # only set when this object opened the output file itself
        self.fh = None
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, out_filename=None, rec_title=None):
        """Write a warcinfo record followed by one record pair per HAR entry.

        ``out_filename`` (default ``'har.warc.gz'``) is the name recorded in
        the warcinfo record; ``rec_title`` (default ``'HAR Recording'``) is
        the title stored in its JSON metadata. The output file, if owned by
        this object, is closed even when an entry fails to convert.
        """
        out_filename = out_filename or 'har.warc.gz'
        rec_title = rec_title or 'HAR Recording'
        try:
            log = self.har['log']
            metadata = self.create_wr_metadata(log, rec_title)
            self.write_warc_info(log, out_filename, metadata)

            for entry in log['entries']:
                self.parse_entry(entry)
        finally:
            # close on the error path too, so the handle is never leaked
            if self.fh:
                self.fh.close()

    def parse_entry(self, entry):
        """Convert one HAR entry and write it as a request/response pair."""
        url = entry['request']['url']

        response = self.parse_response(url,
                                       entry['response'],
                                       entry.get('serverIPAddress'))

        #TODO: support WARC/1.1 arbitrary precision dates!
        # NOTE(review): this keeps the first 19 characters and appends 'Z',
        # so any UTC offset in startedDateTime is discarded rather than
        # applied -- confirm whether inputs are always UTC.
        warc_date = entry['startedDateTime'][:19] + 'Z'

        response.rec_headers.replace_header('WARC-Date', warc_date)

        request = self.parse_request(entry['request'])

        self.writer.write_request_response_pair(request, response)

    def create_wr_metadata(self, log, rec_title):
        """Return the metadata dict stored in the warcinfo record.

        Pages whose title starts with ``http:`` or ``https:`` are listed
        under ``"pages"`` with the title used as the URL; the key is omitted
        when there are none. A log without ``pages`` is treated as having no
        pages.
        """
        pagelist = []

        for page in log.get('pages') or []:
            if not page['title'].startswith(('http:', 'https:')):
                continue

            pagelist.append(dict(title=page['title'],
                                 url=page['title'],
                                 timestamp=iso_date_to_timestamp(page['startedDateTime'])))

        metadata = {"title": rec_title,
                    "type": "recording",
                   }

        if pagelist:
            metadata["pages"] = pagelist

        return metadata

    def write_warc_info(self, log, filename, metadata):
        """Write the warcinfo record describing the HAR creator and format."""
        creator = '{0} {1}'.format(log['creator']['name'],
                                   log['creator']['version'])

        source = 'HAR Format {0}'.format(log['version'])

        software = 'har2warc ' + str(__version__)

        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(filename, params)
        self.writer.write_record(record)

    def _get_http_version(self, entry):
        """Return the entry's ``httpVersion`` if it is HTTP/1.0 or HTTP/1.1
        (compared case-insensitively), otherwise ``'HTTP/1.1'``."""
        http_version = entry.get('httpVersion')
        if not http_version or http_version.upper() not in ('HTTP/1.1', 'HTTP/1.0'):
            http_version = 'HTTP/1.1'

        return http_version

    def parse_response(self, url, response, ip=None):
        """Build the WARC 'response' record for a HAR response object.

        The body is taken from ``content.text`` (base64-decoded when
        ``content.encoding`` is ``'base64'``, UTF-8 encoded otherwise).
        Content-Encoding and Transfer-Encoding headers are dropped and
        Content-Length is set to the length of the stored body. ``ip``, when
        truthy, is recorded as WARC-IP-Address.
        """
        content_info = response.get('content') or {}
        content = content_info.get('text') or ''
        har_headers = response.get('headers') or []

        headers = []
        if not content and not har_headers:
            self.logger.info('No headers or payload for: %s', url)
            headers.append(('Content-Length', '0'))

        if content_info.get('encoding') == 'base64':
            body = base64.b64decode(content)
        else:
            body = content.encode('utf-8')

        payload = BytesIO(body)
        length = len(body)

        # these describe the wire encoding, not the body stored in the HAR
        skip_headers = ('content-encoding', 'transfer-encoding')

        #TODO: http2 detection -- write as same warc header?
        for header in har_headers:
            if header['name'].lower() not in skip_headers:
                headers.append((header['name'], header['value']))

        status = response.get('status') or 204

        reason = response.get('statusText')
        if not reason:
            reason = http_status_names.get(status, 'No Reason')

        status_line = str(status) + ' ' + reason

        proto = self._get_http_version(response)

        http_headers = StatusAndHeaders(status_line, headers, protocol=proto)

        if not content:
            content_length = http_headers.get_header('Content-Length', '0')
            if content_length != '0':
                self.logger.info('No Content for length %s %s',
                                 content_length, url)
                http_headers.replace_header('Content-Length', '0')
        else:
            http_headers.replace_header('Content-Length', str(length))

        warc_headers_dict = {}
        if ip:
            warc_headers_dict['WARC-IP-Address'] = ip

        record = self.writer.create_warc_record(url, 'response',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length,
                                                warc_headers_dict=warc_headers_dict)

        return record

    @staticmethod
    def _request_target(request, parts):
        """Return the path-plus-query used in the request line.

        ``parts`` is the ``urlsplit`` result of the request URL. The query is
        rebuilt from the HAR ``queryString`` list as a sequence of pairs, so
        repeated parameter names and their order are preserved. An empty
        path becomes ``'/'`` so the request line is never malformed.
        """
        path = parts.path or '/'
        query = request.get('queryString')
        if query:
            path += '?' + urlencode([(param['name'], param['value'])
                                     for param in query])
        return path

    def parse_request(self, request):
        """Build the WARC 'request' record for a HAR request object.

        A ``Host`` header is added when any HTTP/2 pseudo-header
        (``:method``, ``:scheme``, ``:path``) is present. The body is taken
        from ``postData.text`` when ``bodySize`` is positive; a request
        without usable post data is written with no payload.
        """
        parts = urlsplit(request['url'])
        path = self._request_target(request, parts)

        headers = []
        http2 = False

        for header in request.get('headers') or []:
            headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if (not http2 and
                header['name'] in (':method', ':scheme', ':path')):
                http2 = True

        if http2:
            headers.append(('Host', parts.netloc))

        http_version = self._get_http_version(request)

        status_line = request['method'] + ' ' + path + ' ' + http_version
        http_headers = StatusAndHeaders(status_line, headers)

        payload = None
        length = 0

        # bodySize and postData may be missing or null in the HAR input
        body_size = request.get('bodySize') or 0
        post_text = (request.get('postData') or {}).get('text')
        if body_size > 0 and post_text is not None:
            body = post_text.encode('utf-8')
            payload = BytesIO(body)
            length = len(body)

        record = self.writer.create_warc_record(request['url'], 'request',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length)

        return record
Beispiel #3
0
class CCWARCWriter:
    '''
    Writes DNS answers and HTTP request/response pairs to WARC files.

    Files are opened lazily on the first write and are named
    <prefix>[-<subprefix>]-<serial>-<hostname>.warc[.gz]. After each
    request/response pair the current file is closed once its size exceeds
    max_size, so the next write opens a new file with the next serial.

    create_default_info() must be called before the first write: open()
    reads self.info, which is only set by that method.
    '''

    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        '''
        prefix:  # first filename component; also recorded as isPartOf
        max_size:  # size in bytes after which maybe_close() closes the file
        subprefix:  # optional second filename component
        gzip:  # passed to WARCWriter; also appends '.gz' to the filename
        get_serial:  # optional callable(filename) returning the serial string
        '''
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        # getattr: __del__ can run on an instance whose __init__ never ran
        # (or failed early), in which case 'writer' does not exist yet.
        if getattr(self, 'writer', None) is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        Build the warcinfo fields, remember them in self.info and return them.

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        # optional fields are only emitted when they have a truthy value
        for key, value in (('description', description),
                           ('creator', creator),
                           ('operator', operator)):
            if value:
                info[key] = value
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def _next_filename(self):
        '''
        Return the name of the next output file, consuming one serial.
        '''
        filename = self.prefix
        if self.subprefix:
            # str(): a subprefix read from config may arrive as an int
            filename += '-' + str(self.subprefix)
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        return filename

    def open(self):
        '''
        Open the next output file and write its warcinfo record.
        '''
        self.filename = self._next_filename()
        self.f = open(self.filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        '''
        Return the serial string for filename: from the external callable if
        one was given, otherwise a zero-padded 6-digit counter starting at 0.
        '''
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        Close the current file if it is larger than max_size.

        TODO: always close/reopen if subprefix is not None; minimizes open filehandles
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            # writer is None is the "no file open" marker checked by the writers
            self.writer = None

    def write_dns(self, dns, expires, url):
        '''
        Write one 'resource' record of type text/dns for url.hostname.

        dns:  # iterable of answers; each is indexed with ['host']
        expires:  # absolute time; the TTL written is int(expires - time.time())
        url:  # object with a .hostname attribute
        '''
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(expires - time.time())
        host = url.hostname

        if self.writer is None:
            self.open()

        lines = [timestamp_now()]
        for r in dns:
            try:
                lines.append('\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])))
            except Exception as e:
                # best effort: skip the bad answer but keep the rest.
                # The message needs placeholders, otherwise logging fails to
                # format it and the diagnostic is lost.
                LOGGER.info(
                    'problem converting dns reply for warcing: host=%s reply=%r error=%r',
                    host, r, e)
        payload = ''.join(line + '\r\n' for line in lines).encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self,
                                    url,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None):
        '''
        Write a request record and a response record for url, then close the
        file if it has grown past max_size.

        is_truncated:  # falsy, or a WARC-Truncated reason
        payload:  # response body as bytes
        digest:  # WARC-Payload-Digest, skipped when None
        '''
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        # NOTE(review): the request line, the request URI below and the
        # '200 OK' status are fixed placeholders, not taken from the real
        # exchange. It looks like warcio's write_request_response_pair copies
        # the response's target URI onto the request -- confirm.
        req_http_headers = StatusAndHeaders(
            'GET / HTTP/1.1', headers_to_str_headers(req_headers))

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders(
            '200 OK',
            headers_to_str_headers(resp_headers),
            protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                # %s instead of '+': is_truncated may not be a str (e.g. True)
                LOGGER.error('Invalid is_truncation of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)