# Example #1 (scraped sample separator; original marker: "Beispiel #1 / 0")
    def build_writer(self):
        """
        Initialize a new WARC file and write the "warcinfo" header.

        In debug mode, records are streamed to stdout's binary buffer instead
        of a file on disk.

        :return: the configured ``WARCWriter``.
        """
        directory = self.settings.get('WARC_FILE_DIRECTORY', '.')
        filename = self.build_filename()

        if self.debug:
            # Debug mode: write records to stdout (binary) instead of a file.
            fp = sys.stdout.buffer
        else:
            fp = open(os.path.join(directory, filename), 'wb')

        # BUGFIX: the previous f-string had no placeholder, so the filename
        # was never logged.  Use lazy %-style args so the message is only
        # rendered when DEBUG logging is enabled.
        logger.debug("Generating WARC file %s", filename)
        writer = WARCWriter(
            fp,
            gzip=self.settings.getbool('WARC_GZIP', True),
            warc_version=self.settings['WARC_VERSION'],
        )

        # Standard "warcinfo" fields describing this crawl.  The optional
        # settings may be None and are passed through as-is.
        headers = {
            'hostname': self.hostname,
            'ip': self.ip_address,
            'http-header-user-agent': self.settings["USER_AGENT"],
            'robots': 'classic' if self.settings["ROBOTSTXT_OBEY"] else 'none',
            'operator': self.settings.get("WARC_OPERATOR"),
            'software': self.settings.get("WARC_SOFTWARE"),
            'isPartOf': self.settings.get("WARC_IS_PART_OF"),
            'description': self.settings.get("WARC_DESCRIPTION"),
            'format': self.settings.get("WARC_FORMAT"),
            'conformsTo': self.settings.get("WARC_CONFORMS_TO"),
        }
        warcinfo_record = writer.create_warcinfo_record(filename, headers)
        writer.write_record(warcinfo_record)
        return writer
    fo = WARCWriter(open(options.output, 'wb'), gzip=True)

# Optional secondary writer: raw PDFs are passed through into their own WARC.
if options.pdfpass is not None:
    po = WARCWriter(open(options.pdfpass, 'wb'), gzip=True)

# Only build a PDF text extractor when PDFs are NOT being passed through raw.
if not options.pdfpass and options.pdfextract:
    extractor = ExtrP()

# lxml Cleaner: strip styles/links and add rel=nofollow, but keep the page
# structure and all attributes so the stored HTML stays faithful.
cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

# Use the input name for the warcinfo record when output goes to stdout.
if options.output == sys.stdout:
    filename = options.input
else:
    filename = options.output

fo.write_record(fo.create_warcinfo_record(filename=filename, info={'software': 'bitextor/bitextor-warc2htmlwarc.py', 'format': 'WARC File Format 1.0'}))

# Iterate the input WARC records (f is presumably a warcio ArchiveIterator
# opened earlier -- TODO confirm against the full script).
for record in f:
    # Initial checks
    if record.rec_type != 'response' and record.rec_type != 'resource':
        continue
    # Some WARCs wrap the target URI in angle brackets; strip them if present.
    if record.rec_headers.get_header('WARC-Target-URI')[0] == '<' and record.rec_headers.get_header('WARC-Target-URI')[-1] == '>':
        url = record.rec_headers.get_header('WARC-Target-URI')[1:-1]
    else:
        url = record.rec_headers.get_header('WARC-Target-URI')
    if url == "unknown":
        logging.info("Skipping page with unknown URL")
        continue
    # DNS lookup records carry no page content; skip them.
    if "text/dns" in record.rec_headers.get_header('Content-Type'):
        continue
    pageSize = int(record.rec_headers.get_header('Content-Length'))
# Example #3 (scraped sample separator; original marker: "Beispiel #3 / 0")
class MHTML2WARC:
    """Convert an MHTML snapshot (multipart MIME message) into WARC records."""
    logger = logging.getLogger(__name__)

    def __init__(self, writer, gzip=True):
        """
        Wrap the output target in a WARC writer.

        ``writer`` may be an existing ``BaseWARCWriter``, a filename (opened
        here in binary mode), or any object with a ``write`` method.

        :raises Exception: if ``writer`` is none of the above.
        """
        self.fh = None
        self.writer = None
        self.filename = 'unknown'
        # True until the main page URL has been written once; see write_resource.
        self.is_first = True

        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.filename = writer
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, input_):
        """
        Read an MHTML file (path or binary file object), write a warcinfo
        record, then one resource record per MIME part.

        :raises Exception: unsupported input type, or message not multipart.
        """
        if isinstance(input_, str):
            with open(input_, 'rb') as rfh:
                message = email.message_from_binary_file(rfh, policy=email.policy.strict)
        elif hasattr(input_, 'read'):
            message = email.message_from_binary_file(input_, policy=email.policy.strict)
        else:
            raise Exception('input is in an unknown format')

        if not message.is_multipart():
            raise Exception('Invalid MHTML -- not multipart')


        # URL of the page the snapshot was taken from (snapshot-tool header).
        main_url = message.get('Snapshot-Content-Location', '')

        warc_date = self.write_warc_info(message)

        for part in message.walk():
            # The multipart container itself carries no payload; skip it.
            if part.get_content_type() == 'multipart/related':
                continue

            self.write_resource(part, main_url, warc_date)

    def write_resource(self, part, main_url, warc_date):
        """
        Write one MIME part as a WARC resource record.

        The main page is recorded once under its real URL; repeated frames of
        the same URL (and parts with no Content-Location) are recorded under
        their cid: URL.  When a part has both a real URL and a Content-ID, a
        cid: -> URL redirect record is also emitted.
        """
        content_type = part.get_content_type()
        # main_type is currently unused.
        main_type = part.get_content_maintype()
        content = part.get_payload(decode=True)

        url = part.get('Content-Location')

        warc_headers = {'WARC-Date': warc_date,
                        'WARC-Creation-Date': self.writer._make_warc_date(),
                       }

        content_id = part.get('Content-ID')
        write_redir = False

        if content_id:
            warc_headers['Content-ID'] = content_id

            # Content-ID values are wrapped in <...>; strip the brackets.
            cid_url = 'cid:' + content_id[1:-1]

            # only write main page url once under url
            # there may be additional frames for same url
            # only write them under cid
            if url == main_url:
                if self.is_first:
                    self.is_first = False
                else:
                    url = None

            if not url:
                # if cid urls not allowed, skip this resource
                # NOTE(review): allow_cid_urls is not defined in this class --
                # presumably a module-level flag; confirm against the module.
                if not allow_cid_urls:
                    return
                url = cid_url
            else:
                write_redir = True


        record = self.writer.create_warc_record(url, 'resource',
                                  payload=BytesIO(content),
                                  length=len(content),
                                  warc_content_type=content_type,
                                  warc_headers_dict=warc_headers)

        self.writer.write_record(record)

        if write_redir and allow_cid_urls:
            self.add_cid_redirect(cid_url, url)

    def add_cid_redirect(self, cid_url, url):
        """Write a synthetic HTTP 302 response redirecting cid_url to url."""
        msg = b'redirect'

        headers_list = [('Content-Type', 'text/plain'),
                        ('Content-Length', str(len(msg))),
                        ('Location', url)]

        http_headers = StatusAndHeaders('302 Redirect', headers_list, protocol='HTTP/1.0')

        record = self.writer.create_warc_record(cid_url, 'response',
                                  length=len(msg),
                                  payload=BytesIO(msg),
                                  http_headers=http_headers)

        self.writer.write_record(record)

    def write_warc_info(self, message):
        """
        Write the warcinfo record (including json-metadata describing the
        snapshot page) and return the snapshot's ISO WARC date, or '' when
        the message's Date header is missing or unparsable.
        """
        creator = message.get('From', '')

        url = message.get('Snapshot-Content-Location', '')

        title = message.get('Subject', url)


        try:
            actual_date = http_date_to_datetime(message['Date'])
            timestamp = datetime_to_timestamp(actual_date)
        except Exception:
            # Missing/invalid Date header: fall back to empty values.
            actual_date = ''
            timestamp = ''

        source = 'MHTML Snapshot for: ' + url

        software = 'mhtml2warc ' + str(__version__)

        metadata = {'title':  source,
                    'type': 'recording',
                    'pages': [{'title': title,
                               'url': url,
                               'timestamp': timestamp}]
                   }

        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('subject', title),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(self.filename, params)

        if actual_date:
            # Backdate the record to the snapshot time, preserving the real
            # creation time under WARC-Creation-Date.
            actual_date = datetime_to_iso_date(actual_date)

            creation_date = record.rec_headers.get('WARC-Date')
            record.rec_headers.replace_header('WARC-Date', actual_date)
            record.rec_headers.replace_header('WARC-Creation-Date', creation_date)

        self.writer.write_record(record)

        return actual_date
class WarcDownloader:
    """
        Download URL with HTTP GET, save to a WARC file and return the decoded text
    """
    def __init__(self,
                 expected_filename,
                 _logger,
                 warcinfo_record_data=None,
                 program_name='WebArticleCurator',
                 user_agent=None,
                 overwrite_warc=True,
                 err_threshold=10,
                 known_bad_urls=None,
                 max_no_of_calls_in_period=2,
                 limit_period=1,
                 proxy_url=None,
                 allow_cookies=False,
                 verify_request=True,
                 stay_offline=False):
        """
        Open the target WARC file, configure the HTTP session (proxy, cookie
        policy, TLS verification, rate limiting) and write the warcinfo record.
        """
        # Store variables
        self._logger = _logger
        self._req_headers = {
            'Accept-Encoding': 'identity',
            'User-agent': user_agent
        }
        self._error_count = 0
        self._error_threshold = err_threshold  # Error threshold which causes abort, to prevent denial-of-service-like hammering

        # Setup download function
        if not stay_offline:
            self.download_url = self._download_url
        else:
            self.download_url = self._dummy_download_url

        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        # URLs successfully written to the WARC in this session.
        self.good_urls = set()

        # Setup target file handle
        filename = self._set_target_filename(expected_filename, overwrite_warc)
        self._logger.log('INFO', 'Creating archivefile:', filename)
        self._output_file = open(filename, 'wb')

        self._session = Session()  # Setup session for speeding up downloads
        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies
        self._verify_request = verify_request
        if not self._verify_request:
            disable_warnings(InsecureRequestWarning)

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(
            limits(calls=max_no_of_calls_in_period,
                   period=limit_period)(self._http_get_w_cookie_handling))

        self._writer = WARCWriter(self._output_file,
                                  gzip=True,
                                  warc_version='WARC/1.1')
        if warcinfo_record_data is None:  # Use caller-supplied warcinfo data when given, else custom headers would not be copied
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            warcinfo_record_data = {
                'software':
                program_name,
                'arguments':
                ' '.join(sys.argv[1:]),
                'format':
                'WARC File Format 1.1',
                'conformsTo':
                'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1-1_latestdraft.pdf'
            }
        info_record = self._writer.create_warcinfo_record(
            filename, warcinfo_record_data)
        self._writer.write_record(info_record)

    @staticmethod
    def _set_target_filename(filename, overwrite_warc):
        """
        Return a usable target filename: the input itself when overwriting is
        allowed, otherwise the first name-NNNNN.warc(.gz) that does not exist.
        """
        if not overwrite_warc:  # Find out next nonexisting warc filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(
                    filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(
                        filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz
                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1
        return filename

    def __del__(self):
        if hasattr(
                self, '_output_file'
        ):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
            Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        """Log a per-URL failure; abort the whole run past the error threshold."""
        self._logger.log('WARNING', url, msg, sep='\t')

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            # NOTE(review): NameError is an odd exception type for this
            # condition -- kept as-is since callers may catch it.
            raise NameError(
                'Too many error happened! Threshold exceeded! See log for details!'
            )

    @staticmethod
    def _get_peer_name(resp):
        """Return the server's IP address for the WARC-IP-Address header."""
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[
                0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not determine peername...
        return peer_name

    def _dummy_download_url(self, _):
        """Offline-mode placeholder bound to download_url; downloading is disabled."""
        raise NotImplementedError

    def _download_url(self, url):
        """
        Download url, write a request/response record pair to the WARC and
        return the decoded body text; return None on any failure (bad URL,
        HTTP error, empty body, protocol error).
        """
        if url in self.bad_urls:
            self._logger.log('DEBUG', 'Not downloading known bad URL:', url)
            return None

        if url in self.good_urls:  # This should not happen!
            self._logger.log(
                'ERROR',
                'Not downloading URL, because it is already downloaded in this session:',
                url)
            return None

        scheme, netloc, path, params, query, fragment = urlparse(url)
        # For safety urlencode the generated URL... (The URL might be modified in this step.)
        path = quote(path, safe='/%')
        url_reparsed = urlunparse(
            (scheme, netloc, path, params, query, fragment))

        try:  # The actual request (on the reparsed URL, everything else is made on the original URL)
            resp = self._requests_get(url_reparsed,
                                      headers=self._req_headers,
                                      stream=True,
                                      verify=self._verify_request)
        # UnicodeError is originated from idna codec error, LocationParseError is originated from URLlib3 error
        except (UnicodeError, RequestException, LocationParseError) as err:
            self._handle_request_exception(
                url, 'RequestException happened during downloading: {0} \n\n'
                ' The program ignores it and jumps to the next one.'.format(
                    err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(
                url, 'Downloading failed with status code: {0} {1}'.format(
                    resp.status_code, resp.reason))
            return None

        # REQUEST (build headers for warc)
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(
            respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(
            urlunparse(('', '', path, params, query, fragment)), proto),
                                             reqv_headers.items(),
                                             is_http_request=True)
        reqv_record = self._writer.create_warc_record(
            url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        # resp_status need to be stripped else warcio strips the spaces and digest verification will fail!
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason).strip()
        resp_headers_list = resp.raw.headers.items(
        )  # get raw headers from urllib3
        # Must get peer_name before the content is read
        peer_name = self._get_peer_name(resp)

        try:
            data = resp.raw.read(
            )  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(
                url, 'RequestException happened during downloading: {0} \n\n'
                ' The program ignores it and jumps to the next one.'.format(
                    err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(
                url, 'RequestException happened during downloading: {0} \n\n'
                ' The program ignores it and jumps to the next one.'.format(
                    err))
            return None

        # warcio hack as \r\n is the record separator and trailing ones will be split and digest will eventually fail!
        if data.endswith(b'\r\n'):  # TODO: Warcio bugreport!
            data = data.rstrip()

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger.log('WARNING',
                             'DECODE ERROR RETRYING IN \'IGNORE\' MODE:',
                             url,
                             enc,
                             sep='\t')
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(
            data
        )  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status,
                                             resp_headers_list,
                                             protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(
            url,
            'response',
            payload=data_stream,
            http_headers=resp_http_headers,
            warc_headers_dict={
                'WARC-IP-Address': peer_name,
                'WARC-X-Detected-Encoding': enc
            })
        # Everything is OK, write the two WARC records
        self.write_record(reqv_record, url)
        self.write_record(resp_record, url)

        return text

    def write_record(self, record, url):
        """Write a record and remember its URL as successfully archived."""
        self.good_urls.add(url)
        self._writer.write_record(record)
# Example #5 (scraped sample separator; original marker: "Beispiel #5 / 0")
class HarParser(object):
    """Convert a HAR (HTTP Archive) capture into WARC request/response records."""
    logger = logging.getLogger(__name__)

    def __init__(self, reader, writer, gzip=True):
        """
        reader: HAR source -- a filename, a readable object, or a parsed dict.
        writer: WARC sink -- a BaseWARCWriter, a filename, or a writable object.

        :raises Exception: when either argument is of an unsupported type.
        """
        if isinstance(reader, str):
            with codecs.open(reader, encoding='utf-8') as fh:
                self.har = json.loads(fh.read())
        elif hasattr(reader, 'read'):
            self.har = json.loads(reader.read())
        elif isinstance(reader, dict):
            self.har = reader
        else:
            raise Exception('reader is in an unknown format')

        self.fh = None
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, out_filename=None, rec_title=None):
        """Write a warcinfo record, then one request/response pair per HAR entry."""
        out_filename = out_filename or 'har.warc.gz'
        rec_title = rec_title or 'HAR Recording'
        metadata = self.create_wr_metadata(self.har['log'], rec_title)
        self.write_warc_info(self.har['log'], out_filename, metadata)

        for entry in self.har['log']['entries']:
            self.parse_entry(entry)

        # Only close the file handle if we opened it ourselves in __init__.
        if self.fh:
            self.fh.close()

    def parse_entry(self, entry):
        """Convert one HAR entry into a WARC request/response record pair."""
        url = entry['request']['url']

        response = self.parse_response(url,
                                        entry['response'],
                                        entry.get('serverIPAddress'))

        #TODO: support WARC/1.1 arbitrary precision dates!
        warc_date = entry['startedDateTime'][:19] + 'Z'

        response.rec_headers.replace_header('WARC-Date', warc_date)

        request = self.parse_request(entry['request'])

        self.writer.write_request_response_pair(request, response)


    def create_wr_metadata(self, log, rec_title):
        """Build Webrecorder-style json-metadata (a page list) from HAR pages."""
        pagelist = []

        for page in log['pages']:
            # Page titles that are not URLs cannot serve as page URLs; skip them.
            if not page['title'].startswith(('http:', 'https:')):
                continue

            pagelist.append(dict(title=page['title'],
                                 url=page['title'],
                                 timestamp=iso_date_to_timestamp(page['startedDateTime'])))

        metadata = {"title": rec_title,
                    "type": "recording",
                   }

        if pagelist:
            metadata["pages"] = pagelist

        return metadata

    def write_warc_info(self, log, filename, metadata):
        """Write the warcinfo record describing the HAR source and this tool."""
        creator = '{0} {1}'.format(log['creator']['name'],
                                   log['creator']['version'])

        source = 'HAR Format {0}'.format(log['version'])

        software = 'har2warc ' + str(__version__)

        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(filename, params)
        self.writer.write_record(record)

    def _get_http_version(self, entry):
        """Normalize the HAR httpVersion field; anything else becomes HTTP/1.1."""
        http_version = entry.get('httpVersion')
        if not http_version or http_version.upper() not in ('HTTP/1.1', 'HTTP/1.0'):
            http_version = 'HTTP/1.1'

        return http_version

    def parse_response(self, url, response, ip=None):
        """
        Build a WARC response record from a HAR response object.

        The payload may be base64-encoded in the HAR; content-encoding and
        transfer-encoding headers are dropped because the stored payload is
        already decoded.  ip, when given, becomes WARC-IP-Address.
        """
        headers = []
        payload = BytesIO()
        content = response['content'].get('text', '')

        if not content and not response.get('headers'):
            self.logger.info('No headers or payload for: {0}'.format(url))
            headers.append(('Content-Length', '0'))
        if response['content'].get('encoding') == 'base64':
            payload.write(base64.b64decode(content))
        else:
            payload.write(content.encode('utf-8'))

        length = payload.tell()
        payload.seek(0)

        # Headers describing an encoding the stored payload no longer has.
        SKIP_HEADERS = ('content-encoding', 'transfer-encoding')

        http2 = False

        for header in response['headers']:
            if header['name'].lower() not in SKIP_HEADERS:
                headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if (not http2 and
                header['name'] in (':method', ':scheme', ':path')):
                http2 = True

        # A missing/zero status is recorded as 204 No Content.
        status = response.get('status') or 204

        reason = response.get('statusText')
        if not reason:
            reason = http_status_names.get(status, 'No Reason')

        status_line = str(status) + ' ' + reason

        proto = self._get_http_version(response)

        http_headers = StatusAndHeaders(status_line, headers, protocol=proto)

        if not content:
            # Keep Content-Length consistent with the (empty) stored payload.
            content_length = http_headers.get_header('Content-Length', '0')
            if content_length != '0':
                self.logger.info('No Content for length {0} {1}'.format(content_length, url))
                http_headers.replace_header('Content-Length', '0')
        else:
            http_headers.replace_header('Content-Length', str(length))

        warc_headers_dict = {}
        if ip:
            warc_headers_dict['WARC-IP-Address'] = ip

        record = self.writer.create_warc_record(url, 'response',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length,
                                                warc_headers_dict=warc_headers_dict)

        return record

    def parse_request(self, request):
        """
        Build a WARC request record from a HAR request object, reconstructing
        the query string and, for HTTP/2-style captures, a Host header.
        """
        parts = urlsplit(request['url'])

        path = parts.path
        query = request.get('queryString')
        if query:
            path += '?' + urlencode(dict((p['name'], p['value'])
                                    for p in query))

        headers = []
        http2 = False

        for header in request['headers']:
            headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if (not http2 and
                header['name'] in (':method', ':scheme', ':path')):
                http2 = True

        if http2:
            # HTTP/2 pseudo-headers carry no Host; synthesize one from the URL.
            headers.append(('Host', parts.netloc))

        http_version = self._get_http_version(request)

        status_line = request['method'] + ' ' + path + ' ' + http_version
        http_headers = StatusAndHeaders(status_line, headers)

        payload = None
        length = 0

        if request['bodySize'] > 0:
            payload = BytesIO()
            payload.write(request['postData']['text'].encode('utf-8'))
            length = payload.tell()
            payload.seek(0)

        record = self.writer.create_warc_record(request['url'], 'request',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length)

        return record
# Example #6 (scraped sample separator; original marker: "Beispiel #6 / 0")
class CCWARCWriter:
    """
    Write WARC records (warcinfo, dns, request/response pairs) to rolling,
    optionally gzipped WARC files named prefix[-subprefix]-serial-hostname.warc[.gz].

    A file is opened lazily on the first write and rolled over once it grows
    past max_size (see maybe_close).
    """

    def __init__(self, prefix, max_size, subprefix=None, gzip=True, get_serial=None):
        """
        prefix/subprefix: filename stem components; max_size: rollover size in
        bytes; gzip: write gzipped WARC; get_serial: optional external callable
        producing the serial component of the filename.
        """
        self.writer = None  # opened lazily by open()
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        # Best-effort close of the underlying file on garbage collection;
        # self.f exists whenever self.writer has been set by open().
        if self.writer is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            warcheader_version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        Build and remember the warcinfo payload written at the top of each file.

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = ('cocrawler/' + version +
                            ' cocrawler_warcheader_version/' + warcheader_version)
        info['hostname'] = self.hostname
        info['ip'] = ip
        # Optional descriptive fields are only present when supplied (truthy).
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        """Open the next WARC file in the sequence and write its warcinfo record."""
        filename = self.prefix
        if self.subprefix:
            filename += '-' + str(self.subprefix)  # don't let yaml leave this as an int
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        """Return the next zero-padded serial, or delegate to the external callable."""
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        Roll over: close the current file once it exceeds max_size; the next
        write reopens a fresh file via open().
        TODO: always close/reopen if subprefix is not None; to minimize open filehandles?
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, ttl, url):
        """Write the DNS answers for url's hostname as a text/dns resource record."""
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(ttl)
        host = url.hostname

        if self.writer is None:
            self.open()

        # Zone-file-style payload: a timestamp line, then one line per answer.
        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += '\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
            except Exception as e:
                # BUGFIX: use %-style lazy placeholders.  The previous call
                # passed extra positional args with no placeholders in the
                # message, which makes logging raise a formatting error and
                # drop the message entirely.
                LOGGER.info('problem converting dns reply for warcing %s %s %s',
                            host, r, e)
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        """
        Rewrite response headers so they describe the stored body: headers that
        no longer match (wrong Content-Length, undone Content-Encoding or
        chunked Transfer-Encoding) are preserved under an X-Crawler- prefix.
        """
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            if hl == b'content-length':
                if not (v.isdigit() and int(v) == body_len):
                    ret.append((prefix + h, v))
                    # NOTE(review): str(body_len) yields a str value amid bytes
                    # headers, and a *matching* Content-Length is dropped
                    # entirely (no else branch).  Behavior kept as-is; confirm
                    # intent against warcio's header handling.
                    ret.append((b'Content-Length', str(body_len)))
            elif hl == b'content-encoding':
                if decompressed:
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            elif hl == b'transfer-encoding':
                if v.lower() == b'chunked':
                    # aiohttp always undoes chunking
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            else:
                ret.append((h, v))
        return ret

    def write_request_response_pair(self,
                                    url,
                                    ip,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None,
                                    decompressed=False):
        """Write a request/response record pair for url and maybe roll the file."""
        if self.writer is None:
            self.open()

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers,
                                                    len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK',
                                             fake_resp_headers,
                                             protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
# Example #7 (scraped sample separator; original marker: "Beispiel #7 / 0")
class CCWARCWriter:
    """
    Write WARC files named in the Common Crawl style:
    <prefix>[-<subprefix>]-<serial>-<hostname>.warc[.gz].

    The underlying file is opened lazily on first write and rolled over to a
    new serial once it grows past max_size (checked in maybe_close()).
    """
    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        """
        prefix: first filename component; also recorded as warcinfo isPartOf
        max_size: rollover threshold in bytes
        subprefix: optional extra filename component
        gzip: write gzip-compressed WARC records
        get_serial: optional callable(filename) -> serial string, e.g. to
                    coordinate serials across multiple writer processes
        """
        self.writer = None  # created lazily by open()
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        # best-effort close of the current output file on teardown;
        # self.f exists whenever self.writer is not None (both set in open())
        if self.writer is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        Build (and remember) the warcinfo header dict written by open().

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        """Open the next WARC file and write its warcinfo record."""
        filename = self.prefix
        if self.subprefix:
            filename += '-' + self.subprefix
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        """Return the next serial as a zero-padded 6-digit string (or delegate)."""
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        Roll over to a new file (on next write) once max_size is exceeded.

        TODO: always close/reopen if subprefix is not None; minimizes open filehandles
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, expires, url):
        """
        Write the DNS lookup for url as a 'resource' record of type text/dns.

        dns: iterable of reply dicts, each expected to carry a 'host' entry
        expires: absolute expiry time; TTL is derived relative to now
        url: crawled URL object; only url.hostname is used
        """
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns repsonse?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(expires - time.time())
        host = url.hostname

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += '{}.\t{}\tIN\t{}\t{}\r\n'.format(host, ttl, kind, r['host'])
            except Exception as e:
                # bug fix: the original passed host/r/e as extra positional args
                # with no %-placeholders in the message, which makes the logging
                # module raise an internal formatting error instead of logging them
                LOGGER.info('problem converting dns reply for warcing: host=%s reply=%r error=%s',
                            host, r, e)
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self,
                                    url,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None):
        """
        Write a fabricated request record plus the real response for url.

        is_truncated: falsy, or one of valid_truncations (else 'unspecified')
        digest: optional value for the WARC-Payload-Digest header
        """
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        req_http_headers = StatusAndHeaders(
            'GET / HTTP/1.1', headers_to_str_headers(req_headers))

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders(
            '200 OK',
            headers_to_str_headers(resp_headers),
            protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
# Beispiel #8
# 0
class WarcDownloader:
    """
        Download URL with HTTP GET, save to a WARC file and return the decoded text
    """
    def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
                 err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
                 limit_period=1, proxy_url=None, allow_cookies=False):
        """
            Open the output WARC file, set up the rate-limited requests session
            and write the initial warcinfo record.

            filename: output WARC path (expected to be name.warc.gz)
            logger_: object exposing log(level, message)
            overwrite_warc: if False, pick the next nonexisting numbered filename
            err_threshold: number of download errors after which NameError is raised
            warcinfo_record_data: optional (warc_headers, headers_dict) pair used
                to recreate an existing warcinfo record instead of a default one
            known_bad_urls: optional path to a file of URLs to skip (one per line)
            max_no_of_calls_in_period / limit_period: rate-limit configuration
            proxy_url: optional proxy applied to both http and https
            allow_cookies: keep session cookies between requests if True
        """
        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        if not overwrite_warc:  # Find out next nonexisting warc filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz

                # NOTE(review): on later iterations the -NNNNN suffix accumulates
                # (name-00000-00001.warc.gz); it still ends on an unused name
                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1

        logger_.log('INFO', 'Creating archivefile: {0}'.format(filename))

        self._output_file = open(filename, 'wb')
        self._logger_ = logger_
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}

        self._session = Session()  # Setup session for speeding up downloads

        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))
        self._error_count = 0
        self._error_threshold = err_threshold  # Error threshold which causes aborting, to prevent denial of service

        self._writer = WARCWriter(self._output_file, gzip=True)
        if warcinfo_record_data is None:
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            info_headers = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                            'format': 'WARC File Format 1.0',
                            'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
            info_record = self._writer.create_warcinfo_record(filename, info_headers)
        else:  # Must recreate custom headers else they will not be copied
            custom_headers = ''.join('{0}: {1}\r\n'.format(k, v) for k, v in warcinfo_record_data[1].items()).\
                             encode('UTF-8')
            info_record = self._writer.create_warc_record('', 'warcinfo', warc_headers=warcinfo_record_data[0],
                                                          payload=BytesIO(custom_headers),
                                                          length=len(custom_headers))
        self._writer.write_record(info_record)

    def __del__(self):
        if hasattr(self, '_output_file'):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
            Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        self._logger_.log('WARNING', '\t'.join((url, msg)))

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many error happened! Threshold exceeded! See log for details!')

    def download_url(self, url):
        """
            Download url with a rate-limited HTTP GET, write a request/response
            WARC record pair, and return the decoded body text.
            Returns None (after logging) for known-bad URLs, request errors,
            non-200 statuses and empty responses.
        """
        scheme, netloc, path, params, query, fragment = urlparse(url)
        path = quote(path)  # For safety urlencode the generated URL...
        url = urlunparse((scheme, netloc, path, params, query, fragment))

        if url in self.bad_urls:
            self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
            return None

        try:  # The actual request
            resp = self._requests_get(url, headers=self._req_headers, stream=True)
        except RequestException as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(resp.status_code,
                                                                                                      resp.reason))
            return None

        # REQUEST
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        # NOTE(review): respv_str presumably maps urllib3's raw version int
        # (e.g. 10/11) to '1.0'/'1.1' -- it is defined elsewhere in this module
        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                                  proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not derermine peername...

        try:
            # read the raw (undecoded) body once; it is reused for both the
            # returned text and the WARC payload below
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            # NOTE(review): detect() may itself return None for undecodable
            # data, which would make data.decode(enc) raise TypeError -- confirm
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})
        # Everything is OK, write the two WARC records
        self._writer.write_record(reqv_record)
        self._writer.write_record(resp_record)

        return text

    def write_record(self, record):
        self._writer.write_record(record)