def build_writer(self):
    """
    Initialize a new WARC file and write the "warcinfo" header.

    Opens the target file (or stdout's binary buffer in debug mode),
    wraps it in a WARCWriter configured from the crawler settings,
    writes the leading "warcinfo" record and returns the writer.

    :return: the configured WARCWriter, positioned after the warcinfo record
    """
    directory = self.settings.get('WARC_FILE_DIRECTORY', '.')
    filename = self.build_filename()
    if self.debug:
        # Debug mode streams the (binary) WARC data to stdout instead of a file
        fp = sys.stdout.buffer
    else:
        fp = open(os.path.join(directory, filename), 'wb')
    # Fix: the previous f-string had no placeholder and never mentioned which
    # file was created; use lazy %-formatting so the name is only rendered
    # when DEBUG is enabled.
    logger.debug('Generating WARC file %s', filename)
    writer = WARCWriter(
        fp,
        gzip=self.settings.getbool('WARC_GZIP', True),
        warc_version=self.settings['WARC_VERSION'],
    )
    # Metadata for the warcinfo record; the optional WARC_* fields may be None
    # when the corresponding setting is unset.
    headers = {
        'hostname': self.hostname,
        'ip': self.ip_address,
        'http-header-user-agent': self.settings["USER_AGENT"],
        'robots': 'classic' if self.settings["ROBOTSTXT_OBEY"] else 'none',
        'operator': self.settings.get("WARC_OPERATOR"),
        'software': self.settings.get("WARC_SOFTWARE"),
        'isPartOf': self.settings.get("WARC_IS_PART_OF"),
        'description': self.settings.get("WARC_DESCRIPTION"),
        'format': self.settings.get("WARC_FORMAT"),
        'conformsTo': self.settings.get("WARC_CONFORMS_TO"),
    }
    warcinfo_record = writer.create_warcinfo_record(filename, headers)
    writer.write_record(warcinfo_record)
    return writer
# Open the output WARC (gzip-compressed); `options` is presumably the parsed
# CLI arguments from an argparse/optparse setup earlier in the file -- TODO confirm.
fo = WARCWriter(open(options.output, 'wb'), gzip=True)
if options.pdfpass is not None:
    # Separate WARC that receives PDF records unchanged ("pass-through")
    po = WARCWriter(open(options.pdfpass, 'wb'), gzip=True)
if not options.pdfpass and options.pdfextract:
    # PDF text extractor is only needed when PDFs are extracted, not passed through
    extractor = ExtrP()
# lxml-style HTML cleaner: strips <style>, rewrites links, keeps page structure
cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)
# NOTE(review): this compares a (string?) output option against the sys.stdout
# object -- looks like it should be a "-"/stdout sentinel check; verify upstream.
if options.output == sys.stdout:
    filename = options.input
else:
    filename = options.output
fo.write_record(fo.create_warcinfo_record(filename=filename, info={'software': 'bitextor/bitextor-warc2htmlwarc.py', 'format': 'WARC File Format 1.0'}))
# `f` is presumably an ArchiveIterator over the input WARC, created earlier -- TODO confirm.
for record in f:
    # Initial checks
    if record.rec_type != 'response' and record.rec_type != 'resource':
        continue
    # Some WARCs wrap the target URI in angle brackets; unwrap if present
    if record.rec_headers.get_header('WARC-Target-URI')[0] == '<' and record.rec_headers.get_header('WARC-Target-URI')[-1] == '>':
        url = record.rec_headers.get_header('WARC-Target-URI')[1:-1]
    else:
        url = record.rec_headers.get_header('WARC-Target-URI')
    if url == "unknown":
        logging.info("Skipping page with unknown URL")
        continue
    # DNS records carry no page content
    if "text/dns" in record.rec_headers.get_header('Content-Type'):
        continue
    # Declared payload size; presumably used by the (not shown) rest of the loop
    pageSize = int(record.rec_headers.get_header('Content-Length'))
class MHTML2WARC:
    """Convert an MHTML (multipart MIME) page snapshot into WARC records.

    Accepts a ready-made warcio writer, a filename, or a writable stream as
    the output target; parses the MHTML with the stdlib ``email`` package and
    writes one ``resource`` record per MIME part.
    """
    logger = logging.getLogger(__name__)

    def __init__(self, writer, gzip=True):
        # writer may be: an existing BaseWARCWriter, a filename (str),
        # or any object with a write() method (wrapped in a WARCWriter).
        self.fh = None
        self.writer = None
        self.filename = 'unknown'
        self.is_first = True
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.filename = writer
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, input_):
        """Parse MHTML from a filename or binary stream and emit WARC records."""
        if isinstance(input_, str):
            with open(input_, 'rb') as rfh:
                message = email.message_from_binary_file(rfh, policy=email.policy.strict)
        elif hasattr(input_, 'read'):
            message = email.message_from_binary_file(input_, policy=email.policy.strict)
        else:
            raise Exception('input is in an unknown format')

        if not message.is_multipart():
            raise Exception('Invalid MHTML -- not multipart')

        # Chrome-style snapshot header naming the captured page
        main_url = message.get('Snapshot-Content-Location', '')
        warc_date = self.write_warc_info(message)

        for part in message.walk():
            # the multipart container itself carries no payload
            if part.get_content_type() == 'multipart/related':
                continue
            self.write_resource(part, main_url, warc_date)

    def write_resource(self, part, main_url, warc_date):
        """Write one MIME part as a WARC 'resource' record (plus cid redirect)."""
        content_type = part.get_content_type()
        main_type = part.get_content_maintype()  # NOTE(review): unused in this method
        content = part.get_payload(decode=True)
        url = part.get('Content-Location')
        warc_headers = {'WARC-Date': warc_date,
                        'WARC-Creation-Date': self.writer._make_warc_date(),
                        }
        content_id = part.get('Content-ID')
        write_redir = False
        if content_id:
            warc_headers['Content-ID'] = content_id
            # Content-ID is wrapped in <...>; strip the brackets for the cid: URL
            cid_url = 'cid:' + content_id[1:-1]
            # only write main page url once under url
            # there may be additional frames for same url
            # only write them under cid
            if url == main_url:
                if self.is_first:
                    self.is_first = False
                else:
                    url = None
            if not url:
                # if cid urls not allowed, skip this resource
                # (allow_cid_urls is presumably a module-level flag -- TODO confirm)
                if not allow_cid_urls:
                    return
                url = cid_url
            else:
                write_redir = True
        record = self.writer.create_warc_record(url, 'resource',
                                                payload=BytesIO(content),
                                                length=len(content),
                                                warc_content_type=content_type,
                                                warc_headers_dict=warc_headers)
        self.writer.write_record(record)
        if write_redir and allow_cid_urls:
            self.add_cid_redirect(cid_url, url)

    def add_cid_redirect(self, cid_url, url):
        """Write a synthetic 302 response redirecting a cid: URL to its real URL."""
        msg = b'redirect'
        headers_list = [('Content-Type', 'text/plain'),
                        ('Content-Length', str(len(msg))),
                        ('Location', url)]
        http_headers = StatusAndHeaders('302 Redirect', headers_list, protocol='HTTP/1.0')
        record = self.writer.create_warc_record(cid_url, 'response',
                                                length=len(msg),
                                                payload=BytesIO(msg),
                                                http_headers=http_headers)
        self.writer.write_record(record)

    def write_warc_info(self, message):
        """Write the warcinfo record built from the MHTML headers.

        Returns the snapshot's capture date (ISO string) when the MHTML 'Date'
        header parses, else '' -- this value is reused as WARC-Date for parts.
        """
        creator = message.get('From', '')
        url = message.get('Snapshot-Content-Location', '')
        title = message.get('Subject', url)
        try:
            actual_date = http_date_to_datetime(message['Date'])
            timestamp = datetime_to_timestamp(actual_date)
        except Exception:
            # missing/unparseable Date header: fall back to empty markers
            actual_date = ''
            timestamp = ''
        source = 'MHTML Snapshot for: ' + url
        software = 'mhtml2warc ' + str(__version__)
        metadata = {'title': source,
                    'type': 'recording',
                    'pages': [{'title': title,
                               'url': url,
                               'timestamp': timestamp}]
                    }
        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('subject', title),
                              ('json-metadata', json.dumps(metadata))])
        record = self.writer.create_warcinfo_record(self.filename, params)
        if actual_date:
            # Back-date the record to the snapshot time; keep the writer's
            # own timestamp under WARC-Creation-Date.
            actual_date = datetime_to_iso_date(actual_date)
            creation_date = record.rec_headers.get('WARC-Date')
            record.rec_headers.replace_header('WARC-Date', actual_date)
            record.rec_headers.replace_header('WARC-Creation-Date', creation_date)
        self.writer.write_record(record)
        return actual_date
class WarcDownloader:
    """
    Download URL with HTTP GET, save to a WARC file and return the decoded text
    """

    def __init__(self, expected_filename, _logger, warcinfo_record_data=None, program_name='WebArticleCurator',
                 user_agent=None, overwrite_warc=True, err_threshold=10, known_bad_urls=None,
                 max_no_of_calls_in_period=2, limit_period=1, proxy_url=None, allow_cookies=False,
                 verify_request=True, stay_offline=False):
        # Store variables
        self._logger = _logger
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}
        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which cause aborting to prevent deinal

        # Setup download function
        if not stay_offline:
            self.download_url = self._download_url
        else:
            self.download_url = self._dummy_download_url

        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        self.good_urls = set()

        # Setup target file handle
        filename = self._set_target_filename(expected_filename, overwrite_warc)
        self._logger.log('INFO', 'Creating archivefile:', filename)
        self._output_file = open(filename, 'wb')

        self._session = Session()  # Setup session for speeding up downloads
        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies
        self._verify_request = verify_request
        if not self._verify_request:
            # TLS verification disabled: silence urllib3's warning spam
            disable_warnings(InsecureRequestWarning)

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))

        self._writer = WARCWriter(self._output_file, gzip=True, warc_version='WARC/1.1')
        if warcinfo_record_data is None:  # Or use the parsed else custom headers will not be copied
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            warcinfo_record_data = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                                    'format': 'WARC File Format 1.1',
                                    'conformsTo':
                                        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1-1_latestdraft.pdf'}
        info_record = self._writer.create_warcinfo_record(filename, warcinfo_record_data)
        self._writer.write_record(info_record)

    @staticmethod
    def _set_target_filename(filename, overwrite_warc):
        """Return a filename that does not yet exist (suffix -00000, -00001, ...)
        when overwriting is disabled; otherwise return the name unchanged."""
        if not overwrite_warc:  # Find out next nonexisting warc filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz
                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1
        return filename

    def __del__(self):
        if hasattr(self, '_output_file'):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
        Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        """Log a download failure and abort the whole run once the threshold is hit."""
        self._logger.log('WARNING', url, msg, sep='\t')

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many error happened! Threshold exceeded! See log for details!')

    @staticmethod
    def _get_peer_name(resp):
        """Best-effort extraction of the remote peer's IP from a streamed response."""
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not derermine peername...
        return peer_name

    def _dummy_download_url(self, _):
        # Placeholder used in stay_offline mode; deliberately unimplemented
        raise NotImplementedError

    def _download_url(self, url):
        """Download one URL, write request+response WARC records, return decoded text.

        Returns None on any failure (known-bad URL, duplicate, HTTP error,
        network error, empty body).
        """
        if url in self.bad_urls:
            self._logger.log('DEBUG', 'Not downloading known bad URL:', url)
            return None

        if url in self.good_urls:  # This should not happen!
            self._logger.log('ERROR', 'Not downloading URL, because it is already downloaded in this session:', url)
            return None

        scheme, netloc, path, params, query, fragment = urlparse(url)
        # For safety urlencode the generated URL... (The URL might by modified in this step.)
        path = quote(path, safe='/%')
        url_reparsed = urlunparse((scheme, netloc, path, params, query, fragment))

        try:  # The actual request (on the reparsed URL, everything else is made on the original URL)
            resp = self._requests_get(url_reparsed, headers=self._req_headers, stream=True,
                                      verify=self._verify_request)
        # UnicodeError is originated from idna codec error, LocationParseError is originated from URLlib3 error
        except (UnicodeError, RequestException, LocationParseError) as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(
                resp.status_code, resp.reason))
            return None

        # REQUEST (build headers for warc)
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(
            urlunparse(('', '', path, params, query, fragment)), proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        # resp_status need to be stripped else warcio strips the spaces and digest verification will fail!
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason).strip()
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3

        # Must get peer_name before the content is read
        peer_name = self._get_peer_name(resp)

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        # warcio hack as \r\n is the record separator and trailing ones will be split and digest will eventually fail!
        if data.endswith(b'\r\n'):  # TODO: Warcio bugreport!
            data = data.rstrip()

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger.log('WARNING', 'DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc, sep='\t')
            text = data.decode(enc, 'ignore')

        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})
        # Everything is OK, write the two WARC records
        self.write_record(reqv_record, url)
        self.write_record(resp_record, url)

        return text

    def write_record(self, record, url):
        """Write a record and remember the URL as successfully archived."""
        self.good_urls.add(url)
        self._writer.write_record(record)
class HarParser(object):
    """Convert a HAR (HTTP Archive, JSON) capture into WARC request/response pairs."""
    logger = logging.getLogger(__name__)

    def __init__(self, reader, writer, gzip=True):
        # reader may be a filename, a readable object, or an already-parsed dict
        if isinstance(reader, str):
            with codecs.open(reader, encoding='utf-8') as fh:
                self.har = json.loads(fh.read())
        elif hasattr(reader, 'read'):
            self.har = json.loads(reader.read())
        elif isinstance(reader, dict):
            self.har = reader
        else:
            raise Exception('reader is in an unknown format')

        # writer may be an existing warcio writer, a filename, or a writable stream
        self.fh = None
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, out_filename=None, rec_title=None):
        """Drive the conversion: warcinfo first, then one record pair per entry."""
        out_filename = out_filename or 'har.warc.gz'
        rec_title = rec_title or 'HAR Recording'

        metadata = self.create_wr_metadata(self.har['log'], rec_title)
        self.write_warc_info(self.har['log'], out_filename, metadata)

        for entry in self.har['log']['entries']:
            self.parse_entry(entry)

        # only close the handle we opened ourselves
        if self.fh:
            self.fh.close()

    def parse_entry(self, entry):
        """Convert one HAR entry into a request/response WARC record pair."""
        url = entry['request']['url']
        response = self.parse_response(url, entry['response'], entry.get('serverIPAddress'))
        #TODO: support WARC/1.1 arbitrary precision dates!
        warc_date = entry['startedDateTime'][:19] + 'Z'  # truncate to whole seconds
        response.rec_headers.replace_header('WARC-Date', warc_date)
        request = self.parse_request(entry['request'])
        self.writer.write_request_response_pair(request, response)

    def create_wr_metadata(self, log, rec_title):
        """Build the json-metadata page list (only pages whose title is a URL)."""
        pagelist = []
        for page in log['pages']:
            if not page['title'].startswith(('http:', 'https:')):
                continue
            pagelist.append(dict(title=page['title'],
                                 url=page['title'],
                                 timestamp=iso_date_to_timestamp(page['startedDateTime'])))
        metadata = {"title": rec_title,
                    "type": "recording",
                    }
        if pagelist:
            metadata["pages"] = pagelist
        return metadata

    def write_warc_info(self, log, filename, metadata):
        """Write the warcinfo record describing the HAR source and this converter."""
        creator = '{0} {1}'.format(log['creator']['name'], log['creator']['version'])
        source = 'HAR Format {0}'.format(log['version'])
        software = 'har2warc ' + str(__version__)
        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps(metadata))])
        record = self.writer.create_warcinfo_record(filename, params)
        self.writer.write_record(record)

    def _get_http_version(self, entry):
        """Normalize the HAR httpVersion field; anything unusual becomes HTTP/1.1."""
        http_version = entry.get('httpVersion')
        if not http_version or http_version.upper() not in ('HTTP/1.1', 'HTTP/1.0'):
            http_version = 'HTTP/1.1'
        return http_version

    def parse_response(self, url, response, ip=None):
        """Build a WARC 'response' record from a HAR response object."""
        headers = []
        payload = BytesIO()
        content = response['content'].get('text', '')
        if not content and not response.get('headers'):
            self.logger.info('No headers or payload for: {0}'.format(url))
            headers.append(('Content-Length', '0'))
        if response['content'].get('encoding') == 'base64':
            payload.write(base64.b64decode(content))
        else:
            payload.write(content.encode('utf-8'))
        length = payload.tell()
        payload.seek(0)
        # drop transport-level headers: the payload here is already decoded
        SKIP_HEADERS = ('content-encoding', 'transfer-encoding')
        http2 = False
        for header in response['headers']:
            if header['name'].lower() not in SKIP_HEADERS:
                headers.append((header['name'], header['value']))
            #TODO: http2 detection -- write as same warc header?
            if (not http2 and header['name'] in (':method', ':scheme', ':path')):
                http2 = True
        status = response.get('status') or 204
        reason = response.get('statusText')
        if not reason:
            reason = http_status_names.get(status, 'No Reason')
        status_line = str(status) + ' ' + reason
        proto = self._get_http_version(response)
        http_headers = StatusAndHeaders(status_line, headers, protocol=proto)
        if not content:
            # header claims a body but HAR carries none: zero it out
            content_length = http_headers.get_header('Content-Length', '0')
            if content_length != '0':
                self.logger.info('No Content for length {0} {1}'.format(content_length, url))
                http_headers.replace_header('Content-Length', '0')
        else:
            http_headers.replace_header('Content-Length', str(length))
        warc_headers_dict = {}
        if ip:
            warc_headers_dict['WARC-IP-Address'] = ip
        record = self.writer.create_warc_record(url, 'response',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length,
                                                warc_headers_dict=warc_headers_dict)
        return record

    def parse_request(self, request):
        """Build a WARC 'request' record from a HAR request object."""
        parts = urlsplit(request['url'])
        path = parts.path
        query = request.get('queryString')
        if query:
            path += '?' + urlencode(dict((p['name'], p['value']) for p in query))
        headers = []
        http2 = False
        for header in request['headers']:
            headers.append((header['name'], header['value']))
            #TODO: http2 detection -- write as same warc header?
            if (not http2 and header['name'] in (':method', ':scheme', ':path')):
                http2 = True
        if http2:
            # HTTP/2 pseudo-headers carry no Host; synthesize one for HTTP/1.x form
            headers.append(('Host', parts.netloc))
        http_version = self._get_http_version(request)
        status_line = request['method'] + ' ' + path + ' ' + http_version
        http_headers = StatusAndHeaders(status_line, headers)
        payload = None
        length = 0
        if request['bodySize'] > 0:
            payload = BytesIO()
            payload.write(request['postData']['text'].encode('utf-8'))
            length = payload.tell()
            payload.seek(0)
        record = self.writer.create_warc_record(request['url'], 'request',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length)
        return record
class CCWARCWriter:
    """Size-rotated WARC writer for the crawler.

    Files are opened lazily on first write, named
    ``{prefix}[-{subprefix}]-{serial}-{hostname}.warc[.gz]`` and closed once
    they exceed ``max_size`` (a new file is opened on the next write).
    """

    def __init__(self, prefix, max_size, subprefix=None, gzip=True, get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            # caller-supplied serial-number generator (e.g. shared across processes)
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
        self.serial = 0

    def __del__(self):
        # writer is only non-None after open() created self.f
        if self.writer is not None:
            self.f.close()

    def create_default_info(self, version, warcheader_version, ip, description=None, creator=None, operator=None):
        '''
        Build and remember the warcinfo header dict written at the top of each file.

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = 'cocrawler/' + version + ' cocrawler_warcheader_version/' + warcheader_version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        """Open the next WARC file and write its warcinfo record."""
        filename = self.prefix
        if self.subprefix:
            filename += '-' + str(self.subprefix)  # don't let yaml leave this as an int
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        """Return the next zero-padded serial, delegating to the external source if set."""
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        Close the current file once it has grown past max_size; the next
        write reopens a fresh one.

        TODO: always close/reopen if subprefix is not None; to minimize open filehandles?
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, ttl, url):
        """Write a dns: resource record for url.hostname from resolver results."""
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns repsonse?
        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6
        ttl = int(ttl)
        host = url.hostname
        if self.writer is None:
            self.open()
        payload = timestamp_now() + '\r\n'
        for r in dns:
            try:
                payload += '\t'.join((host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
            except Exception as e:
                # Fix: the original passed positional args to LOGGER.info with no
                # %s placeholders, which makes the logging module emit a
                # formatting error instead of the message.
                LOGGER.info('problem converting dns reply for warcing %s %s %s', host, r, e)
        payload = payload.encode('utf-8')
        record = self.writer.create_warc_record('dns:' + host, 'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))
        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s', p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        """Rewrite stored response headers so they describe the saved payload.

        Headers that no longer match what was written (wrong Content-Length,
        Content-Encoding of a decompressed body, undone chunking) are preserved
        under an X-Crawler- prefix instead of being dropped.
        """
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            if hl == b'content-length':
                if not (v.isdigit() and int(v) == body_len):
                    ret.append((prefix + h, v))
                    ret.append((b'Content-Length', str(body_len)))
                else:
                    ret.append((h, v))
            elif hl == b'content-encoding':
                if decompressed:
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            elif hl == b'transfer-encoding':
                if v.lower() == b'chunked':  # aiohttp always undoes chunking
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            else:
                ret.append((h, v))
        return ret

    def write_request_response_pair(self, url, ip, req_headers, resp_headers, is_truncated, payload, digest=None,
                                    decompressed=False):
        """Write a GET request record and its response record for one fetch."""
        if self.writer is None:
            self.open()

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)

        request = self.writer.create_warc_record('http://example.com/', 'request',
                                                 http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers, len(payload), decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK', fake_resp_headers, protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(url, 'response',
                                                  payload=BytesIO(payload),
                                                  length=len(payload),
                                                  warc_headers_dict=warc_headers_dict,
                                                  http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s', p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
class CCWARCWriter:
    """Size-rotated WARC writer: files open lazily, rotate past max_size,
    and are named {prefix}[-{subprefix}]-{serial}-{hostname}.warc[.gz]."""

    def __init__(self, prefix, max_size, subprefix=None, gzip=True, get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            # caller-supplied serial-number source (e.g. shared across workers)
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
        self.serial = 0

    def __del__(self):
        # writer is only non-None after open() created self.f
        if self.writer is not None:
            self.f.close()

    def create_default_info(self, version, ip, description=None, creator=None, operator=None):
        '''
        Build and remember the warcinfo header dict written at the top of each file.

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        """Open the next WARC file and write its warcinfo record."""
        filename = self.prefix
        if self.subprefix:
            # NOTE(review): unlike the sibling class, no str() around subprefix here;
            # a non-str subprefix (e.g. from yaml) would raise -- confirm intended.
            filename += '-' + self.subprefix
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        """Return the next zero-padded serial, delegating to the external source if set."""
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        TODO: always close/reopen if subprefix is not None; minimizes open filehandles
        '''
        # rotate once the file has grown past max_size; next write reopens
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, expires, url):
        """Write a dns: resource record for url.hostname; ttl derived from expires."""
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns repsonse?
        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6
        ttl = int(expires - time.time())
        host = url.hostname
        if self.writer is None:
            self.open()
        payload = timestamp_now() + '\r\n'
        for r in dns:
            try:
                payload += host + '.\t' + str(ttl) + '\tIN\t' + kind + '\t' + r['host'] + '\r\n'
            except Exception as e:
                # NOTE(review): positional args with no %s placeholders -- the logging
                # module will report a formatting error instead of this message.
                LOGGER.info('problem converting dns reply for warcing', host, r, e)
                pass
        payload = payload.encode('utf-8')
        record = self.writer.create_warc_record('dns:' + host, 'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))
        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s', p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self, url, req_headers, resp_headers, is_truncated, payload, digest=None):
        """Write a GET request record and its response record for one fetch."""
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', headers_to_str_headers(req_headers))

        request = self.writer.create_warc_record('http://example.com/', 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders('200 OK', headers_to_str_headers(resp_headers), protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(url, 'response',
                                                  payload=BytesIO(payload),
                                                  length=len(payload),
                                                  warc_headers_dict=warc_headers_dict,
                                                  http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s', p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
class WarcDownloader:
    """
    Download URL with HTTP GET, save to a WARC file and return the decoded text
    """

    def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
                 err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
                 limit_period=1, proxy_url=None, allow_cookies=False):
        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        if not overwrite_warc:  # Find out next nonexisting warc filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz
                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1

        logger_.log('INFO', 'Creating archivefile: {0}'.format(filename))

        self._output_file = open(filename, 'wb')
        self._logger_ = logger_
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}

        self._session = Session()  # Setup session for speeding up downloads
        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))

        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which cause aborting to prevent deinal

        self._writer = WARCWriter(self._output_file, gzip=True)
        if warcinfo_record_data is None:
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            info_headers = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                            'format': 'WARC File Format 1.0',
                            'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
            info_record = self._writer.create_warcinfo_record(filename, info_headers)
        else:  # Must recreate custom headers else they will not be copied
            # warcinfo_record_data is presumably a (warc_headers, payload_dict) pair -- TODO confirm with caller
            custom_headers = ''.join('{0}: {1}\r\n'.format(k, v) for k, v in warcinfo_record_data[1].items()).\
                encode('UTF-8')
            info_record = self._writer.create_warc_record('', 'warcinfo',
                                                          warc_headers=warcinfo_record_data[0],
                                                          payload=BytesIO(custom_headers),
                                                          length=len(custom_headers))
        self._writer.write_record(info_record)

    def __del__(self):
        if hasattr(self, '_output_file'):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
        Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        """Log a download failure and abort the whole run once the threshold is hit."""
        self._logger_.log('WARNING', '\t'.join((url, msg)))

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many error happened! Threshold exceeded! See log for details!')

    def download_url(self, url):
        """Download one URL, write request+response WARC records, return decoded text.

        Returns None on any failure (known-bad URL, HTTP error, network error,
        empty body).
        """
        scheme, netloc, path, params, query, fragment = urlparse(url)
        path = quote(path)  # For safety urlencode the generated URL...
        url = urlunparse((scheme, netloc, path, params, query, fragment))

        if url in self.bad_urls:
            self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
            return None

        try:  # The actual request
            resp = self._requests_get(url, headers=self._req_headers, stream=True)
        except RequestException as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(resp.status_code,
                                                                                                      resp.reason))
            return None

        # REQUEST
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                                  proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not derermine peername...

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            text = data.decode(enc, 'ignore')

        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})
        # Everything is OK, write the two WARC records
        self._writer.write_record(reqv_record)
        self._writer.write_record(resp_record)

        return text

    def write_record(self, record):
        """Write an externally-created record to the archive."""
        self._writer.write_record(record)