def create_index(self):
    """Build the in-memory URL index for the opened WARC stream.

    Reads the leading warcinfo record (its headers and parsed custom-header
    payload are stored in ``self.info_record_data``), then walks the
    alternating request/response record pairs, recording for each response
    URL the ``(offset, length)`` of both the request and the response record
    in ``self.url_index``. Finally rewinds the stream.

    Raises:
        KeyError: if the same URL occurs in more than one response record.
        IndexError: if the WARC file contains no response records.
    """
    # NOTE(review): this method logs via `self._logger_` while _create_index
    # uses `self._logger` — confirm which attribute the class actually defines.
    self._logger_.log('INFO', 'Creating index...')
    archive_it = ArchiveIterator(self._stream)
    info_rec = next(archive_it)
    # First record should be an info record, then it should be followed by
    # the request-response pairs
    assert info_rec.rec_type == 'warcinfo'
    # Parse custom headers from the warcinfo payload ("Key: Value" CRLF lines)
    custom_headers_raw = info_rec.content_stream().read()
    info_rec_payload = dict(r.split(': ', maxsplit=1)
                            for r in custom_headers_raw.decode('UTF-8').strip().split('\r\n')
                            if len(r) > 0)
    self.info_record_data = (info_rec.rec_headers, info_rec_payload)  # Info headers in parsed form

    reqv_data = (None, (None, None))  # To be able to handle the request-response pairs together
    for i, record in enumerate(archive_it):
        if record.rec_type == 'request':
            assert i % 2 == 0  # Requests are expected at even positions...
            reqv_data = (record.rec_headers.get_header('WARC-Target-URI'),
                         (archive_it.get_record_offset(), archive_it.get_record_length()))
        if record.rec_type == 'response':
            assert i % 2 == 1  # ...responses at odd positions, for the same URL
            resp_url = record.rec_headers.get_header('WARC-Target-URI')
            assert resp_url == reqv_data[0]
            self.url_index[resp_url] = (reqv_data[1],  # Request-response pair
                                        (archive_it.get_record_offset(),
                                         archive_it.get_record_length()))
            self._count += 1

    if self._count != len(self.url_index):
        raise KeyError('Double URL detected in WARC file!')
    if self._count == 0:
        raise IndexError('No index created or no response records in the WARC file!')
    self._stream.seek(0)  # Rewind so records can be re-read later by stored offsets
    self._logger_.log('INFO', 'Index successfully created.')
def _create_index(self):
    """Build the internal URL index for the opened WARC stream.

    Reads the leading warcinfo record (parsed custom headers are kept in
    ``self.info_record_data`` so the record can be written back unchanged),
    then walks the alternating request/response pairs, recording for each
    response URL the ``(offset, length)`` of both records in
    ``self._internal_url_index``. Duplicate URLs are collected with their
    frequencies for the error message. Finally rewinds the stream.

    Raises:
        ValueError: in strict mode, if the warcinfo payload is empty/corrupt.
        KeyError: if any URL occurs in more than one response record.
        IndexError: if the WARC file contains no response records.
        ArchiveLoadFailed: in strict mode, if any record failed to load.
    """
    self._logger.log('INFO', 'Creating index for {0}...'.format(self.filename))
    archive_it = ArchiveIterator(self._stream, check_digests=self._check_digest)
    info_rec = next(archive_it)
    # First record should be an info record, then it should be followed by
    # the request-response pairs
    assert info_rec.rec_type == 'warcinfo'
    try:
        # Read out custom headers for later use
        custom_headers_raw = info_rec.content_stream().read()  # Parse custom headers
        if len(custom_headers_raw) == 0:
            raise ValueError('WARCINFO record payload length is 0!')
        # Read and parse the warcinfo record for writing it back unchanged into a warc file
        # else due to warcio problems it will not be copied properly!
        # See: https://github.com/webrecorder/warcio/issues/90
        # and https://github.com/webrecorder/warcio/issues/91
        self.info_record_data = dict(r.split(': ', maxsplit=1)
                                     for r in custom_headers_raw.decode('UTF-8').strip().split('\r\n')
                                     if len(r) > 0)
    except ValueError as e:
        if self._strict_mode:
            raise e
        self._logger.log('WARNING', 'WARCINFO record in', self._stream.name,
                         'is corrupt! Continuing with a fresh one!')
        self.info_record_data = None

    archive_load_failed = False
    count = 0
    double_urls = Counter()
    reqv_data = (None, (None, None))  # To be able to handle the request-response pairs together
    for i, record in enumerate(archive_it):
        if record.rec_type == 'request':
            assert i % 2 == 0  # Requests are expected at even positions...
            try:
                reqv_data = (record.rec_headers.get_header('WARC-Target-URI'),
                             (archive_it.get_record_offset(), archive_it.get_record_length()))
            except ArchiveLoadFailed as e:
                self._logger.log('ERROR', 'REQUEST:', e.msg, 'for', reqv_data[0])
                archive_load_failed = True
        if record.rec_type == 'response':
            assert i % 2 == 1  # ...responses at odd positions, for the same URL
            resp_url = record.rec_headers.get_header('WARC-Target-URI')
            assert resp_url == reqv_data[0]
            double_urls[resp_url] += 1
            try:
                self._internal_url_index[resp_url] = (reqv_data[1],  # Request-response pair
                                                      (archive_it.get_record_offset(),
                                                       archive_it.get_record_length()))
            except ArchiveLoadFailed as e:
                self._logger.log('ERROR', 'RESPONSE:', e.msg, 'for', resp_url)
                archive_load_failed = True
            count += 1

    if count != len(self._internal_url_index):
        # Index is keyed by URL, so a shorter index than count means duplicates
        double_urls_str = '\n'.join('{0}\t{1}'.format(url, freq)
                                    for url, freq in double_urls.most_common()
                                    if freq > 1)
        raise KeyError('The following double URLs detected in the WARC file:{0}'.
                       format(double_urls_str))
    if count == 0:
        raise IndexError('No index created or no response records in the WARC file!')
    if archive_load_failed and self._strict_mode:
        raise ArchiveLoadFailed('Archive loading failed! See logs for details!')
    self._stream.seek(0)  # Rewind so records can be re-read later by stored offsets
    self._logger.log('INFO', 'Index successfully created.')
app.config.from_envvar('SWAYBACK_SETTINGS')

# (original URL, rewritten URL, capture date) for every indexed HTML response.
htmlindex = []
# Maps a parsed target URL to (filename, offset, length) of its first response record.
urlmap = {}
for filename in os.listdir('.'):
    if not filename.endswith('.warc.gz'):
        continue
    print('using', filename)
    with open(filename, 'rb') as stream:
        ai = ArchiveIterator(stream)
        for record in ai:
            if record.rec_type != 'response':
                continue
            u = urlparse(record.rec_headers.get_header('WARC-Target-URI'))
            if u not in urlmap:  # keep only the first capture of each URL
                urlmap[u] = (filename, ai.get_record_offset(), ai.get_record_length())
                http_headers = record.http_headers
                # u.hostname is None for host-less URIs (e.g. 'dns:' records);
                # skip those to avoid a TypeError when building the rewritten host
                if (u.hostname is not None
                        and http_headers.get_header('content-type', '').startswith('text/html')):
                    # Rewrite the host so each archived site is served as a
                    # subdomain of BASE_HOST; path/params/query/fragment kept as-is
                    rewritten_url = urlunparse(('http', u.hostname + '.' + app.config['BASE_HOST'],
                                                u[2], u[3], u[4], u[5]))
                    htmlindex.append((urlunparse(u), rewritten_url,
                                      record.rec_headers.get_header('warc-date')))


@app.route('/', host=app.config['BASE_HOST'])
def index():
    """ A simple index of all HTML pages inside the WARCs """
    return render_template('index.html', index=htmlindex)