def create_index(self):
    """Walk the WARC stream once and fill ``self.url_index``.

    The first record must be a ``warcinfo`` record. Its headers, together
    with its payload parsed as ``key: value`` lines, are stored in
    ``self.info_record_data``. The remaining records are expected to
    alternate request, response; for every response the
    ``(offset, length)`` of the request and of the response are stored
    under the response's ``WARC-Target-URI``. The stream is rewound to
    position 0 before returning.

    Raises:
        KeyError: the number of indexed responses differs from the number
            of distinct URLs in the index.
        IndexError: no response has been counted.
    """
    self._logger_.log('INFO', 'Creating index...')
    records = ArchiveIterator(self._stream)

    # The stream has to open with an info record; the request-response
    # pairs come after it
    warcinfo = next(records)
    assert warcinfo.rec_type == 'warcinfo'

    # Parse the custom headers carried in the warcinfo payload
    payload_text = warcinfo.content_stream().read().decode('UTF-8')
    payload_lines = payload_text.strip().split('\r\n')
    parsed_payload = dict(line.split(': ', maxsplit=1)
                          for line in payload_lines if line)
    # Info headers in parsed form
    self.info_record_data = (warcinfo.rec_headers, parsed_payload)

    # Remember the latest request so it can be stored with its response
    pending_url = None
    pending_position = (None, None)
    for position, rec in enumerate(records):
        kind = rec.rec_type
        if kind == 'request':
            assert position % 2 == 0
            pending_url = rec.rec_headers.get_header('WARC-Target-URI')
            pending_position = (records.get_record_offset(),
                                records.get_record_length())
            continue
        if kind != 'response':
            continue

        assert position % 2 == 1
        response_url = rec.rec_headers.get_header('WARC-Target-URI')
        assert response_url == pending_url
        response_position = (records.get_record_offset(),
                             records.get_record_length())
        # Request-response pair
        self.url_index[response_url] = (pending_position, response_position)
        self._count += 1

    if self._count != len(self.url_index):
        raise KeyError('Double URL detected in WARC file!')
    if self._count == 0:
        raise IndexError('No index created or no response records in the WARC file!')

    self._stream.seek(0)
    self._logger_.log('INFO', 'Index succesuflly created.')
def process_one(self, filename):
    """Check the digests of every record in ``filename`` and print findings.

    A failed digest sets ``self.exit_value`` to 1 and its problems are
    always printed. Passing or unchecked digests are only reported when
    ``self.verbose`` is set. The file name is printed once, ahead of the
    first record that has something to report.
    """
    header_shown = False
    with open(filename, 'rb') as stream:
        records = ArchiveIterator(stream, check_digests=True)
        for record in records:
            headers = record.rec_headers
            digest_present = (headers.get_header('WARC-Payload-Digest') or
                              headers.get_header('WARC-Block-Digest'))

            # Consume the whole payload before looking at the checker result
            _read_entire_stream(record.content_stream())

            rec_id = headers.get_header('WARC-Record-ID')
            rec_type = headers.get_header('WARC-Type')
            rec_offset = records.get_record_offset()

            checker = record.digest_checker
            note = None
            problems = []
            if checker.passed is False:
                self.exit_value = 1
                problems = list(checker.problems)
            elif checker.passed is True:
                if self.verbose:
                    note = 'digest pass'
            elif checker.passed is None and self.verbose:
                if digest_present and rec_type == 'revisit':
                    note = 'digest present but not checked (revisit)'
                elif digest_present:  # pragma: no cover
                    # should not happen
                    note = 'digest present but not checked'
                else:
                    note = 'no digest to check'

            # Nothing to report for this record
            if not (note or problems):
                continue

            if not header_shown:
                print(filename)
                header_shown = True
            print(' ', 'offset', rec_offset, 'WARC-Record-ID', rec_id, rec_type)
            if note:
                print(' ', note)
            for problem in problems:
                print(' ', problem)
def validate(self, filepath):
    """Validate the record digests of the WARC file at ``filepath``.

    A ``Validation`` row is created up front and is always updated in the
    ``finally`` clause with the outcome (``passed``, ``message``,
    ``time_done``), whether validation succeeds or raises.

    Args:
        filepath: path of the WARC file to check.

    Raises:
        ValidationError: a record's digest check failed, or warcio raised
            ``ArchiveLoadFailed`` while reading the file.
    """
    logger.info(f'Validating {filepath} with Warcio')

    # Start pessimistic: only a scan that runs to completion is recorded as
    # passed. Otherwise an unexpected error (e.g. the file cannot be opened)
    # would be saved as a successful validation by the ``finally`` clause.
    passed = False
    message = f'Validation of {filepath} did not complete'

    val_obj = Validation.objects.create(
        filename=filepath,
        time_started=timezone.now(),
        validator=self.__class__.__name__,
        required=self.required,
        task=self.task,
        information_package=self.ip,
        responsible=self.responsible,
        specification={
            'context': self.context,
            'options': self.options,
        },
    )

    try:
        with open(filepath, 'rb') as stream:
            it = ArchiveIterator(stream, check_digests=True)
            for record in it:
                digest_present = (
                    record.rec_headers.get_header('WARC-Payload-Digest') or
                    record.rec_headers.get_header('WARC-Block-Digest'))

                # Consume the whole payload before looking at the checker result
                _read_entire_stream(record.content_stream())

                d_msg = None
                rec_id = record.rec_headers.get_header('WARC-Record-ID')
                rec_type = record.rec_headers.get_header('WARC-Type')
                rec_offset = it.get_record_offset()

                if record.digest_checker.passed is False:
                    # NOTE(review): ``problems`` is stored and raised as-is;
                    # it appears to be a list rather than a string - confirm
                    # that the message field copes with that.
                    message = record.digest_checker.problems
                    raise ValidationError(message)
                elif record.digest_checker.passed is True:
                    d_msg = 'digest pass'
                elif record.digest_checker.passed is None:
                    if digest_present and rec_type == 'revisit':
                        d_msg = 'digest present but not checked (revisit)'
                    elif digest_present:  # pragma: no cover
                        # should not happen
                        d_msg = 'digest present but not checked'
                    else:
                        d_msg = 'no digest to check'

                if d_msg:
                    logger.debug(
                        f'offset {rec_offset} WARC-Record-ID {rec_id} {rec_type} ({d_msg})'
                    )
    except ArchiveLoadFailed as e:
        logger.warning(f'Warcio validation of {filepath} failed')
        message = f'<pre>{traceback.format_exc()}</pre>'
        raise ValidationError(
            f'saw exception ArchiveLoadFailed: {str(e).rstrip()}') from e
    except ValidationError:
        # Digest failure raised above; ``message`` already holds the problems
        raise
    except Exception:
        # Anything else is a failed validation too: record why, then propagate
        message = f'<pre>{traceback.format_exc()}</pre>'
        raise
    else:
        passed = True
        message = f'Successfully validated warc {filepath}'
    finally:
        val_obj.message = message
        logger.info(message)
        val_obj.time_done = timezone.now()
        val_obj.passed = passed
        val_obj.save(update_fields=['time_done', 'passed', 'message'])
def _create_index(self):
    """Walk the WARC stream once and fill ``self._internal_url_index``.

    The first record must be a ``warcinfo`` record. Its payload is parsed
    as ``key: value`` lines into ``self.info_record_data``; if the payload
    is empty or unparsable the attribute is set to ``None`` (or the error
    is re-raised in strict mode). The remaining records are expected to
    alternate request, response; for every response the
    ``(offset, length)`` of the request and of the response are stored
    under the response's ``WARC-Target-URI``. The stream is rewound to
    position 0 before returning.

    Raises:
        ValueError: the warcinfo payload is empty or unparsable and
            ``self._strict_mode`` is set.
        KeyError: the same URL was indexed more than once.
        IndexError: no response was indexed.
        ArchiveLoadFailed: a record could not be loaded and
            ``self._strict_mode`` is set.
    """
    self._logger.log('INFO', 'Creating index for {0}...'.format(self.filename))
    archive_it = ArchiveIterator(self._stream, check_digests=self._check_digest)

    # First record should be an info record, then it should be followed by
    # the request-response pairs
    info_rec = next(archive_it)
    assert info_rec.rec_type == 'warcinfo'

    try:
        # Read out custom headers for later use
        custom_headers_raw = info_rec.content_stream().read()
        if len(custom_headers_raw) == 0:
            raise ValueError('WARCINFO record payload length is 0!')
        # Read and parse the warcinfo record for writing it back unchanged
        # into a warc file, else due to warcio problems it will not be
        # copied properly!
        # See: https://github.com/webrecorder/warcio/issues/90
        # and https://github.com/webrecorder/warcio/issues/91
        self.info_record_data = dict(
            r.split(': ', maxsplit=1)
            for r in custom_headers_raw.decode('UTF-8').strip().split('\r\n')
            if len(r) > 0)
    except ValueError:
        if self._strict_mode:
            raise
        self._logger.log('WARNING', 'WARCINFO record in', self._stream.name,
                         'is corrupt! Continuing with a fresh one!')
        self.info_record_data = None

    archive_load_failed = False
    count = 0  # Number of responses actually stored in the index
    double_urls = Counter()
    # To be able to handle the request-response pairs together
    reqv_data = (None, (None, None))
    for i, record in enumerate(archive_it):
        if record.rec_type == 'request':
            assert i % 2 == 0
            # Fetch the URL outside the try so the error log names THIS
            # request rather than the previous pair's URL
            req_url = record.rec_headers.get_header('WARC-Target-URI')
            try:
                reqv_data = (req_url,
                             (archive_it.get_record_offset(),
                              archive_it.get_record_length()))
            except ArchiveLoadFailed as e:
                # NOTE(review): reqv_data still describes the previous pair
                # here, so the URL assert on the following response will
                # fail unless the URLs happen to match - confirm intent.
                self._logger.log('ERROR', 'REQUEST:', e.msg, 'for', req_url)
                archive_load_failed = True
        if record.rec_type == 'response':
            assert i % 2 == 1
            resp_url = record.rec_headers.get_header('WARC-Target-URI')
            assert resp_url == reqv_data[0]
            try:
                # Request-response pair
                self._internal_url_index[resp_url] = (
                    reqv_data[1],
                    (archive_it.get_record_offset(),
                     archive_it.get_record_length()))
            except ArchiveLoadFailed as e:
                self._logger.log('ERROR', 'RESPONSE:', e.msg, 'for', resp_url)
                archive_load_failed = True
            else:
                # Count only responses that were really indexed, so a load
                # failure is not mistaken for a duplicate URL below
                double_urls[resp_url] += 1
                count += 1

    if count != len(self._internal_url_index):
        double_urls_str = '\n'.join(
            '{0}\t{1}'.format(url, freq)
            for url, freq in double_urls.most_common() if freq > 1)
        raise KeyError(
            'The following double URLs detected in the WARC file:{0}'.format(
                double_urls_str))
    if count == 0:
        raise IndexError(
            'No index created or no response records in the WARC file!')
    if archive_load_failed and self._strict_mode:
        raise ArchiveLoadFailed(
            'Archive loading failed! See logs for details!')

    self._stream.seek(0)
    self._logger.log('INFO', 'Index succesuflly created.')
# Load settings from the 'swayback.DefaultSettings' object first, then from
# whatever the SWAYBACK_SETTINGS environment variable designates
# (NOTE(review): looks like the Flask config API - confirm).
app.config.from_object('swayback.DefaultSettings')
app.config.from_envvar('SWAYBACK_SETTINGS')

# (original URL, rewritten URL, WARC-Date) for every text/html response
htmlindex = []
# parsed target URL -> (WARC file name, record offset, record length);
# only the first record seen for a URL is kept
urlmap = {}
# Index every *.warc.gz file found in the current working directory
for filename in os.listdir('.'):
    if not filename.endswith('.warc.gz'):
        continue
    print('using', filename)
    with open(filename, 'rb') as stream:
        ai = ArchiveIterator(stream)
        for record in ai:
            if record.rec_type == 'response':
                u = urlparse(record.rec_headers.get_header('WARC-Target-URI'))
                if u not in urlmap:
                    urlmap[u] = (filename, ai.get_record_offset(), ai.get_record_length())
                httpHeaders = record.http_headers
                if httpHeaders.get_header('content-type', '').startswith('text/html'):
                    # Rewritten URL: always 'http', with the original host
                    # name prefixed to BASE_HOST; path, params, query and
                    # fragment are carried over unchanged
                    rewrittenUrl = urlunparse(
                        ('http', u.hostname + '.' + app.config['BASE_HOST'], u[2], u[3], u[4], u[5]))
                    htmlindex.append(
                        (urlunparse(u), rewrittenUrl, record.rec_headers.get_header('warc-date')))


@app.route('/', host=app.config['BASE_HOST'])
def index():
    """ A simple index of all HTML pages inside the WARCs """