Beispiel #1
0
    def create_index(self):
        """Build the URL index for the WARC stream held in ``self._stream``.

        The first record is read as the warcinfo record; its headers and its
        payload (parsed as ``key: value`` lines separated by CRLF) are stored
        in ``self.info_record_data``. The remaining records are expected to be
        alternating request/response records sharing one WARC-Target-URI. For
        every such pair ``self.url_index[url]`` is set to
        ``((request_offset, request_length), (response_offset, response_length))``
        and ``self._count`` is incremented. Finally the stream is rewound to
        position 0.

        Raises:
            AssertionError: The first record is not a warcinfo record, a
                request/response record is at an unexpected position, or a
                response URL differs from the preceding request URL.
            KeyError: ``self._count`` differs from the number of indexed URLs
                (reported as a double URL).
            IndexError: ``self._count`` is 0 after reading the whole archive.
        """
        self._logger_.log('INFO', 'Creating index...')
        archive_it = ArchiveIterator(self._stream)
        info_rec = next(archive_it)
        # First record should be an info record, then it should be followed by the request-response pairs.
        # The structure checks raise AssertionError explicitly (instead of using assert statements),
        # so they stay active when Python runs with -O.
        if info_rec.rec_type != 'warcinfo':
            raise AssertionError('First WARC record must be warcinfo, got {0!r}'.format(info_rec.rec_type))
        custom_headers_raw = info_rec.content_stream().read()  # Parse custom headers
        info_rec_payload = dict(r.split(': ', maxsplit=1) for r in custom_headers_raw.decode('UTF-8')
                                .strip().split('\r\n') if len(r) > 0)
        self.info_record_data = (info_rec.rec_headers, info_rec_payload)  # Info headers in parsed form

        reqv_data = (None, (None, None))  # To be able to handle the request-response pairs together
        for i, record in enumerate(archive_it):
            if record.rec_type == 'request':
                if i % 2 != 0:
                    raise AssertionError('Request record at unexpected position {0}'.format(i))
                reqv_data = (record.rec_headers.get_header('WARC-Target-URI'),
                             (archive_it.get_record_offset(), archive_it.get_record_length()))
            elif record.rec_type == 'response':
                if i % 2 != 1:
                    raise AssertionError('Response record at unexpected position {0}'.format(i))
                resp_url = record.rec_headers.get_header('WARC-Target-URI')
                if resp_url != reqv_data[0]:
                    raise AssertionError('Response URL {0!r} does not match the preceding request URL {1!r}'
                                         .format(resp_url, reqv_data[0]))
                self.url_index[resp_url] = (reqv_data[1],  # Request-response pair
                                            (archive_it.get_record_offset(), archive_it.get_record_length()))
                self._count += 1
        # A repeated URL overwrites its index entry, so the counter runs ahead of the index size.
        if self._count != len(self.url_index):
            raise KeyError('Double URL detected in WARC file!')
        if self._count == 0:
            raise IndexError('No index created or no response records in the WARC file!')
        self._stream.seek(0)
        self._logger_.log('INFO', 'Index succesuflly created.')
Beispiel #2
0
    def process_one(self, filename):
        """Check the digests of every record in one WARC file and print the findings.

        Records whose digest check failed always get reported and set
        ``self.exit_value`` to 1. When ``self.verbose`` is truthy, records that
        passed or were not checked are reported as well. The filename is
        printed once, before the first reported record.

        Args:
            filename: Path of the WARC file to open in binary mode.
        """
        header_shown = False
        with open(filename, 'rb') as fh:
            records = ArchiveIterator(fh, check_digests=True)
            for rec in records:
                headers = rec.rec_headers
                has_digest = (headers.get_header('WARC-Payload-Digest') or
                              headers.get_header('WARC-Block-Digest'))

                # Consume the whole payload before looking at the digest checker's verdict.
                _read_entire_stream(rec.content_stream())

                rec_id = headers.get_header('WARC-Record-ID')
                rec_type = headers.get_header('WARC-Type')
                rec_offset = records.get_record_offset()

                verdict = rec.digest_checker.passed
                note = None
                problems = []
                if verdict is False:
                    self.exit_value = 1
                    problems = list(rec.digest_checker.problems)
                elif verdict is True:
                    if self.verbose:
                        note = 'digest pass'
                elif verdict is None:
                    if self.verbose:
                        if not has_digest:
                            note = 'no digest to check'
                        elif rec_type == 'revisit':
                            note = 'digest present but not checked (revisit)'
                        else:  # pragma: no cover
                            # should not happen
                            note = 'digest present but not checked'

                # Nothing to report for this record.
                if not (note or problems):
                    continue

                if not header_shown:
                    print(filename)
                    header_shown = True
                print(' ', 'offset', rec_offset, 'WARC-Record-ID', rec_id, rec_type)
                # The status note (if any) comes first, followed by the individual problems.
                detail_lines = ([note] if note else []) + problems
                for line in detail_lines:
                    print('   ', line)
Beispiel #3
0
    def validate(self, filepath):
        """Validate the record digests of a WARC file with warcio.

        A ``Validation`` row is created before the check starts and is
        finalized in the ``finally`` clause with the outcome (``passed``,
        ``message``, ``time_done``), whether the check succeeded or raised.
        The row is only marked as passed when the whole file was read without
        any exception.

        Args:
            filepath: Path of the WARC file to check.

        Raises:
            ValidationError: A record's digest check failed, or warcio raised
                ``ArchiveLoadFailed`` while reading the file.
        """
        logger.info(f'Validating {filepath} with Warcio')
        # Start pessimistic: an exception that is not handled below (e.g. the file cannot
        # be opened) must not leave a "successfully validated" row behind.
        passed = False
        message = f'Warcio validation of {filepath} did not complete'
        val_obj = Validation.objects.create(filename=filepath,
                                            time_started=timezone.now(),
                                            validator=self.__class__.__name__,
                                            required=self.required,
                                            task=self.task,
                                            information_package=self.ip,
                                            responsible=self.responsible,
                                            specification={
                                                'context': self.context,
                                                'options': self.options,
                                            })

        try:
            with open(filepath, 'rb') as stream:
                it = ArchiveIterator(stream, check_digests=True)
                for record in it:
                    digest_present = (
                        record.rec_headers.get_header('WARC-Payload-Digest')
                        or record.rec_headers.get_header('WARC-Block-Digest'))

                    # Consume the whole payload before looking at the digest checker's verdict.
                    _read_entire_stream(record.content_stream())

                    d_msg = None

                    rec_id = record.rec_headers.get_header('WARC-Record-ID')
                    rec_type = record.rec_headers.get_header('WARC-Type')
                    rec_offset = it.get_record_offset()

                    if record.digest_checker.passed is False:
                        # NOTE(review): the checker's problems object is stored and raised as-is,
                        # as before; confirm the Validation.message field accepts it.
                        message = record.digest_checker.problems
                        raise ValidationError(message)

                    elif record.digest_checker.passed is True:
                        d_msg = 'digest pass'
                    elif record.digest_checker.passed is None:
                        if digest_present and rec_type == 'revisit':
                            d_msg = 'digest present but not checked (revisit)'
                        elif digest_present:  # pragma: no cover
                            # should not happen
                            d_msg = 'digest present but not checked'
                        else:
                            d_msg = 'no digest to check'

                    if d_msg:
                        logger.debug(
                            f'offset {rec_offset} WARC-Record-ID {rec_id} {rec_type} ({d_msg})'
                        )

        except ArchiveLoadFailed as e:
            logger.warning(f'Warcio validation of {filepath} failed')
            message = f'<pre>{traceback.format_exc()}</pre>'
            raise ValidationError(
                f'saw exception ArchiveLoadFailed: {str(e).rstrip()}') from e

        except ValidationError:
            # Digest failure raised above; ``message`` already holds the problems.
            raise

        except Exception:
            # Anything unexpected: keep the traceback in the Validation row and propagate.
            message = f'<pre>{traceback.format_exc()}</pre>'
            raise

        else:
            passed = True
            message = f'Successfully validated warc {filepath}'

        finally:
            val_obj.message = message
            logger.info(message)
            val_obj.time_done = timezone.now()
            val_obj.passed = passed
            val_obj.save(update_fields=['time_done', 'passed', 'message'])
    def _create_index(self):
        """Build the internal URL index for the WARC stream in ``self._stream``.

        The first record is read as the warcinfo record and its payload is
        parsed (``key: value`` lines separated by CRLF) into
        ``self.info_record_data``; when the payload is empty or cannot be
        parsed, this is an error in strict mode and otherwise the attribute is
        set to ``None``. The remaining records are expected to be alternating
        request/response records sharing one WARC-Target-URI. For every pair
        ``self._internal_url_index[url]`` is set to
        ``((request_offset, request_length), (response_offset, response_length))``.
        Finally the stream is rewound to position 0.

        Raises:
            AssertionError: The first record is not a warcinfo record, a
                request/response record is at an unexpected position, or a
                response URL differs from the preceding request URL.
            ValueError: The warcinfo payload is empty or unparsable and
                ``self._strict_mode`` is set.
            KeyError: A URL was indexed by more than one response record.
            IndexError: No response record was indexed.
            ArchiveLoadFailed: A record could not be loaded and
                ``self._strict_mode`` is set.
        """
        self._logger.log('INFO',
                         'Creating index for {0}...'.format(self.filename))
        archive_it = ArchiveIterator(self._stream,
                                     check_digests=self._check_digest)
        info_rec = next(archive_it)
        # First record should be an info record, then it should be followed by the request-response pairs.
        # The structure checks raise AssertionError explicitly (instead of using assert statements),
        # so they stay active when Python runs with -O.
        if info_rec.rec_type != 'warcinfo':
            raise AssertionError(
                'First WARC record must be warcinfo, got {0!r}'.format(
                    info_rec.rec_type))
        try:
            # Read out custom headers for later use
            custom_headers_raw = info_rec.content_stream().read()
            if not custom_headers_raw:
                raise ValueError('WARCINFO record payload length is 0!')
            # Read and parse the warcinfo record for writing it back unchanged into a warc file
            # else due to warcio problems it will not be copied properly!
            # See: https://github.com/webrecorder/warcio/issues/90
            # and https://github.com/webrecorder/warcio/issues/91
            self.info_record_data = dict(
                r.split(': ', maxsplit=1) for r in custom_headers_raw.decode(
                    'UTF-8').strip().split('\r\n') if len(r) > 0)
        except ValueError:
            if self._strict_mode:
                raise  # Bare raise keeps the original traceback
            self._logger.log('WARNING', 'WARCINFO record in',
                             self._stream.name,
                             'is corrupt! Continuing with a fresh one!')
            self.info_record_data = None

        archive_load_failed = False
        count = 0  # Number of successfully indexed response records
        double_urls = Counter()
        reqv_data = (
            None, (None, None)
        )  # To be able to handle the request-response pairs together
        for i, record in enumerate(archive_it):
            if record.rec_type == 'request':
                if i % 2 != 0:
                    raise AssertionError(
                        'Request record at unexpected position {0}'.format(i))
                # Fetch the URL before the try block, so a failure is logged with the URL of
                # this request and not with the one left over from the previous pair.
                req_url = record.rec_headers.get_header('WARC-Target-URI')
                try:
                    reqv_data = (req_url,
                                 (archive_it.get_record_offset(),
                                  archive_it.get_record_length()))
                except ArchiveLoadFailed as e:
                    # NOTE(review): reqv_data keeps the previous pair's data here, so the URL
                    # check on the following response record will fail -- confirm whether
                    # non-strict mode should skip that pair instead.
                    self._logger.log('ERROR', 'REQUEST:', e.msg, 'for',
                                     req_url)
                    archive_load_failed = True
            elif record.rec_type == 'response':
                if i % 2 != 1:
                    raise AssertionError(
                        'Response record at unexpected position {0}'.format(i))
                resp_url = record.rec_headers.get_header('WARC-Target-URI')
                if resp_url != reqv_data[0]:
                    raise AssertionError(
                        'Response URL {0!r} does not match the preceding '
                        'request URL {1!r}'.format(resp_url, reqv_data[0]))
                try:
                    self._internal_url_index[resp_url] = (
                        reqv_data[1],  # Request-response pair
                        (archive_it.get_record_offset(),
                         archive_it.get_record_length()))
                except ArchiveLoadFailed as e:
                    self._logger.log('ERROR', 'RESPONSE:', e.msg, 'for',
                                     resp_url)
                    archive_load_failed = True
                else:
                    # Count only records that really made it into the index, else a failed
                    # record would be misreported below as a double URL.
                    double_urls[resp_url] += 1
                    count += 1
        # A repeated URL overwrites its index entry, so the counter runs ahead of the index size.
        if count != len(self._internal_url_index):
            double_urls_str = '\n'.join(
                '{0}\t{1}'.format(url, freq)
                for url, freq in double_urls.most_common() if freq > 1)
            raise KeyError(
                'The following double URLs detected in the WARC file:{0}'.
                format(double_urls_str))
        if count == 0:
            raise IndexError(
                'No index created or no response records in the WARC file!')
        if archive_load_failed and self._strict_mode:
            raise ArchiveLoadFailed(
                'Archive loading failed! See logs for details!')
        self._stream.seek(0)
        self._logger.log('INFO', 'Index succesuflly created.')
Beispiel #5
0
# Load the packaged default settings first, then the file named by the
# SWAYBACK_SETTINGS environment variable.
app.config.from_object('swayback.DefaultSettings')
app.config.from_envvar('SWAYBACK_SETTINGS')

# Index every *.warc.gz file in the current working directory at import time.
# htmlindex: (original URL, rewritten URL, WARC-Date) for each text/html response.
htmlindex = []
# urlmap: parsed target URL -> (WARC filename, record offset, record length);
# the first response seen for a URL wins.
urlmap = {}
for filename in os.listdir('.'):
    if not filename.endswith('.warc.gz'):
        continue
    print('using', filename)
    with open(filename, 'rb') as stream:
        ai = ArchiveIterator(stream)
        for record in ai:
            if record.rec_type != 'response':
                continue
            u = urlparse(record.rec_headers.get_header('WARC-Target-URI'))
            if u not in urlmap:
                urlmap[u] = (filename, ai.get_record_offset(),
                             ai.get_record_length())
            httpHeaders = record.http_headers
            # Response records that were not parsed as HTTP have no http_headers, and a
            # URL without a host cannot be rewritten onto BASE_HOST: keep such records in
            # urlmap, but leave them out of the HTML index instead of crashing.
            if httpHeaders is None or u.hostname is None:
                continue
            if httpHeaders.get_header('content-type',
                                      '').startswith('text/html'):
                # Same path/params/query/fragment, served from <original host>.<BASE_HOST>
                rewrittenUrl = urlunparse(
                    ('http', u.hostname + '.' + app.config['BASE_HOST'],
                     u[2], u[3], u[4], u[5]))
                htmlindex.append(
                    (urlunparse(u), rewrittenUrl,
                     record.rec_headers.get_header('warc-date')))


@app.route('/', host=app.config['BASE_HOST'])
def index():
    """ A simple index of all HTML pages inside the WARCs """