Example #1
0
    def create_index(self):
        """Index every request-response pair of the WARC stream by target URL.

        The first record must be a ``warcinfo`` record; its headers and its
        payload (parsed into a dict of ``key: value`` lines) are stored in
        ``self.info_record_data``. The remaining records are expected to
        alternate request, response, request, response, ... and each pair is
        stored in ``self.url_index`` as
        ``url -> ((request offset, request length),
        (response offset, response length))``.

        The stream is rewound to position 0 once the index is built.

        Raises:
            AssertionError: if the first record is not ``warcinfo``, if the
                records do not alternate as expected, or if a response's
                target URI differs from that of the preceding request.
            KeyError: if the number of indexed responses differs from the
                number of distinct URLs in the index.
            IndexError: if no response record was indexed.
        """
        self._logger_.log('INFO', 'Creating index...')
        records = ArchiveIterator(self._stream)

        # The warcinfo record has to come first; request-response pairs follow it
        warcinfo = next(records)
        assert warcinfo.rec_type == 'warcinfo'

        # The warcinfo payload is a CRLF separated list of "key: value" lines
        raw_payload = warcinfo.content_stream().read()
        payload_lines = raw_payload.decode('UTF-8').strip().split('\r\n')
        payload_fields = dict(line.split(': ', maxsplit=1) for line in payload_lines if line)
        self.info_record_data = (warcinfo.rec_headers, payload_fields)

        # Remember the last request until its response shows up
        pending_url = None
        pending_position = (None, None)
        for seq_no, rec in enumerate(records):
            if rec.rec_type == 'request':
                assert seq_no % 2 == 0
                pending_url = rec.rec_headers.get_header('WARC-Target-URI')
                pending_position = (records.get_record_offset(), records.get_record_length())
            if rec.rec_type == 'response':
                assert seq_no % 2 == 1
                url = rec.rec_headers.get_header('WARC-Target-URI')
                assert url == pending_url
                response_position = (records.get_record_offset(), records.get_record_length())
                self.url_index[url] = (pending_position, response_position)
                self._count += 1

        # A URL seen twice overwrites its earlier entry, so the two numbers drift apart
        if len(self.url_index) != self._count:
            raise KeyError('Double URL detected in WARC file!')
        if self._count == 0:
            raise IndexError('No index created or no response records in the WARC file!')

        self._stream.seek(0)
        self._logger_.log('INFO', 'Index succesuflly created.')
    def _create_index(self):
        """Scan the WARC stream once and index its request-response pairs by URL.

        The first record must be a ``warcinfo`` record. Its payload is parsed
        into a dict of ``key: value`` lines and kept in
        ``self.info_record_data``; if the payload is empty or cannot be parsed
        the attribute is set to ``None`` (or the error is re-raised in strict
        mode). The remaining records are expected to alternate request,
        response, ... and every successfully read pair is stored in
        ``self._internal_url_index`` as
        ``url -> ((request offset, request length),
        (response offset, response length))``.

        The stream is rewound to position 0 once the index is built.

        Raises:
            AssertionError: if the first record is not ``warcinfo``, if the
                records do not alternate as expected, or if a response's
                target URI differs from that of the last stored request.
            ValueError: in strict mode, if the warcinfo payload is empty or
                malformed.
            KeyError: if the same URL was indexed more than once.
            IndexError: if no response record was indexed.
            ArchiveLoadFailed: in strict mode, if reading the position of any
                record failed.
        """
        self._logger.log('INFO',
                         'Creating index for {0}...'.format(self.filename))
        archive_it = ArchiveIterator(self._stream,
                                     check_digests=self._check_digest)
        info_rec = next(archive_it)
        # First record should be an info record, then it should be followed by the request-response pairs
        assert info_rec.rec_type == 'warcinfo'
        try:
            # Read out custom headers for later use
            custom_headers_raw = info_rec.content_stream().read()
            if len(custom_headers_raw) == 0:
                raise ValueError('WARCINFO record payload length is 0!')
            # Read and parse the warcinfo record for writing it back unchanged into a warc file
            # else due to warcio problems it will not be copied properly!
            # See: https://github.com/webrecorder/warcio/issues/90
            # and https://github.com/webrecorder/warcio/issues/91
            self.info_record_data = dict(
                r.split(': ', maxsplit=1) for r in custom_headers_raw.decode(
                    'UTF-8').strip().split('\r\n') if len(r) > 0)
        except ValueError:
            # Also covers UnicodeDecodeError (a ValueError subclass) and lines without ': '
            if self._strict_mode:
                raise  # Bare raise keeps the original traceback intact
            self._logger.log('WARNING', 'WARCINFO record in',
                             self._stream.name,
                             'is corrupt! Continuing with a fresh one!')
            self.info_record_data = None

        archive_load_failed = False
        count = 0  # Number of responses successfully put into the index
        double_urls = Counter()  # How many times each URL was indexed
        reqv_data = (
            None, (None, None)
        )  # To be able to handle the request-response pairs together
        for i, record in enumerate(archive_it):
            if record.rec_type == 'request':
                assert i % 2 == 0
                # Fetch the URL outside the try block so the error message can name the failing record
                req_url = record.rec_headers.get_header('WARC-Target-URI')
                try:
                    reqv_data = (req_url,
                                 (archive_it.get_record_offset(),
                                  archive_it.get_record_length()))
                except ArchiveLoadFailed as e:
                    # NOTE(review): reqv_data still holds the previous pair here, so the
                    # matching response will trip the URL assertion below -- confirm
                    # whether such a pair should rather be skipped in non-strict mode.
                    self._logger.log('ERROR', 'REQUEST:', e.msg, 'for',
                                     req_url)
                    archive_load_failed = True
            if record.rec_type == 'response':
                assert i % 2 == 1
                resp_url = record.rec_headers.get_header('WARC-Target-URI')
                assert resp_url == reqv_data[0]
                try:
                    resp_position = (archive_it.get_record_offset(),
                                     archive_it.get_record_length())
                except ArchiveLoadFailed as e:
                    self._logger.log('ERROR', 'RESPONSE:', e.msg, 'for',
                                     resp_url)
                    archive_load_failed = True
                else:
                    # Count only what was really indexed, otherwise a failed load would be
                    # reported below as a (non-existent) double URL
                    double_urls[resp_url] += 1
                    self._internal_url_index[resp_url] = (
                        reqv_data[1],  # Request-response pair
                        resp_position)
                    count += 1
        # A URL indexed twice overwrites its earlier entry, so the two numbers drift apart
        if count != len(self._internal_url_index):
            double_urls_str = '\n'.join(
                '{0}\t{1}'.format(url, freq)
                for url, freq in double_urls.most_common() if freq > 1)
            raise KeyError(
                'The following double URLs detected in the WARC file:{0}'.
                format(double_urls_str))
        if count == 0:
            raise IndexError(
                'No index created or no response records in the WARC file!')
        if archive_load_failed and self._strict_mode:
            raise ArchiveLoadFailed(
                'Archive loading failed! See logs for details!')
        self._stream.seek(0)
        self._logger.log('INFO', 'Index succesuflly created.')
Example #3
0
# Settings come from the file named by the SWAYBACK_SETTINGS environment variable
app.config.from_envvar('SWAYBACK_SETTINGS')

# (original URL, rewritten URL, WARC-Date header) of every text/html response
htmlindex = []
# Parsed target URL -> (WARC filename, record offset, record length); the first occurrence wins
urlmap = {}
for filename in os.listdir('.'):
    if filename.endswith('.warc.gz'):
        print('using', filename)
        with open(filename, 'rb') as stream:
            ai = ArchiveIterator(stream)
            for record in ai:
                # Only response records are indexed
                if record.rec_type != 'response':
                    continue
                warc_headers = record.rec_headers
                u = urlparse(warc_headers.get_header('WARC-Target-URI'))
                if u not in urlmap:
                    urlmap[u] = (filename, ai.get_record_offset(),
                                 ai.get_record_length())
                httpHeaders = record.http_headers
                content_type = httpHeaders.get_header('content-type', '')
                # Only HTML pages are listed on the index page
                if not content_type.startswith('text/html'):
                    continue
                # The archived host becomes a subdomain of BASE_HOST
                rewritten_host = u.hostname + '.' + app.config['BASE_HOST']
                rewrittenUrl = urlunparse(
                    ('http', rewritten_host, u[2], u[3], u[4], u[5]))
                htmlindex.append(
                    (urlunparse(u), rewrittenUrl,
                     warc_headers.get_header('warc-date')))


@app.route('/', host=app.config['BASE_HOST'])
def index():
    """Render ``index.html`` with the list of HTML pages collected in ``htmlindex``."""
    pages = htmlindex
    return render_template('index.html', index=pages)