Beispiel #1
0
    def _add_warc_snapshot(self, filename, url):
        '''Write a snapshot file into the WARC as a ``resource`` record.

        Args:
            filename: Path of the snapshot file. Its extension selects the
                record's content type and must be one of ``.pdf``,
                ``.html``, ``.png`` or ``.gif``.
            url: URL the snapshot belongs to. It is percent-encoded and
                embedded in the record's ``WARC-Target-URI``.

        Raises:
            KeyError: If the extension of ``filename`` is not in the table
                of known content types.
        '''
        _logger.debug('Adding snapshot record.')

        mime_types = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.png': 'image/png',
            '.gif': 'image/gif',
        }
        _, suffix = os.path.splitext(filename)
        mime_type = mime_types[suffix]

        snapshot_record = WARCRecord()
        snapshot_record.set_common_fields('resource', mime_type)

        encoded_url = wpull.url.percent_encode_query_value(url)
        snapshot_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(encoded_url)

        # Link the snapshot to the action record when one is available.
        action_record = self._action_warc_record
        if action_record:
            snapshot_record.fields['WARC-Concurrent-To'] = \
                action_record.fields[WARCRecord.WARC_RECORD_ID]

        # The file has to stay open until the record has been written.
        with open(filename, 'rb') as snapshot_file:
            snapshot_record.block_file = snapshot_file

            recorder = self._warc_recorder
            recorder.set_length_and_maybe_checksums(snapshot_record)
            recorder.write_record(snapshot_record)
Beispiel #2
0
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Uses pywb spec.

        Every file matching ``<path prefix>*.info.json`` is written as its
        own ``metadata`` record. All records share one ``WARC-Target-URI``
        of the form ``metadata://<authority><resource>``, built from the
        URL info of the current item session. If no file matches, a warning
        is logged and nothing is written.
        '''
        url_info = self._item_session.url_record.url_info
        uri = 'metadata://{}{}'.format(url_info.authority, url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        # glob.glob() already returns a list; no copy is needed.
        filenames = glob.glob(glob_pattern)

        if not filenames:
            _logger.warning(
                __(_(
                    'Could not find external process metadata file: {filename}'
                ),
                   filename=glob_pattern))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields(
                'metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri

            # The context manager closes the file even when computing the
            # length/checksums or writing the record raises.
            with open(filename, 'rb') as in_file:
                record.block_file = in_file

                self._warc_recorder.set_length_and_maybe_checksums(record)
                self._warc_recorder.write_record(record)
Beispiel #3
0
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Uses pywb spec.

        Each file matching ``<path prefix>*.info.json`` becomes one
        ``metadata`` record whose ``WARC-Target-URI`` is
        ``metadata://<authority><resource>``, taken from the URL info of the
        current item session. When no file matches, a warning is logged and
        the method returns without writing anything.
        '''
        url_info = self._item_session.url_record.url_info
        uri = 'metadata://{}{}'.format(url_info.authority, url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        # glob.glob() returns a list already, so no list() wrapper is needed.
        filenames = glob.glob(glob_pattern)

        if not filenames:
            _logger.warning(__(
                _('Could not find external process metadata file: {filename}'),
                filename=glob_pattern
            ))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields('metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri

            # Use a context manager so the file handle is released even if
            # the recorder raises while measuring or writing the record.
            with open(filename, 'rb') as in_file:
                record.block_file = in_file

                self._warc_recorder.set_length_and_maybe_checksums(record)
                self._warc_recorder.write_record(record)
Beispiel #4
0
    def _add_warc_snapshot(self, filename, url):
        '''Add the snapshot to the WARC file.

        The snapshot is stored as a ``resource`` record. Its content type is
        looked up from the extension of ``filename`` (``.pdf``, ``.html``,
        ``.png`` or ``.gif``), and its ``WARC-Target-URI`` carries the
        percent-encoded ``url``.

        Args:
            filename: Path of the snapshot file to embed.
            url: URL the snapshot was made for.

        Raises:
            KeyError: If the file extension has no known content type.
        '''
        _logger.debug('Adding snapshot record.')

        content_types = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.png': 'image/png',
            '.gif': 'image/gif',
        }
        content_type = content_types[os.path.splitext(filename)[1]]

        record = WARCRecord()
        record.set_common_fields('resource', content_type)

        target_uri = 'urn:X-wpull:snapshot?url={0}'.format(
            wpull.url.percent_encode_query_value(url))
        record.fields['WARC-Target-URI'] = target_uri

        # Reference the action record, if any, from the snapshot record.
        if self._action_warc_record:
            concurrent_id = \
                self._action_warc_record.fields[WARCRecord.WARC_RECORD_ID]
            record.fields['WARC-Concurrent-To'] = concurrent_id

        # Keep the file open for as long as the recorder reads from it.
        with open(filename, 'rb') as snapshot_file:
            record.block_file = snapshot_file
            self._warc_recorder.set_length_and_maybe_checksums(record)
            self._warc_recorder.write_record(record)
Beispiel #5
0
    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When a temporary log file is active, the log handler is flushed,
        detached from the root logger and its stream closed. The temp log is
        then written as a ``text/plain`` ``resource`` record with the target
        URI ``urn:X-wpull:log``, the temp file is removed and the handler is
        closed. Failure to remove the temp file is logged, not raised.

        When ``move_to`` is configured, the WARC file (only if a log was
        written) and the CDX file (if one is in use) are moved to the
        destination directory.
        '''
        if self._log_temp_file:
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            log_record = WARCRecord()

            # The context manager guarantees the gzip reader is closed even
            # if moving/rotating the WARC or writing the record raises.
            with gzip.GzipFile(
                    filename=self._log_temp_file.name) as log_file:
                log_record.block_file = log_file
                log_record.set_common_fields('resource', 'text/plain')

                log_record.fields['WARC-Target-URI'] = \
                    'urn:X-wpull:log'

                if self._params.max_size is not None:
                    # NOTE(review): presumably the log goes into a separate
                    # "meta" WARC when size-based splitting is on; confirm
                    # against _start_new_warc_file.
                    if self._params.move_to is not None:
                        self._move_file_to_dest_dir(self._warc_filename)

                    self._start_new_warc_file(meta=True)

                self.set_length_and_maybe_checksums(log_record)
                self.write_record(log_record)

            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)
Beispiel #6
0
    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        If a temporary log file exists, the log handler is flushed, removed
        from the root logger and its stream closed. The log contents are
        written as a ``text/plain`` ``resource`` record addressed as
        ``urn:X-wpull:log``; afterwards the temp file is deleted (a failure
        is logged, not raised) and the handler is closed.

        With ``move_to`` set, the WARC file (only when a log was written)
        and the CDX file (when one is in use) are moved to the destination
        directory.
        '''
        if self._log_temp_file:
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            log_record = WARCRecord()

            # Open the temp log via a context manager so the gzip reader is
            # released even when a later step in this block raises.
            with gzip.GzipFile(
                    filename=self._log_temp_file.name) as log_file:
                log_record.block_file = log_file
                log_record.set_common_fields('resource', 'text/plain')

                log_record.fields['WARC-Target-URI'] = \
                    'urn:X-wpull:log'

                if self._params.max_size is not None:
                    # NOTE(review): looks like the log record is placed in a
                    # fresh "meta" WARC when max_size is set; verify in
                    # _start_new_warc_file.
                    if self._params.move_to is not None:
                        self._move_file_to_dest_dir(self._warc_filename)

                    self._start_new_warc_file(meta=True)

                self.set_length_and_maybe_checksums(log_record)
                self.write_record(log_record)

            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)