Beispiel #1
0
    def _add_warc_snapshot(self, filename, url):
        '''Add the snapshot file to the WARC file as a ``resource`` record.

        Args:
            filename: Path of the snapshot file. Its extension selects the
                record's content type.
            url: URL the snapshot belongs to. It is percent-encoded into
                the record's ``WARC-Target-URI``.
        '''
        _logger.debug('Adding snapshot record.')

        content_types = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.png': 'image/png',
            '.gif': 'image/gif',
        }

        # Match the extension case-insensitively and fall back to a generic
        # binary type, so that an unexpected extension (e.g. ``.PDF``) does
        # not abort the whole snapshot with a KeyError.
        extension = os.path.splitext(filename)[1].lower()
        content_type = content_types.get(
            extension, 'application/octet-stream')

        record = WARCRecord()
        record.set_common_fields('resource', content_type)
        record.fields['WARC-Target-URI'] = 'urn:X-wpull:snapshot?url={0}' \
            .format(wpull.url.percent_encode_query_value(url))

        # Link the snapshot to the action log record when one was written.
        if self._action_warc_record:
            record.fields['WARC-Concurrent-To'] = \
                self._action_warc_record.fields[WARCRecord.WARC_RECORD_ID]

        # The file must stay open until the record has been written.
        with open(filename, 'rb') as in_file:
            record.block_file = in_file

            self._warc_recorder.set_length_and_maybe_checksums(record)
            self._warc_recorder.write_record(record)
Beispiel #2
0
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Uses pywb spec.

        Every file matching ``<path prefix>*.info.json`` is written as its
        own ``metadata`` record whose target URI is built from the current
        item's URL authority and resource. If no file matches, a warning is
        logged and nothing is written.
        '''
        url_info = self._item_session.url_record.url_info
        uri = 'metadata://{}{}'.format(url_info.authority, url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        # glob.glob() already returns a list.
        filenames = glob.glob(glob_pattern)

        if not filenames:
            _logger.warning(
                __(_(
                    'Could not find external process metadata file: {filename}'
                ),
                   filename=glob_pattern))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields(
                'metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri

            # The context manager closes the file even when checksumming or
            # writing the record raises.
            with open(filename, 'rb') as in_file:
                record.block_file = in_file

                self._warc_recorder.set_length_and_maybe_checksums(record)
                self._warc_recorder.write_record(record)
Beispiel #3
0
    def begin_request(self, request: HTTPRequest):
        '''Begin the WARC request record for an HTTP request.

        Remembers the request, creates the request record with the target
        URI and IP address taken from the request, and attaches a new temp
        file as the record's block.
        '''
        ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-f0-9:.]+)$'

        # The recorder expects an already-resolved address, not a hostname.
        assert re.match(ip_pattern, request.address[0]), \
            'IP address needed, got {}'.format(request.address[0])

        self._request = request

        warc_record = WARCRecord()
        self._request_record = warc_record

        warc_record.set_common_fields(
            WARCRecord.REQUEST, WARCRecord.TYPE_REQUEST)
        warc_record.fields['WARC-Target-URI'] = request.url_info.url
        warc_record.fields['WARC-IP-Address'] = request.address[0]
        warc_record.block_file = self._new_temp_file(hint='warcsesreq')
Beispiel #4
0
    def begin_transfer(self, response: FTPResponse):
        '''Begin the WARC resource record for an FTP data transfer.

        Logs the opened data connection as a control event, then creates the
        response record, links it to the control record, and attaches a new
        temp file as the record's block.
        '''
        data_host, data_port = response.data_address
        event_text = 'Opened data connection to {hostname}:{port}'.format(
            hostname=data_host, port=data_port)
        self._write_control_event(event_text)

        resp_record = WARCRecord()
        self._response_record = resp_record

        resp_record.set_common_fields('resource', 'application/octet-stream')
        resp_record.fields['WARC-Target-URI'] = self._request.url_info.url
        resp_record.fields['WARC-IP-Address'] = self._request.address[0]

        # Tie the transferred data to the control conversation record.
        control_id = self._control_record.fields[WARCRecord.WARC_RECORD_ID]
        resp_record.fields['WARC-Concurrent-To'] = control_id

        resp_record.block_file = self._new_temp_file('warcresp')
Beispiel #5
0
    def begin_response(self, response: HTTPResponse):
        '''Begin the WARC response record for an HTTP response.

        Creates the response record using the target URI and IP address of
        the stored request, links it to the request record, and uses the
        recorder's response temp file as the record's block.
        '''
        ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-f0-9:.]+)$'

        # The recorder expects an already-resolved address, not a hostname.
        assert re.match(ip_pattern, self._request.address[0]), \
            'IP address needed, got {}'.format(self._request.address[0])

        warc_record = WARCRecord()
        self._response_record = warc_record

        warc_record.set_common_fields(
            WARCRecord.RESPONSE, WARCRecord.TYPE_RESPONSE)
        warc_record.fields['WARC-Target-URI'] = self._request.url_info.url
        warc_record.fields['WARC-IP-Address'] = self._request.address[0]

        # Pair the response with the request record written for it.
        request_id = self._request_record.fields[WARCRecord.WARC_RECORD_ID]
        warc_record.fields['WARC-Concurrent-To'] = request_id

        warc_record.block_file = self._response_temp_file
Beispiel #6
0
    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When a log temp file is set, the log handler is removed from the
        root logger, the log is written as a ``resource`` record with the
        target URI ``urn:X-wpull:log``, and the temp file is deleted. The
        WARC and CDX files are moved to the destination directory when
        ``move_to`` is configured.
        '''
        if self._log_temp_file:
            # Push out pending log output before the handler is detached.
            self._log_handler.flush()

            # The handler was attached to the root logger; detach it and
            # close its stream before the log file is read back.
            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            log_record = WARCRecord()
            # Reopen the temp log by name through gzip; GzipFile defaults to
            # read mode when only a filename is given.
            # NOTE(review): presumably the handler wrote the log
            # gzip-compressed -- confirm against where the handler is set up.
            log_record.block_file = gzip.GzipFile(
                filename=self._log_temp_file.name)
            log_record.set_common_fields('resource', 'text/plain')

            log_record.fields['WARC-Target-URI'] = \
                'urn:X-wpull:log'

            # With a size limit configured, the current WARC is moved away
            # first (if move_to is set) and a new file is started with
            # meta=True before the log record is written.
            if self._params.max_size is not None:
                if self._params.move_to is not None:
                    self._move_file_to_dest_dir(self._warc_filename)

                self._start_new_warc_file(meta=True)

            self.set_length_and_maybe_checksums(log_record)
            self.write_record(log_record)

            log_record.block_file.close()

            # Best-effort delete of the temp log: a failure is logged, not
            # raised.
            # NOTE(review): the message says "close" but the failing
            # operation is the removal of the file.
            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            # NOTE(review): this move is nested under the log-file branch,
            # so it only runs when a log temp file existed -- confirm that
            # is intended.
            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)
Beispiel #7
0
    def begin_control(self,
                      request: FTPRequest,
                      connection_reused: bool = False):
        '''Begin the WARC metadata record for an FTP control conversation.

        Args:
            request: The FTP request; supplies the record's target URI and
                IP address.
            connection_reused: Selects whether the first control event says
                the control connection is being reused or opened.
        '''
        self._request = request

        control_record = WARCRecord()
        self._control_record = control_record

        control_record.set_common_fields(
            'metadata', 'text/x-ftp-control-conversation')
        control_record.fields['WARC-Target-URI'] = request.url_info.url
        control_record.fields['WARC-IP-Address'] = request.address[0]

        control_record.block_file = self._new_temp_file('warcctrl')

        hostname, port = self._request_hostname_port()

        template = (
            'Reusing control connection to {hostname}:{port}'
            if connection_reused else
            'Opening control connection to {hostname}:{port}'
        )
        event_text = template.format(hostname=hostname, port=port)
        self._write_control_event(event_text)
Beispiel #8
0
    def _start_new_warc_file(self, meta=False):
        '''Pick the next WARC filename, make it current, and write warcinfo.

        Args:
            meta: Passed to the filename generator, except when a size
                limit is set and the recorder is appending to a non-meta
                file; then the sequence number is advanced until an unused
                filename is found.
        '''
        skip_existing = (
            self._params.max_size and not meta and self._params.appending)

        if not skip_existing:
            self._warc_filename = self._generate_warc_filename(meta=meta)
        else:
            # Step over files that already exist on disk.
            self._warc_filename = self._generate_warc_filename()

            while os.path.exists(self._warc_filename):
                _logger.debug('Skip {0}', self._warc_filename)
                self._sequence_num += 1
                self._warc_filename = self._generate_warc_filename()

        _logger.debug('WARC file at {0}', self._warc_filename)

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        # Each new file starts with its own warcinfo record.
        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)
Beispiel #9
0
    def _add_warc_action_log(self, path, url):
        '''Write the action log file to the WARC as a JSON metadata record.

        Args:
            path: Path of the action log; each line is parsed as one JSON
                document.
            url: URL the actions belong to. It is percent-encoded into the
                record's ``WARC-Target-URI``.
        '''
        _logger.debug('Adding action log record.')

        with open(path, 'r', encoding='utf-8', errors='replace') as log_file:
            parsed_actions = [json.loads(line) for line in log_file]

        payload = json.dumps({'actions': parsed_actions}, indent=4)
        payload_bytes = payload.encode('utf-8')

        action_record = WARCRecord()
        # Stored on the instance so other records can refer to this one.
        self._action_warc_record = action_record

        action_record.set_common_fields('metadata', 'application/json')
        encoded_url = wpull.url.percent_encode_query_value(url)
        action_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(encoded_url)
        action_record.block_file = io.BytesIO(payload_bytes)

        self._warc_recorder.set_length_and_maybe_checksums(action_record)
        self._warc_recorder.write_record(action_record)