Beispiel #1
0
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Every file matching ``<path prefix>*.info.json`` is written as its
        own ``metadata`` record. All records share one ``metadata://`` target
        URI built from the authority and resource of the item session's URL.
        If no file matches the pattern, a warning is logged and nothing is
        written.

        Uses pywb spec.
        '''
        url_info = self._item_session.url_record.url_info
        uri = 'metadata://{}{}'.format(url_info.authority, url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        filenames = glob.glob(glob_pattern)

        if not filenames:
            _logger.warning(__(
                _('Could not find external process metadata file: {filename}'),
                filename=glob_pattern
            ))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields('metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri

            # The context manager closes the file even when computing the
            # length/checksums or writing the record raises.
            with open(filename, 'rb') as block_file:
                record.block_file = block_file

                self._warc_recorder.set_length_and_maybe_checksums(record)
                self._warc_recorder.write_record(record)
Beispiel #2
0
    def _add_warc_snapshot(self, filename, url):
        '''Write the snapshot file to the WARC as a ``resource`` record.

        The content type is chosen from the extension of ``filename``; an
        extension other than ``.pdf``, ``.html``, ``.png`` or ``.gif``
        raises :class:`KeyError`. When an action log record has been stored
        on this object, the snapshot record is marked as concurrent to it.
        '''
        _logger.debug('Adding snapshot record.')

        known_content_types = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.png': 'image/png',
            '.gif': 'image/gif'
        }
        _, extension = os.path.splitext(filename)
        content_type = known_content_types[extension]

        snapshot_record = WARCRecord()
        snapshot_record.set_common_fields('resource', content_type)

        encoded_url = wpull.url.percent_encode_query_value(url)
        snapshot_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(encoded_url)

        if self._action_warc_record:
            action_record_id = \
                self._action_warc_record.fields[WARCRecord.WARC_RECORD_ID]
            snapshot_record.fields['WARC-Concurrent-To'] = action_record_id

        with open(filename, 'rb') as snapshot_file:
            snapshot_record.block_file = snapshot_file

            recorder = self._warc_recorder
            recorder.set_length_and_maybe_checksums(snapshot_record)
            recorder.write_record(snapshot_record)
Beispiel #3
0
    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When a log temp file is active, the log handler is flushed, detached
        from the root logger and its stream closed. The gzipped log is then
        written as a ``resource`` record with target URI ``urn:X-wpull:log``.
        If ``max_size`` is set, the current WARC file is first moved (when
        ``move_to`` is set) and a ``-meta`` WARC file is started to hold the
        log record. Afterwards the temp file is removed and, when ``move_to``
        is set, the WARC file and any CDX file are moved.
        '''
        if self._log_temp_file:
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            log_record = WARCRecord()
            log_record.block_file = gzip.GzipFile(
                filename=self._log_temp_file.name
            )

            # Guarantee the gzip reader is closed even if rotating the WARC
            # file or writing the record raises.
            try:
                log_record.set_common_fields('resource', 'text/plain')

                log_record.fields['WARC-Target-URI'] = \
                    'urn:X-wpull:log'

                if self._params.max_size is not None:
                    if self._params.move_to is not None:
                        self._move_file_to_dest_dir(self._warc_filename)

                    self._start_new_warc_file(meta=True)

                self.set_length_and_maybe_checksums(log_record)
                self.write_record(log_record)
            finally:
                log_record.block_file.close()

            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                # Best effort: a leftover temp file must not abort closing.
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)
Beispiel #4
0
    def begin_request(self, request: HTTPRequest):
        '''Start the WARC request record for ``request``.

        Asserts that ``request.address[0]`` looks like an IP address, stores
        the request, and creates a request record carrying the target URI
        and IP address. The record's block file is a new temporary file.
        '''
        ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-f0-9:.]+)$'

        assert re.match(ip_pattern, request.address[0]), \
            'IP address needed, got {}'.format(request.address[0])

        self._request = request

        record = WARCRecord()
        self._request_record = record
        record.set_common_fields(WARCRecord.REQUEST, WARCRecord.TYPE_REQUEST)

        record.fields['WARC-Target-URI'] = request.url_info.url
        record.fields['WARC-IP-Address'] = request.address[0]

        record.block_file = self._new_temp_file(hint='warcsesreq')
Beispiel #5
0
    def _add_warc_snapshot(self, filename, url):
        '''Add the snapshot to the WARC file.

        The file at ``filename`` becomes the block of a ``resource`` record
        whose content type is derived from the file extension (``.pdf``,
        ``.html``, ``.png`` or ``.gif``; anything else raises
        :class:`KeyError`). If an action log record is stored on this
        object, its record ID is set as ``WARC-Concurrent-To``.
        '''
        _logger.debug('Adding snapshot record.')

        extension = os.path.splitext(filename)[1]
        mime_by_extension = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.png': 'image/png',
            '.gif': 'image/gif'
        }
        content_type = mime_by_extension[extension]

        record = WARCRecord()
        record.set_common_fields('resource', content_type)

        target_uri_template = 'urn:X-wpull:snapshot?url={0}'
        record.fields['WARC-Target-URI'] = target_uri_template.format(
            wpull.url.percent_encode_query_value(url))

        action_record = self._action_warc_record

        if action_record:
            record.fields['WARC-Concurrent-To'] = \
                action_record.fields[WARCRecord.WARC_RECORD_ID]

        with open(filename, 'rb') as source_file:
            record.block_file = source_file

            self._warc_recorder.set_length_and_maybe_checksums(record)
            self._warc_recorder.write_record(record)
Beispiel #6
0
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Each file matching ``<path prefix>*.info.json`` is written as a
        separate ``metadata`` record whose target URI is ``metadata://``
        followed by the authority and resource of the item session's URL.
        A warning is logged, and nothing is written, when no file matches.

        Uses pywb spec.
        '''
        url_info = self._item_session.url_record.url_info
        uri = 'metadata://{}{}'.format(url_info.authority, url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        filenames = glob.glob(glob_pattern)

        if not filenames:
            _logger.warning(
                __(_(
                    'Could not find external process metadata file: {filename}'
                ),
                   filename=glob_pattern))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields(
                'metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri

            # ``with`` closes the metadata file even if writing the record
            # fails part-way through.
            with open(filename, 'rb') as block_file:
                record.block_file = block_file

                self._warc_recorder.set_length_and_maybe_checksums(record)
                self._warc_recorder.write_record(record)
Beispiel #7
0
    def begin_transfer(self, response: FTPResponse):
        '''Start the WARC record for an FTP data transfer.

        Logs the opened data connection as a control event, then creates a
        ``resource`` record linked to the control record through
        ``WARC-Concurrent-To``. The record's block file is a new temporary
        file.
        '''
        hostname, port = response.data_address
        event_text = 'Opened data connection to {hostname}:{port}'.format(
            hostname=hostname, port=port)
        self._write_control_event(event_text)

        record = WARCRecord()
        self._response_record = record
        record.set_common_fields('resource', 'application/octet-stream')

        record.fields['WARC-Target-URI'] = self._request.url_info.url
        record.fields['WARC-IP-Address'] = self._request.address[0]

        control_record_id = \
            self._control_record.fields[WARCRecord.WARC_RECORD_ID]
        record.fields['WARC-Concurrent-To'] = control_record_id

        record.block_file = self._new_temp_file('warcresp')
Beispiel #8
0
    def begin_response(self, response: HTTPResponse):
        '''Start the WARC response record for the stored request.

        Asserts that the stored request's address looks like an IP address,
        then creates a response record that is linked to the request record
        through ``WARC-Concurrent-To`` and uses the response temp file as
        its block file.
        '''
        ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-f0-9:.]+)$'

        assert re.match(ip_pattern, self._request.address[0]), \
            'IP address needed, got {}'.format(self._request.address[0])

        record = WARCRecord()
        self._response_record = record
        record.set_common_fields(WARCRecord.RESPONSE, WARCRecord.TYPE_RESPONSE)

        record.fields['WARC-Target-URI'] = self._request.url_info.url
        record.fields['WARC-IP-Address'] = self._request.address[0]

        request_record_id = \
            self._request_record.fields[WARCRecord.WARC_RECORD_ID]
        record.fields['WARC-Concurrent-To'] = request_record_id

        record.block_file = self._response_temp_file
Beispiel #9
0
    def _start_new_warc_file(self, meta=False):
        '''Select the current WARC filename and write its warcinfo record.

        When ``max_size`` is set, ``meta`` is false and appending is
        enabled, the sequence number is advanced until the generated
        filename does not exist yet. Otherwise the filename is generated
        directly. The file is truncated unless appending is enabled.
        '''
        params = self._params
        skip_existing = params.max_size and not meta and params.appending

        if skip_existing:
            self._warc_filename = self._generate_warc_filename()

            # Keep bumping the sequence number until the name is unused.
            while os.path.exists(self._warc_filename):
                _logger.debug('Skip {0}', self._warc_filename)
                self._sequence_num += 1
                self._warc_filename = self._generate_warc_filename()
        else:
            self._warc_filename = self._generate_warc_filename(meta=meta)

        _logger.debug('WARC file at {0}', self._warc_filename)

        if not params.appending:
            wpull.util.truncate_file(self._warc_filename)

        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(params.extra_fields)
        self.write_record(self._warcinfo_record)
Beispiel #10
0
    def begin_control(self,
                      request: FTPRequest,
                      connection_reused: bool = False):
        '''Start the WARC record for the FTP control conversation.

        Stores the request and creates a ``metadata`` record backed by a new
        temporary file. The first control event written says whether the
        control connection is being opened or reused.
        '''
        self._request = request

        record = WARCRecord()
        self._control_record = record

        record.set_common_fields('metadata', 'text/x-ftp-control-conversation')
        record.fields['WARC-Target-URI'] = request.url_info.url
        record.fields['WARC-IP-Address'] = request.address[0]

        record.block_file = self._new_temp_file('warcctrl')

        hostname, port = self._request_hostname_port()

        connection_string = (
            'Reusing control connection to {hostname}:{port}'
            if connection_reused
            else 'Opening control connection to {hostname}:{port}'
        )
        event_text = connection_string.format(hostname=hostname, port=port)

        self._write_control_event(event_text)
Beispiel #11
0
    def _start_new_warc_file(self, meta=False):
        '''Create and set as current WARC file.

        A filename is generated directly when ``max_size`` is unset, when
        ``meta`` is true, or when appending is disabled. Otherwise existing
        files are skipped by incrementing the sequence number. The chosen
        file is truncated unless appending, and a fresh warcinfo record is
        populated and written to it.
        '''
        generate_directly = (
            not self._params.max_size or meta or not self._params.appending
        )

        if generate_directly:
            self._warc_filename = self._generate_warc_filename(meta=meta)
        else:
            candidate = self._generate_warc_filename()
            self._warc_filename = candidate

            while os.path.exists(self._warc_filename):
                _logger.debug('Skip {0}', self._warc_filename)
                self._sequence_num += 1
                self._warc_filename = self._generate_warc_filename()

        _logger.debug('WARC file at {0}', self._warc_filename)

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        warcinfo_record = WARCRecord()
        self._warcinfo_record = warcinfo_record
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)
Beispiel #12
0
    def _add_warc_action_log(self, path, url):
        '''Write the action log as a JSON ``metadata`` record.

        Each line of the file at ``path`` is parsed as one JSON document.
        The parsed values are re-serialized under the ``actions`` key and
        used as the record block. The record is kept on this object as the
        action record.
        '''
        _logger.debug('Adding action log record.')

        with open(path, 'r', encoding='utf-8', errors='replace') as log_file:
            actions = [json.loads(line) for line in log_file]

        serialized = json.dumps({'actions': actions}, indent=4)
        log_data = serialized.encode('utf-8')

        record = WARCRecord()
        self._action_warc_record = record
        record.set_common_fields('metadata', 'application/json')

        encoded_url = wpull.url.percent_encode_query_value(url)
        record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(encoded_url)
        record.block_file = io.BytesIO(log_data)

        recorder = self._warc_recorder
        recorder.set_length_and_maybe_checksums(record)
        recorder.write_record(record)
Beispiel #13
0
    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        If a log temp file is active, the log handler is flushed, removed
        from the root logger and its stream closed. The gzipped log is then
        stored as a ``resource`` record targeting ``urn:X-wpull:log``. When
        ``max_size`` is set, that record goes into a newly started ``-meta``
        WARC file. Finally the temp file is deleted and, when ``move_to`` is
        set, the WARC and CDX files are moved.
        '''
        if self._log_temp_file:
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            log_record = WARCRecord()
            log_record.block_file = gzip.GzipFile(
                filename=self._log_temp_file.name)

            # try/finally so the gzip reader is released even when starting
            # the meta file or writing the log record fails.
            try:
                log_record.set_common_fields('resource', 'text/plain')

                log_record.fields['WARC-Target-URI'] = \
                    'urn:X-wpull:log'

                if self._params.max_size is not None:
                    if self._params.move_to is not None:
                        self._move_file_to_dest_dir(self._warc_filename)

                    self._start_new_warc_file(meta=True)

                self.set_length_and_maybe_checksums(log_record)
                self.write_record(log_record)
            finally:
                log_record.block_file.close()

            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                # Best effort: failing to delete the temp file is only logged.
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)
Beispiel #14
0
class WARCRecorder(object):
    '''Record to WARC file.

    Args:
        filename (str): The filename (without the extension).
        params (:class:`WARCRecorderParams`): Parameters. When omitted, a
            default-constructed :class:`WARCRecorderParams` is used.
    '''
    CDX_DELIMINATOR = ' '
    '''Default CDX delimiter.'''
    DEFAULT_SOFTWARE_STRING = 'Wpull/{0} Python/{1}'.format(
        wpull.version.__version__, wpull.util.python_version()
    )
    '''Default software string.'''

    def __init__(self, filename, params=None):
        self._prefix_filename = filename
        self._params = params or WARCRecorderParams()
        self._warcinfo_record = None
        self._sequence_num = 0
        self._log_temp_file = None
        self._log_handler = None
        self._warc_filename = None
        self._cdx_filename = None

        self._check_journals_and_maybe_raise()

        # Read from self._params, not the raw argument: ``params`` may be
        # None, in which case the defaults created above apply.
        if self._params.log:
            self._setup_log()

        self._start_new_warc_file()

        if self._params.cdx:
            self._start_new_cdx_file()

    def _check_journals_and_maybe_raise(self):
        '''Check if any journal files exist and raise an error.

        Raises:
            OSError: A ``*-wpullinc`` journal file exists for this prefix.
        '''
        files = glob.glob(self._prefix_filename + '*-wpullinc')

        if files:
            raise OSError('WARC file {} is incomplete.'.format(files[0]))

    def _start_new_warc_file(self, meta=False):
        '''Create and set as current WARC file.

        When ``max_size`` is set, ``meta`` is false and appending is
        enabled, existing files are skipped by advancing the sequence
        number. The file is truncated unless appending, and a new warcinfo
        record is written to it.
        '''
        if self._params.max_size and not meta and self._params.appending:
            while True:
                self._warc_filename = self._generate_warc_filename()

                if os.path.exists(self._warc_filename):
                    _logger.debug('Skip {0}', self._warc_filename)
                    self._sequence_num += 1
                else:
                    break
        else:
            self._warc_filename = self._generate_warc_filename(meta=meta)

        _logger.debug('WARC file at {0}', self._warc_filename)

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)

    def _generate_warc_filename(self, meta=False):
        '''Return a suitable WARC filename.

        The name is the prefix, an optional ``-meta`` or zero-padded
        sequence suffix (only when ``max_size`` is set), and ``.warc`` or
        ``.warc.gz`` depending on compression.
        '''
        if self._params.max_size is None:
            sequence_name = ''
        elif meta:
            sequence_name = '-meta'
        else:
            sequence_name = '-{0:05d}'.format(self._sequence_num)

        if self._params.compress:
            extension = 'warc.gz'
        else:
            extension = 'warc'

        return '{0}{1}.{2}'.format(
            self._prefix_filename, sequence_name, extension
        )

    def _start_new_cdx_file(self):
        '''Create and set current CDX file.

        The header is written when the file is freshly truncated or does
        not exist yet; an existing file in appending mode is left as is.
        '''
        self._cdx_filename = '{0}.cdx'.format(self._prefix_filename)

        if not self._params.appending:
            wpull.util.truncate_file(self._cdx_filename)
            self._write_cdx_header()
        elif not os.path.exists(self._cdx_filename):
            self._write_cdx_header()

    def _populate_warcinfo(self, extra_fields=None):
        '''Add the metadata to the Warcinfo record.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs
                appended to the info fields.
        '''
        self._warcinfo_record.set_common_fields(
            WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

        info_fields = NameValueRecord(wrap_width=1024)
        info_fields['Software'] = self._params.software_string \
            or self.DEFAULT_SOFTWARE_STRING
        info_fields['format'] = 'WARC File Format 1.0'
        info_fields['conformsTo'] = \
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

        if extra_fields:
            for name, value in extra_fields:
                info_fields.add(name, value)

        self._warcinfo_record.block_file = io.BytesIO(
            bytes(info_fields) + b'\r\n')
        self._warcinfo_record.compute_checksum()

    def _setup_log(self):
        '''Set up the logging file.

        Attaches an INFO-level handler to the root logger that writes
        gzip-compressed UTF-8 text to a named temporary file.
        '''
        logger = logging.getLogger()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self._log_temp_file = NamedTemporaryFile(
            prefix='tmp-wpull-warc-',
            dir=self._params.temp_dir,
            suffix='.log.gz',
            delete=False,
        )
        self._log_temp_file.close()  # For Windows

        self._log_handler = handler = logging.StreamHandler(
            io.TextIOWrapper(
                gzip.GzipFile(
                    filename=self._log_temp_file.name, mode='wb'
                ),
                encoding='utf-8'
            )
        )

        logger.setLevel(logging.DEBUG)
        logger.debug('Wpull needs the root logger level set to DEBUG.')

        handler.setFormatter(formatter)
        logger.addHandler(handler)
        handler.setLevel(logging.INFO)

    def listen_to_http_client(self, client: HTTPClient):
        '''Register for the new-session event of the HTTP client.'''
        client.event_dispatcher.add_listener(HTTPClient.ClientEvent.new_session,
                                             self._http_session_callback)

    def _http_session_callback(self, http_session: HTTPSession):
        '''Attach a new recorder session to the HTTP session's events.'''
        recorder_session = self.new_http_recorder_session()

        http_session.event_dispatcher.add_listener(
            HTTPSession.Event.begin_request, recorder_session.begin_request)
        http_session.event_dispatcher.add_listener(
            HTTPSession.Event.request_data, recorder_session.request_data)
        http_session.event_dispatcher.add_listener(
            HTTPSession.Event.end_request, recorder_session.end_request)
        http_session.event_dispatcher.add_listener(
            HTTPSession.Event.begin_response, recorder_session.begin_response)
        http_session.event_dispatcher.add_listener(
            HTTPSession.Event.response_data, recorder_session.response_data)
        http_session.event_dispatcher.add_listener(
            HTTPSession.Event.end_response, recorder_session.end_response)

        # The recorder session is closed whenever the HTTP session ends;
        # the error argument is ignored.
        http_session.event_dispatcher.add_listener(
            HTTPSession.SessionEvent.end_session,
            lambda error: recorder_session.close()
        )

    def new_http_recorder_session(self) -> 'HTTPWARCRecorderSession':
        '''Return a new HTTP recorder session bound to this recorder.'''
        return HTTPWARCRecorderSession(
            self, temp_dir=self._params.temp_dir,
            url_table=self._params.url_table
        )

    def listen_to_ftp_client(self, client: FTPClient):
        '''Register for the new-session event of the FTP client.'''
        client.event_dispatcher.add_listener(FTPClient.ClientEvent.new_session,
                                             self._ftp_session_callback)

    def _ftp_session_callback(self, ftp_session: FTPSession):
        '''Attach a new recorder session to the FTP session's events.'''
        recorder_session = self.new_ftp_recorder_session()

        ftp_session.event_dispatcher.add_listener(
            FTPSession.Event.begin_control, recorder_session.begin_control)
        ftp_session.event_dispatcher.add_listener(
            FTPSession.Event.control_receive_data,
            recorder_session.control_receive_data)
        ftp_session.event_dispatcher.add_listener(
            FTPSession.Event.control_send_data,
            recorder_session.control_send_data)
        ftp_session.event_dispatcher.add_listener(
            FTPSession.Event.end_control, recorder_session.end_control)

        ftp_session.event_dispatcher.add_listener(
            FTPSession.Event.begin_transfer, recorder_session.begin_transfer)
        ftp_session.event_dispatcher.add_listener(
            FTPSession.Event.transfer_receive_data,
            recorder_session.transfer_receive_data)
        ftp_session.event_dispatcher.add_listener(
            FTPSession.Event.end_transfer, recorder_session.end_transfer)

        # The recorder session is closed whenever the FTP session ends;
        # the error argument is ignored.
        ftp_session.event_dispatcher.add_listener(
            FTPSession.SessionEvent.end_session,
            lambda error: recorder_session.close()
        )

    def new_ftp_recorder_session(self) -> 'FTPWARCRecorderSession':
        '''Return a new FTP recorder session bound to this recorder.'''
        return FTPWARCRecorderSession(
            self, temp_dir=self._params.temp_dir,
            url_table=self._params.url_table
        )

    def flush_session(self):
        '''Start a new WARC file if the current one exceeds ``max_size``.

        The finished file is moved first when ``move_to`` is set.
        '''
        if self._params.max_size is not None \
           and os.path.getsize(self._warc_filename) > self._params.max_size:
            self._sequence_num += 1

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

            _logger.debug('Starting new warc file due to max size.')
            self._start_new_warc_file()

    def _move_file_to_dest_dir(self, filename):
        '''Move the file to the ``move_to`` directory.

        If ``move_to`` is not an existing directory, an error is logged and
        the file is left in place.
        '''
        assert self._params.move_to

        if os.path.isdir(self._params.move_to):
            shutil.move(filename, self._params.move_to)
            # Report the file that was actually moved (it may be the CDX
            # file rather than the current WARC file), and only after the
            # move succeeded.
            _logger.debug('Moved {} to {}.', filename,
                          self._params.move_to)
        else:
            _logger.error('{} is not a directory; not moving {}.',
                          self._params.move_to, filename)

    def set_length_and_maybe_checksums(self, record, payload_offset=None):
        '''Set the content length and possibly the checksums.

        Checksums are computed only when digests are enabled in the
        parameters; otherwise only the content length is set.
        '''
        if self._params.digests:
            record.compute_checksum(payload_offset)
        else:
            record.set_content_length()

    def write_record(self, record):
        '''Append the record to the WARC file.

        A ``-wpullinc`` journal file holding the pre-write file size exists
        for the duration of the write. If the write fails with an OS error,
        the WARC file is cut back to that size and the error is re-raised.
        When a CDX file is active, a CDX line may be written afterwards.
        '''
        # FIXME: probably not a good idea to modifiy arguments passed to us
        # TODO: add extra gzip headers that wget uses
        record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
            WARCRecord.WARC_RECORD_ID]

        _logger.debug('Writing WARC record {0}.',
                      record.fields['WARC-Type'])

        if self._params.compress:
            open_func = gzip.GzipFile
        else:
            open_func = open

        # Use getsize to get actual file size. Avoid tell() because it may
        # not be the raw file position.
        if os.path.exists(self._warc_filename):
            before_offset = os.path.getsize(self._warc_filename)
        else:
            before_offset = 0

        journal_filename = self._warc_filename + '-wpullinc'

        with open(journal_filename, 'w') as file:
            file.write('wpull-journal-version:1\n')
            file.write('offset:{}\n'.format(before_offset))

        try:
            with open_func(self._warc_filename, mode='ab') as out_file:
                for data in record:
                    out_file.write(data)
        except (OSError, IOError):
            _logger.info(
                _('Rolling back file {filename} to length {length}.'),
                filename=self._warc_filename, length=before_offset
            )
            # Open with 'r+b': mode 'wb' would empty the file before the
            # truncate call and wipe every previously written record.
            if os.path.exists(self._warc_filename):
                with open(self._warc_filename, mode='r+b') as out_file:
                    out_file.truncate(before_offset)

            raise
        finally:
            os.remove(journal_filename)

        after_offset = os.path.getsize(self._warc_filename)

        if self._cdx_filename:
            raw_file_offset = before_offset
            raw_file_record_size = after_offset - before_offset

            self._write_cdx_field(
                record, raw_file_record_size, raw_file_offset
            )

    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When a log temp file is active, the gzipped log is written as a
        ``resource`` record with target URI ``urn:X-wpull:log`` (into a
        ``-meta`` WARC file when ``max_size`` is set), the temp file is
        removed and the handler is closed. When ``move_to`` is set, the
        WARC and CDX files are moved.
        '''
        if self._log_temp_file:
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            log_record = WARCRecord()
            log_record.block_file = gzip.GzipFile(
                filename=self._log_temp_file.name
            )

            # Ensure the gzip reader is closed even if rotating the WARC
            # file or writing the record raises.
            try:
                log_record.set_common_fields('resource', 'text/plain')

                log_record.fields['WARC-Target-URI'] = \
                    'urn:X-wpull:log'

                if self._params.max_size is not None:
                    if self._params.move_to is not None:
                        self._move_file_to_dest_dir(self._warc_filename)

                    self._start_new_warc_file(meta=True)

                self.set_length_and_maybe_checksums(log_record)
                self.write_record(log_record)
            finally:
                log_record.block_file.close()

            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                # Best effort: a leftover temp file must not abort closing.
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)

    def _write_cdx_header(self):
        '''Write the CDX header.

        It writes the fields:

        1. a: original URL
        2. b: UNIX timestamp
        3. m: MIME Type from the HTTP Content-type
        4. s: response code
        5. k: new style checksum
        6. S: raw file record size
        7. V: offset in raw file
        8. g: filename of raw file
        9. u: record ID
        '''
        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            out_file.write(self.CDX_DELIMINATOR)
            out_file.write(self.CDX_DELIMINATOR.join((
                'CDX',
                'a', 'b', 'm', 's',
                'k', 'S', 'V', 'g',
                'u'
            )))
            out_file.write('\n')

    def _write_cdx_field(self, record, raw_file_record_size, raw_file_offset):
        '''Write the CDX field if needed.

        Only records of type response whose content type matches
        ``application/http; msgtype=response`` produce a CDX line. Values
        that are unavailable are written as ``-``.
        '''
        if record.fields[WARCRecord.WARC_TYPE] != WARCRecord.RESPONSE \
           or not re.match(r'application/http; *msgtype *= *response',
                           record.fields[WARCRecord.CONTENT_TYPE]):
            return

        url = record.fields['WARC-Target-URI']

        _logger.debug('Writing CDX record {0}.', url)

        http_header = record.get_http_header()

        if http_header:
            mime_type = self.parse_mimetype(
                http_header.fields.get('Content-Type', '')
            ) or '-'
            response_code = str(http_header.status_code)
        else:
            mime_type = '-'
            response_code = '-'

        timestamp = str(int(
            wpull.util.parse_iso8601_str(record.fields[WARCRecord.WARC_DATE])
        ))

        checksum = record.fields.get('WARC-Payload-Digest', '')

        # Only SHA-1 payload digests are emitted; the prefix is stripped.
        if checksum.startswith('sha1:'):
            checksum = checksum.replace('sha1:', '', 1)
        else:
            checksum = '-'

        raw_file_record_size_str = str(raw_file_record_size)
        raw_file_offset_str = str(raw_file_offset)
        filename = os.path.basename(self._warc_filename)
        record_id = record.fields[WARCRecord.WARC_RECORD_ID]
        fields_strs = (
            url,
            timestamp,
            mime_type,
            response_code,
            checksum,
            raw_file_record_size_str,
            raw_file_offset_str,
            filename,
            record_id
        )

        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            out_file.write(self.CDX_DELIMINATOR.join(fields_strs))
            out_file.write('\n')

    @classmethod
    def parse_mimetype(cls, value):
        '''Return the MIME type from a Content-Type string.

        Returns:
            str, None: A string in the form ``type/subtype`` or None.
        '''
        match = re.match(r'([a-zA-Z0-9-]+/[a-zA-Z0-9-]+)', value)

        if match:
            return match.group(1)
Beispiel #15
0
class WARCRecorder(object):
    '''Record to WARC file.

    Args:
        filename (str): The filename (without the extension).
        params (:class:`WARCRecorderParams`): Parameters.
    '''
    CDX_DELIMINATOR = ' '
    '''Default CDX delimiter.'''
    DEFAULT_SOFTWARE_STRING = 'Wpull/{0} Python/{1}'.format(
        wpull.version.__version__, wpull.util.python_version())
    '''Default software string.'''
    def __init__(self, filename, params=None):
        '''Set up the recorder and start the first WARC (and CDX) file.

        Args:
            filename (str): The filename (without the extension).
            params (:class:`WARCRecorderParams`): Parameters. When omitted
                (or falsy) a default ``WARCRecorderParams()`` is used.

        Raises:
            OSError: If a leftover journal file matching the prefix exists
                (see :meth:`_check_journals_and_maybe_raise`).
        '''
        self._prefix_filename = filename
        self._params = params or WARCRecorderParams()
        self._warcinfo_record = None
        self._sequence_num = 0
        self._log_temp_file = None
        self._log_handler = None
        self._warc_filename = None
        self._cdx_filename = None

        self._check_journals_and_maybe_raise()

        # Read the setting from self._params, not the raw argument: the
        # argument may be None, in which case the defaults apply.
        if self._params.log:
            self._setup_log()

        self._start_new_warc_file()

        if self._params.cdx:
            self._start_new_cdx_file()

    def _check_journals_and_maybe_raise(self):
        '''Check if any journal files exist and raise an error.'''
        files = list(glob.glob(self._prefix_filename + '*-wpullinc'))

        if files:
            raise OSError('WARC file {} is incomplete.'.format(files[0]))

    def _start_new_warc_file(self, meta=False):
        '''Pick the next WARC filename and open it with a warcinfo record.

        When a maximum size is configured, the file is not a meta file and
        the recorder is appending, sequence numbers whose file already
        exists are skipped. Unless appending, the chosen file is truncated.
        A new warcinfo record is then populated and written.

        Args:
            meta (bool): Passed on to :meth:`_generate_warc_filename` to
                request the ``-meta`` filename.
        '''
        params = self._params

        if params.max_size and not meta and params.appending:
            # Advance the sequence number until an unused filename is found.
            self._warc_filename = self._generate_warc_filename()

            while os.path.exists(self._warc_filename):
                _logger.debug('Skip {0}', self._warc_filename)
                self._sequence_num += 1
                self._warc_filename = self._generate_warc_filename()
        else:
            self._warc_filename = self._generate_warc_filename(meta=meta)

        _logger.debug('WARC file at {0}', self._warc_filename)

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)

    def _generate_warc_filename(self, meta=False):
        '''Return a suitable WARC filename.'''
        if self._params.max_size is None:
            sequence_name = ''
        elif meta:
            sequence_name = '-meta'
        else:
            sequence_name = '-{0:05d}'.format(self._sequence_num)

        if self._params.compress:
            extension = 'warc.gz'
        else:
            extension = 'warc'

        return '{0}{1}.{2}'.format(self._prefix_filename, sequence_name,
                                   extension)

    def _start_new_cdx_file(self):
        '''Set the CDX filename and make sure the file has a header.

        When not appending, the file is truncated and a header written.
        When appending, a header is written only if the file does not
        exist yet.
        '''
        self._cdx_filename = '{0}.cdx'.format(self._prefix_filename)
        starting_fresh = not self._params.appending

        if starting_fresh:
            wpull.util.truncate_file(self._cdx_filename)

        if starting_fresh or not os.path.exists(self._cdx_filename):
            self._write_cdx_header()

    def _populate_warcinfo(self, extra_fields=None):
        '''Fill in the current warcinfo record.

        Sets the common WARC fields, builds the block from the software
        string, format and conformance fields plus any extra fields, and
        computes the checksum.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs to
                add to the warcinfo block.
        '''
        warcinfo = self._warcinfo_record
        warcinfo.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

        info = NameValueRecord(wrap_width=1024)
        software = self._params.software_string or self.DEFAULT_SOFTWARE_STRING
        info['Software'] = software
        info['format'] = 'WARC File Format 1.0'
        info['conformsTo'] = \
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

        for name, value in extra_fields or ():
            info.add(name, value)

        payload = bytes(info) + b'\r\n'
        warcinfo.block_file = io.BytesIO(payload)
        warcinfo.compute_checksum()

    def _setup_log(self):
        '''Attach a gzip-compressed log handler to the root logger.

        A named temporary ``.log.gz`` file is created in the configured
        temporary directory and kept on disk (``delete=False``). The root
        logger level is set to DEBUG while the new handler itself is
        limited to INFO.
        '''
        root_logger = logging.getLogger()
        log_format = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        temp_file = NamedTemporaryFile(
            prefix='tmp-wpull-warc-',
            dir=self._params.temp_dir,
            suffix='.log.gz',
            delete=False,
        )
        self._log_temp_file = temp_file
        temp_file.close()  # For Windows

        # Reopen the temporary file by name as a gzip text stream.
        gzip_stream = gzip.GzipFile(filename=temp_file.name, mode='wb')
        text_stream = io.TextIOWrapper(gzip_stream, encoding='utf-8')
        log_handler = logging.StreamHandler(text_stream)
        self._log_handler = log_handler

        root_logger.setLevel(logging.DEBUG)
        root_logger.debug('Wpull needs the root logger level set to DEBUG.')

        log_handler.setFormatter(log_format)
        root_logger.addHandler(log_handler)
        log_handler.setLevel(logging.INFO)

    def listen_to_http_client(self, client: HTTPClient):
        '''Register for the HTTP client's new-session event.'''
        dispatcher = client.event_dispatcher
        dispatcher.add_listener(HTTPClient.ClientEvent.new_session,
                                self._http_session_callback)

    def _http_session_callback(self, http_session: HTTPSession):
        '''Wire a new recorder session to the HTTP session's events.

        Each request/response event is forwarded to the recorder method of
        the same name; the end-of-session event closes the recorder.
        '''
        recorder = self.new_http_recorder_session()
        dispatcher = http_session.event_dispatcher
        events = HTTPSession.Event

        for event, callback in (
                (events.begin_request, recorder.begin_request),
                (events.request_data, recorder.request_data),
                (events.end_request, recorder.end_request),
                (events.begin_response, recorder.begin_response),
                (events.response_data, recorder.response_data),
                (events.end_response, recorder.end_response),
        ):
            dispatcher.add_listener(event, callback)

        dispatcher.add_listener(HTTPSession.SessionEvent.end_session,
                                lambda error: recorder.close())

    def new_http_recorder_session(self) -> 'HTTPWARCRecorderSession':
        '''Create an HTTP recorder session bound to this recorder.'''
        params = self._params

        return HTTPWARCRecorderSession(
            self, temp_dir=params.temp_dir, url_table=params.url_table)

    def listen_to_ftp_client(self, client: FTPClient):
        '''Register for the FTP client's new-session event.'''
        dispatcher = client.event_dispatcher
        dispatcher.add_listener(FTPClient.ClientEvent.new_session,
                                self._ftp_session_callback)

    def _ftp_session_callback(self, ftp_session: FTPSession):
        '''Wire a new recorder session to the FTP session's events.

        Each control/transfer event is forwarded to the recorder method of
        the same name; the end-of-session event closes the recorder.
        '''
        recorder = self.new_ftp_recorder_session()
        dispatcher = ftp_session.event_dispatcher
        events = FTPSession.Event

        for event, callback in (
                (events.begin_control, recorder.begin_control),
                (events.control_receive_data, recorder.control_receive_data),
                (events.control_send_data, recorder.control_send_data),
                (events.end_control, recorder.end_control),
                (events.begin_transfer, recorder.begin_transfer),
                (events.transfer_receive_data,
                 recorder.transfer_receive_data),
                (events.end_transfer, recorder.end_transfer),
        ):
            dispatcher.add_listener(event, callback)

        dispatcher.add_listener(FTPSession.SessionEvent.end_session,
                                lambda error: recorder.close())

    def new_ftp_recorder_session(self) -> 'FTPWARCRecorderSession':
        '''Create an FTP recorder session bound to this recorder.'''
        params = self._params

        return FTPWARCRecorderSession(
            self, temp_dir=params.temp_dir, url_table=params.url_table)

    def flush_session(self):
        if self._params.max_size is not None \
           and os.path.getsize(self._warc_filename) > self._params.max_size:
            self._sequence_num += 1

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

            _logger.debug('Starting new warc file due to max size.')
            self._start_new_warc_file()

    def _move_file_to_dest_dir(self, filename):
        '''Move the file to the ``move_to`` directory.

        If ``move_to`` is not an existing directory, an error is logged and
        the file is left where it is.

        Args:
            filename (str): Path of the file to move.
        '''
        assert self._params.move_to

        if os.path.isdir(self._params.move_to):
            shutil.move(filename, self._params.move_to)
            # Log the file that was actually moved (this method is also
            # used for the CDX file), and only once the move succeeded.
            _logger.debug('Moved {} to {}.', filename,
                          self._params.move_to)
        else:
            _logger.error('{} is not a directory; not moving {}.',
                          self._params.move_to, filename)

    def set_length_and_maybe_checksums(self, record, payload_offset=None):
        '''Finalize the record's length, with checksums if digests are on.

        Args:
            record: The WARC record to update.
            payload_offset: Passed to ``record.compute_checksum`` when
                digests are enabled; otherwise unused.
        '''
        if not self._params.digests:
            record.set_content_length()
            return

        record.compute_checksum(payload_offset)

    def write_record(self, record):
        '''Append the record to the WARC file.

        The record's ``WARC-Warcinfo-ID`` field is set to the ID of the
        current warcinfo record. Before writing, a journal file named
        ``<warc filename>-wpullinc`` is created containing the size of the
        WARC file at that moment; it is removed once the write has finished
        or failed. If writing fails, the WARC file is cut back to its
        previous size and the error is re-raised. When a CDX file is in
        use, the record is afterwards passed to :meth:`_write_cdx_field`
        together with its offset and on-disk size.

        Args:
            record: The WARC record. Iterating it yields the bytes to
                write.

        Raises:
            OSError: If writing to the WARC file fails.
        '''
        # FIXME: probably not a good idea to modify arguments passed to us
        # TODO: add extra gzip headers that wget uses
        record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
            WARCRecord.WARC_RECORD_ID]

        _logger.debug('Writing WARC record {0}.', record.fields['WARC-Type'])

        if self._params.compress:
            open_func = gzip.GzipFile
        else:
            open_func = open

        # Use getsize to get actual file size. Avoid tell() because it may
        # not be the raw file position.
        if os.path.exists(self._warc_filename):
            before_offset = os.path.getsize(self._warc_filename)
        else:
            before_offset = 0

        journal_filename = self._warc_filename + '-wpullinc'

        with open(journal_filename, 'w') as file:
            file.write('wpull-journal-version:1\n')
            file.write('offset:{}\n'.format(before_offset))

        try:
            with open_func(self._warc_filename, mode='ab') as out_file:
                for data in record:
                    out_file.write(data)
        except (OSError, IOError):
            _logger.info(_('Rolling back file {filename} to length {length}.'),
                         filename=self._warc_filename,
                         length=before_offset)

            # Open for update ('r+b'), never 'wb': 'wb' empties the file on
            # open, which would throw away every record written before this
            # one instead of only the partial record.
            if os.path.exists(self._warc_filename):
                with open(self._warc_filename, mode='r+b') as out_file:
                    out_file.truncate(before_offset)

            # Bare raise keeps the original traceback.
            raise
        finally:
            os.remove(journal_filename)

        after_offset = os.path.getsize(self._warc_filename)

        if self._cdx_filename:
            raw_file_offset = before_offset
            raw_file_record_size = after_offset - before_offset

            self._write_cdx_field(record, raw_file_record_size,
                                  raw_file_offset)

    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When a log file was set up, the log handler is detached from the
        root logger and the compressed log is written into the WARC as a
        ``resource`` record with the target URI ``urn:X-wpull:log``. If a
        maximum size is configured, that record goes into a separate meta
        WARC file (after moving the current file when a destination
        directory is set). The temporary log file is then deleted and the
        WARC file moved to the destination directory, if any. Finally the
        CDX file is moved as well when both it and a destination directory
        are configured.
        '''
        if self._log_temp_file:
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            log_record = WARCRecord()
            log_record.block_file = gzip.GzipFile(
                filename=self._log_temp_file.name)

            # Make sure the reader on the log file is closed even when
            # rotating or writing the record fails.
            try:
                log_record.set_common_fields('resource', 'text/plain')

                log_record.fields['WARC-Target-URI'] = \
                    'urn:X-wpull:log'

                if self._params.max_size is not None:
                    if self._params.move_to is not None:
                        self._move_file_to_dest_dir(self._warc_filename)

                    self._start_new_warc_file(meta=True)

                self.set_length_and_maybe_checksums(log_record)
                self.write_record(log_record)
            finally:
                log_record.block_file.close()

            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)

    def _write_cdx_header(self):
        '''Write the CDX header.

        It writes the fields:

        1. a: original URL
        2. b: UNIX timestamp
        3. m: MIME Type from the HTTP Content-type
        4. s: response code
        5. k: new style checksum
        6. S: raw file record size
        7. V: offset in raw file
        8. g: filename of raw file
        9. u: record ID
        '''
        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            out_file.write(self.CDX_DELIMINATOR)
            out_file.write(
                self.CDX_DELIMINATOR.join(
                    ('CDX', 'a', 'b', 'm', 's', 'k', 'S', 'V', 'g', 'u')))
            out_file.write('\n')

    def _write_cdx_field(self, record, raw_file_record_size, raw_file_offset):
        '''Append a CDX line for the record if it is an HTTP response.

        Records whose WARC type is not ``response`` or whose content type
        is not an HTTP response message are ignored. Missing values are
        written as ``-``.

        Args:
            record: The WARC record that was just written.
            raw_file_record_size: Size of the record in the WARC file.
            raw_file_offset: Offset of the record in the WARC file.
        '''
        if record.fields[WARCRecord.WARC_TYPE] != WARCRecord.RESPONSE:
            return

        if not re.match(r'application/http; *msgtype *= *response',
                        record.fields[WARCRecord.CONTENT_TYPE]):
            return

        url = record.fields['WARC-Target-URI']

        _logger.debug('Writing CDX record {0}.', url)

        http_header = record.get_http_header()
        mime_type = '-'
        response_code = '-'

        if http_header:
            content_type = http_header.fields.get('Content-Type', '')
            mime_type = self.parse_mimetype(content_type) or '-'
            response_code = str(http_header.status_code)

        warc_date = record.fields[WARCRecord.WARC_DATE]
        timestamp = str(int(wpull.util.parse_iso8601_str(warc_date)))

        # Only SHA-1 payload digests are recorded, without their prefix.
        digest = record.fields.get('WARC-Payload-Digest', '')
        checksum = digest[len('sha1:'):] if digest.startswith('sha1:') else '-'

        columns = (
            url,
            timestamp,
            mime_type,
            response_code,
            checksum,
            str(raw_file_record_size),
            str(raw_file_offset),
            os.path.basename(self._warc_filename),
            record.fields[WARCRecord.WARC_RECORD_ID],
        )
        cdx_line = self.CDX_DELIMINATOR.join(columns) + '\n'

        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            out_file.write(cdx_line)

    @classmethod
    def parse_mimetype(cls, value):
        '''Return the MIME type from a Content-Type string.

        Returns:
            str, None: A string in the form ``type/subtype`` or None.
        '''
        match = re.match(r'([a-zA-Z0-9-]+/[a-zA-Z0-9-]+)', value)

        if match:
            return match.group(1)