def _write_warc_metadata(self):
    '''Write the JSON metadata to WARC.

    Uses pywb spec.

    Every file matching ``<path prefix>*.info.json`` is written as its own
    ``metadata`` record. All of these records share one target URI built
    as ``metadata://`` followed by the authority and resource of the
    current URL item. If no file matches the pattern, a warning is logged
    and nothing is written.
    '''
    uri = 'metadata://{}{}'.format(self._url_item.url_info.authority,
                                   self._url_item.url_info.resource)

    glob_pattern = self._path_prefix + '*.info.json'
    # glob.glob() already returns a list, so no extra copy is needed.
    filenames = glob.glob(glob_pattern)

    if not filenames:
        _logger.warning(__(
            _('Could not find external process metadata file: {filename}'),
            filename=glob_pattern
        ))
        return

    for filename in filenames:
        record = WARCRecord()
        record.set_common_fields(
            'metadata', 'application/vnd.youtube-dl_formats+json')
        record.fields['WARC-Target-URI'] = uri

        # The context manager closes the file even when computing the
        # length/checksums or writing the record raises, so the handle
        # can no longer leak on the error path.
        with open(filename, 'rb') as in_file:
            record.block_file = in_file
            self._warc_recorder.set_length_and_maybe_checksums(record)
            self._warc_recorder.write_record(record)
def _add_warc_snapshot(self, filename, url):
    '''Add the snapshot to the WARC file.

    The content type is looked up from the extension of ``filename``;
    only ``.pdf``, ``.html``, ``.png`` and ``.gif`` are known, and any
    other extension raises ``KeyError``. The record is written as a
    ``resource`` record whose target URI embeds the percent-encoded
    ``url``. When an action record is present, the snapshot is marked as
    concurrent to it.
    '''
    _logger.debug('Adding snapshot record.')

    _, suffix = os.path.splitext(filename)
    content_types_by_suffix = {
        '.pdf': 'application/pdf',
        '.html': 'text/html',
        '.png': 'image/png',
        '.gif': 'image/gif',
    }
    content_type = content_types_by_suffix[suffix]

    snapshot = WARCRecord()
    snapshot.set_common_fields('resource', content_type)

    encoded_url = wpull.url.percent_encode_query_value(url)
    snapshot.fields['WARC-Target-URI'] = \
        'urn:X-wpull:snapshot?url={0}'.format(encoded_url)

    if self._action_warc_record:
        # Link the snapshot to the action record via its record ID.
        action_fields = self._action_warc_record.fields
        snapshot.fields['WARC-Concurrent-To'] = \
            action_fields[WARCRecord.WARC_RECORD_ID]

    recorder = self._warc_recorder
    with open(filename, 'rb') as snapshot_file:
        snapshot.block_file = snapshot_file
        recorder.set_length_and_maybe_checksums(snapshot)
        recorder.write_record(snapshot)
def _write_warc_metadata(self):
    '''Write the JSON metadata to WARC.

    Uses pywb spec.

    Looks for files named ``<path prefix>*.info.json`` and emits one
    ``metadata`` record per file. The target URI of each record is
    ``metadata://`` plus the authority and resource of the current URL
    item. A warning is logged, and the method returns without writing
    anything, when the pattern matches no file.
    '''
    url_info = self._url_item.url_info
    uri = 'metadata://{}{}'.format(url_info.authority, url_info.resource)

    glob_pattern = self._path_prefix + '*.info.json'
    # No list() wrapper: glob.glob() returns a list already.
    filenames = glob.glob(glob_pattern)

    if not filenames:
        _logger.warning(__(
            _('Could not find external process metadata file: {filename}'),
            filename=glob_pattern
        ))
        return

    for filename in filenames:
        record = WARCRecord()
        record.set_common_fields('metadata',
                                 'application/vnd.youtube-dl_formats+json')
        record.fields['WARC-Target-URI'] = uri

        # Open the block file in a ``with`` statement so that it is always
        # closed, including when the recorder raises while measuring or
        # writing the record.
        with open(filename, 'rb') as in_file:
            record.block_file = in_file
            self._warc_recorder.set_length_and_maybe_checksums(record)
            self._warc_recorder.write_record(record)
def _add_warc_snapshot(self, filename, content_type, url):
    '''Add the snapshot file to the WARC file as a resource record.

    The record carries the given ``content_type`` and a target URI of the
    form ``urn:X-wpull:snapshot?url=<quoted url>``. The contents of
    ``filename`` are used as the record block.
    '''
    _logger.debug('Adding snapshot record.')

    snapshot = WARCRecord()
    snapshot.set_common_fields('resource', content_type)

    quoted_url = wpull.url.quote(url)
    snapshot.fields['WARC-Target-URI'] = \
        'urn:X-wpull:snapshot?url={0}'.format(quoted_url)

    recorder = self._warc_recorder
    with open(filename, 'rb') as snapshot_file:
        snapshot.block_file = snapshot_file
        recorder.set_length_and_maybe_checksums(snapshot)
        recorder.write_record(snapshot)
def _add_warc_snapshot(self, filename, content_type, url):
    '''Add the snapshot to the WARC file.

    Writes the contents of ``filename`` as a ``resource`` record with the
    given ``content_type``. The target URI has the form
    ``urn:X-wpull:snapshot?url=<quoted url>``. When an action record is
    present, the snapshot is marked as concurrent to it.
    '''
    _logger.debug('Adding snapshot record.')

    snapshot = WARCRecord()
    snapshot.set_common_fields('resource', content_type)

    quoted_url = wpull.url.quote(url)
    snapshot.fields['WARC-Target-URI'] = \
        'urn:X-wpull:snapshot?url={0}'.format(quoted_url)

    if self._action_warc_record:
        # Link the snapshot to the action record via its record ID.
        action_fields = self._action_warc_record.fields
        snapshot.fields['WARC-Concurrent-To'] = \
            action_fields[WARCRecord.WARC_RECORD_ID]

    recorder = self._warc_recorder
    with open(filename, 'rb') as snapshot_file:
        snapshot.block_file = snapshot_file
        recorder.set_length_and_maybe_checksums(snapshot)
        recorder.write_record(snapshot)
def close(self):
    '''Close the WARC file and clean up any logging handlers.

    When a log temp file is active, the log handler is flushed, detached
    from the root logger and its stream closed. The temp file is then
    read back through gzip and written as a ``resource`` record with the
    target URI ``urn:X-wpull:log``. If a maximum WARC size is configured,
    the current WARC file is first moved (when a destination is set) and
    a new one is started with ``meta=True`` to hold the log record.
    Afterwards the temp file is removed; failing to remove it is logged
    and otherwise ignored.

    Finally, the WARC file and the CDX file (if any) are moved to the
    destination directory when one is configured.
    '''
    if self._log_temp_file:
        self._log_handler.flush()

        logger = logging.getLogger()
        logger.removeHandler(self._log_handler)
        self._log_handler.stream.close()

        log_record = WARCRecord()

        # The GzipFile is used as a context manager so that it is closed
        # even if rotating the WARC file or writing the record raises.
        with gzip.GzipFile(filename=self._log_temp_file.name) as log_file:
            log_record.block_file = log_file
            log_record.set_common_fields('resource', 'text/plain')
            log_record.fields['WARC-Target-URI'] = 'urn:X-wpull:log'

            if self._params.max_size is not None:
                if self._params.move_to is not None:
                    self._move_file_to_dest_dir(self._warc_filename)

                self._start_new_warc_file(meta=True)

            self.set_length_and_maybe_checksums(log_record)
            self.write_record(log_record)

        try:
            os.remove(self._log_temp_file.name)
        except OSError:
            _logger.exception('Could not close log temp file.')

        self._log_temp_file = None

        self._log_handler.close()
        self._log_handler = None

        if self._params.move_to is not None:
            self._move_file_to_dest_dir(self._warc_filename)

    if self._cdx_filename and self._params.move_to is not None:
        self._move_file_to_dest_dir(self._cdx_filename)
def close(self):
    '''Close the WARC file and clean up any logging handlers.

    If a log temp file exists, the log handler is flushed, removed from
    the root logger and its stream closed. The temp file is then opened
    through gzip and written as a ``text/plain`` ``resource`` record
    addressed as ``urn:X-wpull:log``. With a maximum WARC size set, the
    current WARC file is moved first (when a destination directory is
    set) and a new file is started with ``meta=True`` before the log
    record is written. The temp file is removed afterwards; an
    ``OSError`` during removal is logged rather than raised.

    The WARC file, and the CDX file if there is one, are moved to the
    destination directory when one is configured.
    '''
    if self._log_temp_file:
        self._log_handler.flush()

        root_logger = logging.getLogger()
        root_logger.removeHandler(self._log_handler)
        self._log_handler.stream.close()

        log_record = WARCRecord()

        # ``with`` guarantees the gzip reader is closed on every path,
        # including when starting a new WARC file or writing the record
        # fails part-way through.
        with gzip.GzipFile(filename=self._log_temp_file.name) as log_file:
            log_record.block_file = log_file
            log_record.set_common_fields('resource', 'text/plain')
            log_record.fields['WARC-Target-URI'] = 'urn:X-wpull:log'

            if self._params.max_size is not None:
                if self._params.move_to is not None:
                    self._move_file_to_dest_dir(self._warc_filename)

                self._start_new_warc_file(meta=True)

            self.set_length_and_maybe_checksums(log_record)
            self.write_record(log_record)

        try:
            os.remove(self._log_temp_file.name)
        except OSError:
            _logger.exception('Could not close log temp file.')

        self._log_temp_file = None

        self._log_handler.close()
        self._log_handler = None

        if self._params.move_to is not None:
            self._move_file_to_dest_dir(self._warc_filename)

    if self._cdx_filename and self._params.move_to is not None:
        self._move_file_to_dest_dir(self._cdx_filename)