def parse_warc_record(self, record):
    """ Parse warc record
    """
    entry = self._create_index_entry(record.rec_type)

    if record.rec_type == 'warcinfo':
        entry['url'] = record.rec_headers.get_header('WARC-Filename')
        entry['urlkey'] = entry['url']
        entry['_warcinfo'] = record.stream.read(record.length)
        return entry

    entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')

    # timestamp
    entry['timestamp'] = iso_date_to_timestamp(
        record.rec_headers.get_header('WARC-Date'))

    # mime
    if record.rec_type == 'revisit':
        entry['mime'] = 'warc/revisit'
    elif self.options.get('minimal'):
        entry['mime'] = '-'
    else:
        def_mime = '-' if record.rec_type == 'request' else 'unk'
        entry.extract_mime(
            record.status_headers.get_header('Content-Type'),
            def_mime)

    # status -- only for response records (by convention)
    if record.rec_type == 'response' and not self.options.get('minimal'):
        entry.extract_status(record.status_headers)
    else:
        entry['status'] = '-'

    # digest
    digest = record.rec_headers.get_header('WARC-Payload-Digest')
    entry['digest'] = digest
    if digest and digest.startswith('sha1:'):
        entry['digest'] = digest[len('sha1:'):]
    elif not entry.get('digest'):
        entry['digest'] = '-'

    # optional json metadata, if present
    metadata = record.rec_headers.get_header('WARC-Json-Metadata')
    if metadata:
        entry['metadata'] = metadata

    return entry
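
# For reference: iso_date_to_timestamp converts a WARC-Date header value
# into the 14-digit timestamp stored in the index entry. Below is a minimal
# sketch of that conversion -- an illustrative re-implementation, not the
# actual library function -- assuming the basic 'YYYY-MM-DDThh:mm:ssZ'
# form without fractional seconds.
from datetime import datetime

def _iso_date_to_timestamp_sketch(iso_date):
    # e.g. '2014-01-03T03:03:21Z' -> '20140103030321'
    dt = datetime.strptime(iso_date, '%Y-%m-%dT%H:%M:%SZ')
    return dt.strftime('%Y%m%d%H%M%S')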
def _load_different_url_payload(self, cdx, headers_record,
                                failed_files, cdx_loader):
    """
    Handle the case where a duplicate of a capture with same digest
    exists at a different url.

    If a cdx_server is provided, a query is made for matching
    url, timestamp and digest.

    Raise exception if no matches found.
    """
    ref_target_uri = (headers_record.rec_headers.
                      get_header('WARC-Refers-To-Target-URI'))

    target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

    # if no target uri, no way to find the original
    if not ref_target_uri:
        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

    ref_target_date = (headers_record.rec_headers.
                       get_header('WARC-Refers-To-Date'))

    if not ref_target_date:
        ref_target_date = cdx['timestamp']
    else:
        ref_target_date = iso_date_to_timestamp(ref_target_date)

    digest = cdx.get('digest', '-')

    try:
        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                digest,
                                                cdx_loader)
    except NotFoundException:
        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

    for orig_cdx in orig_cdx_lines:
        try:
            payload_record = self._resolve_path_load(orig_cdx, False,
                                                     failed_files)
            return payload_record

        except ArchiveLoadFailed:
            pass

    raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
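
# For context, the revisit record headers consulted above look roughly like
# this (values are hypothetical; the header names come from the WARC
# standard's revisit convention):
#
#   WARC-Type: revisit
#   WARC-Target-URI: http://example.com/page?v=2
#   WARC-Payload-Digest: sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
#   WARC-Refers-To-Target-URI: http://example.com/page
#   WARC-Refers-To-Date: 2014-01-03T03:03:21Z
#
# i.e. the record declares which earlier capture (url + date) holds the
# payload that was deduplicated away, which is what this method chases.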
def _load_different_url_payload(self, cdx, headers_record,
                                failed_files, cdx_loader):
    """
    Handle the case where a duplicate of a capture with same digest
    exists at a different url.

    If a cdx_server is provided, a query is made for matching
    url, timestamp and digest.

    Raise exception if no matches found.
    """
    ref_target_uri = (headers_record.rec_headers.
                      get_header('WARC-Refers-To-Target-URI'))

    target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

    # raise unresolved revisit error if the refers-to target uri
    # is not present or is the same as the current url
    if not ref_target_uri or (ref_target_uri == target_uri):
        raise ArchiveLoadFailed('Missing Revisit Original')

    ref_target_date = (headers_record.rec_headers.
                       get_header('WARC-Refers-To-Date'))

    if not ref_target_date:
        ref_target_date = cdx['timestamp']
    else:
        ref_target_date = iso_date_to_timestamp(ref_target_date)

    digest = cdx['digest']

    orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                            ref_target_date,
                                            digest,
                                            cdx_loader)

    # loop variable renamed so it doesn't shadow the 'cdx' parameter
    for orig_cdx in orig_cdx_lines:
        try:
            payload_record = self._resolve_path_load(orig_cdx, False,
                                                     failed_files)
            return payload_record

        except ArchiveLoadFailed:
            pass

    raise ArchiveLoadFailed('Original for revisit could not be loaded')
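
# Illustrative shape of one entry that load_cdx_for_dupe might yield for
# the original capture (all values hypothetical; field names follow the
# usual CDX(J) convention), so that _resolve_path_load can locate the
# actual payload record:
#
#   {'urlkey': 'com,example)/page',
#    'timestamp': '20140103030321',
#    'url': 'http://example.com/page',
#    'mime': 'text/html',
#    'status': '200',
#    'digest': 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',
#    'offset': '334',
#    'filename': 'example.warc.gz'}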
def parse_warc_record(record):
    """ Parse warc record
    """
    entry = ArchiveIndexEntry()

    if record.rec_type == 'warcinfo':
        entry.url = record.rec_headers.get_header('WARC-Filename')
        entry.key = entry.url
        entry.warcinfo = record.stream.read(record.length)
        return entry

    entry.url = record.rec_headers.get_header('WARC-Target-Uri')

    # timestamp
    entry.timestamp = iso_date_to_timestamp(
        record.rec_headers.get_header('WARC-Date'))

    # mime
    if record.rec_type == 'revisit':
        entry.mime = 'warc/revisit'
    else:
        def_mime = '-' if record.rec_type == 'request' else 'unk'
        entry.extract_mime(
            record.status_headers.get_header('Content-Type'),
            def_mime)

    # status -- only for response records (by convention)
    if record.rec_type == 'response':
        entry.extract_status(record.status_headers)
    else:
        entry.status = '-'

    # digest
    entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
    if entry.digest and entry.digest.startswith('sha1:'):
        entry.digest = entry.digest[len('sha1:'):]

    if not entry.digest:
        entry.digest = '-'

    return entry
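
# The standalone version above predates _create_index_entry and stores
# fields as attributes on an ArchiveIndexEntry object. Below is a minimal
# sketch of the container it assumes -- illustrative only; the real class
# in pywb carries more logic, and get_statuscode() on status_headers is
# an assumption based on the StatusAndHeaders API.
class _ArchiveIndexEntrySketch(object):
    mime = None
    status = None
    digest = None

    def extract_mime(self, mime, def_mime='unk'):
        # keep only the media type, dropping any '; charset=...' suffix
        self.mime = def_mime
        if mime:
            self.mime = mime.split(';')[0].strip()

    def extract_status(self, status_headers):
        # first token of the HTTP status line, e.g. '200' from '200 OK'
        self.status = status_headers.get_statuscode() or '-'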
def lookup_revisit(self, params, digest, url, iso_dt):
    # look up the original capture for a revisit record:
    # same url, closest in time, matching payload digest
    params['url'] = url
    params['closest'] = iso_date_to_timestamp(iso_dt)

    filters = []

    # exclude other revisit records from the results
    filters.append('!mime:warc/revisit')

    # strip any 'sha1:' prefix before filtering on the digest
    if digest and digest != '-':
        filters.append('digest:' + digest.split(':')[-1])

    params['filter'] = filters

    cdx_iter, errs = self.cdx_lookup(params)

    for cdx in cdx_iter:
        res = self.dupe_policy(cdx, params)
        if res:
            return res

    return None
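
# The filter strings above use the pywb cdx-server filter syntax:
# '!mime:warc/revisit' excludes revisit records themselves, and
# 'digest:<sha1>' keeps only captures whose payload digest matches.
# A hypothetical resulting query (illustrative values):
#
#   params = {'url': 'http://example.com/page',
#             'closest': '20140103030321',
#             'filter': ['!mime:warc/revisit',
#                        'digest:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A']}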
def write_snapshot(self, user, coll, url, title,
                   html_text, referrer, user_agent, browser=None):
    snap_title = 'Static Snapshots'

    snap_rec = self.sanitize_title(snap_title)

    # ensure the shared 'Static Snapshots' recording exists
    if not self.manager.has_recording(user, coll, snap_rec):
        recording = self.manager.create_recording(user, coll,
                                                  snap_rec, snap_title)

    kwargs = dict(user=user,
                  coll=quote(coll),
                  rec=quote(snap_rec, safe='/*'),
                  type='snapshot')

    params = {'url': url}

    upstream_url = self.manager.content_app.get_upstream_url('', kwargs, params)

    headers = {'Content-Type': 'text/html; charset=utf-8',
               'WARC-User-Agent': user_agent,
               'WARC-Referer': referrer,
              }

    # PUT the rendered html upstream to be written into the warc
    r = requests.put(upstream_url,
                     data=BytesIO(html_text.encode('utf-8')),
                     headers=headers,
                    )

    try:
        res = r.json()
        if res['success'] != 'true':
            print(res)
            return {'error_message': 'Snapshot Failed'}

        warc_date = res.get('WARC-Date')

    except Exception as e:
        print(e)
        return {'error_message': 'Snapshot Failed'}

    # untitled snapshots are written but not added as pages
    if not title:
        return {'snapshot': ''}

    if warc_date:
        timestamp = iso_date_to_timestamp(warc_date)
    else:
        timestamp = timestamp_now()

    page_data = {'url': url,
                 'title': title,
                 'timestamp': timestamp,
                 'tags': ['snapshot'],
                }

    if browser:
        page_data['browser'] = browser

    res = self.manager.add_page(user, coll, snap_rec, page_data)

    return {'snapshot': page_data}
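
# Hypothetical call (all argument values are illustrative):
#
#   result = self.write_snapshot(user='alice', coll='my-coll',
#                                url='http://example.com/',
#                                title='Example Page',
#                                html_text='<html>...</html>',
#                                referrer='http://example.com/prev',
#                                user_agent='Mozilla/5.0',
#                                browser='chrome')
#
# On success, result['snapshot'] holds the recorded page metadata
# (or '' if no title was given); on failure, result['error_message']
# is set instead.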