def process(record, out, options):
    """Optionally decode a response record's HTTP payload, then write the record.

    When ``options.decode_http`` is set and ``record`` is a response record
    whose content type is that of an HTTP request or response message, the
    payload is fed through the matching parser.  If it parses completely with
    nothing left over, the record content is replaced by the decoded message;
    otherwise a diagnostic is printed to stderr and the content is left as-is.
    The record is written to ``out`` in every case, gzipped according to
    ``options.gzip``.
    """
    if options.wget_workaround:
        ignore_headers = WGET_IGNORE_HEADERS
    else:
        ignore_headers = ()

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content
        parser = None
        if content_type == ResponseMessage.CONTENT_TYPE:
            # Strictly speaking a response can only be parsed alongside its
            # request, since responses to HEAD requests carry no body.  We
            # assume HEAD responses are never stored and carry on regardless.
            parser = ResponseMessage(RequestMessage(),
                                     ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            parser = RequestMessage(ignore_headers=ignore_headers)

        if parser:
            leftover = parser.feed(content)
            parser.close()
            if leftover or not parser.complete():
                # Report every reason the payload could not be decoded.
                problems = []
                if leftover:
                    problems.append("%d bytes unparsed" % len(leftover))
                if not parser.complete():
                    problems.append("incomplete message (at %s, %s)" %
                                    (parser.mode, parser.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(
                    problems)
            else:
                record.content = content_type, parser.get_decoded_message()

    record.write_to(out, gzip=options.gzip)
def writeRecordToTransport(r, t):
    """Replay the HTTP response stored in record ``r`` onto transport ``t``.

    The payload in ``r.content[1]`` is parsed as an HTTP response.  Headers
    describing the original connection/caching are dropped from the replayed
    header set, fresh ``Content-Length`` and ``Connection`` headers are added,
    and every original header is also echoed under an ``X-Archive-Orig-``
    prefix.  The status line, headers, blank line and body are written to
    ``t`` with four ``write`` calls.
    """
    response = ResponseMessage(RequestMessage())
    response.feed(r.content[1])
    response.close()
    payload = response.get_body()

    # Headers that must not be replayed verbatim.
    suppressed = ("connection", "content-length", "cache-control",
                  "accept-ranges", "etag", "last-modified",
                  "transfer-encoding")

    replayed = []
    archived = []
    for key, value in response.header.headers:
        if k_lower_in(key, suppressed) is False:
            replayed.append((key, value))
        archived.append(("X-Archive-Orig-%s" % key, value))
    replayed.append(("Content-Length", "%d" % len(payload)))
    replayed.append(("Connection", "keep-alive"))

    # Emit status line, header block, separator, body.
    status_line = "%s %d %s\r\n" % (response.header.version,
                                    response.header.code,
                                    response.header.phrase)
    header_lines = ["%s: %s" % (key, value)
                    for key, value in replayed + archived]
    t.write(status_line)
    t.write("\r\n".join(header_lines))
    t.write("\r\n\r\n")
    t.write(payload)


def k_lower_in(key, names):
    """Return True when the lower-cased header name ``key`` is in ``names``."""
    return key.lower() in names
def __call__(self, request):
    """Called by HTTPServer to execute the request.

    URIs matching ``WEB_RE`` or ``WEB_VIA_PROXY_RE`` are rewritten and handed
    to ``self.web_handler``.  Anything else is looked up in the WARC
    archives: a hit is replayed with rewritten headers, a miss gets a canned
    404, and the request is then finished.
    """
    routed = (re.match(self.WEB_RE, request.uri)
              or re.match(self.WEB_VIA_PROXY_RE, request.uri))

    if routed:
        # Request for the viewer's own web interface.
        request.host = "warc"
        request.uri = routed.group("uri")
        request.path = request.uri.partition("?")[0]
        self.web_handler.__call__(request)
        return

    lookup_url = canonicalize_url(request.uri)
    with self.proxy_handler.warc_record_for_uri(lookup_url) as record:
        if not record:
            print("Could not find %s in WARC" % request.uri)
            request.write(
                "HTTP/1.0 404 Not Found\r\n"
                "Connection: keep-alive\r\n"
                "Content-Type: text/plain\r\n"
                "Content-Length: 91\r\n"
                "\r\n"
                "This URL is not in any of your archives. Close the WARC viewer to resume normal browsing.\r\n"
            )
        else:
            print("Serving %s from WARC" % request.uri)

            # Parse the archived response.
            response = ResponseMessage(RequestMessage())
            response.feed(record[1].content[1])
            response.close()
            payload = response.get_body()

            # Build the replayed header set; every original header is also
            # mirrored under an X-Archive-Orig- prefix.
            suppressed = ("connection", "content-length", "cache-control",
                          "accept-ranges", "etag", "last-modified",
                          "transfer-encoding")
            replayed = []
            archived = []
            for key, value in response.header.headers:
                if key.lower() not in suppressed:
                    replayed.append((key, value))
                archived.append(("X-Archive-Orig-%s" % key, value))
            replayed.append(("Content-Length", "%d" % len(payload)))
            replayed.append(("Connection", "keep-alive"))

            # Write the response.
            request.write("%s %d %s\r\n" % (response.header.version,
                                            response.header.code,
                                            response.header.phrase))
            request.write("\r\n".join(
                ["%s: %s" % (key, value)
                 for key, value in (replayed + archived)]))
            request.write("\r\n\r\n")
            request.write(payload)

    # NOTE(review): the collapsed source does not show whether finish() sat
    # inside or after the ``with``; it is placed on the proxy branch only,
    # since the web handler is handed the request to deal with itself.
    request.finish()
def extractPayload(record):
    """
    Return the HTTP body of a response record, gunzipped when possible.

    The payload in ``record.content[1]`` is parsed as an HTTP response.  The
    body is run through a gzip decompressor; if zlib rejects it, the body is
    returned unchanged.

    :type record: WarcRecord
    """
    response = ResponseMessage(RequestMessage())
    response.feed(record.content[1])
    response.close()
    raw_body = response.get_body()

    # wbits of 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
    try:
        return inflater.decompress(raw_body)
    except zlib.error:
        # Not gzip data: hand back the body as stored.
        return raw_body
def dump_record(fh, outzip, name=None):
    """Copy every HTTP response body from a WARC reader into a zip archive.

    Each record yielded by ``fh.read_records(limit=None)`` that is a response
    record with the HTTP response content type is parsed, its body stored in
    ``outzip`` under the record URL with the leading ``http://`` or
    ``https://`` removed, and the URL printed to stdout.  Read errors
    reported alongside a missing/other record are written to stderr.

    :param fh: WARC reader exposing ``read_records(limit=None)`` that yields
        ``(offset, record, errors)`` triples.
    :param outzip: object exposing ``writestr(name, data)``.
    :param name: label for the archive in error messages.  Defaults to
        ``fh.name`` when present, else ``"<warc>"``.  (The original code
        referenced an undefined ``name`` here and raised ``NameError`` on the
        first read error.)
    """
    if name is None:
        name = getattr(fh, "name", "<warc>")

    for offset, record, errors in fh.read_records(limit=None):
        if (record and record.type == WarcRecord.RESPONSE
                and record.content[0] == ResponseMessage.CONTENT_TYPE):
            message = ResponseMessage(RequestMessage())
            message.feed(record.content[1])
            message.close()
            outzip.writestr(re.sub(r'^https?://', '', record.url),
                            message.get_body())
            print(record.url)
        elif errors:
            # Keep the whole report on stderr so stdout stays a clean list of
            # the URLs that were written.
            sys.stderr.write("warc errors at %s:%d\n" % (name, offset or 0))
            for e in errors:
                sys.stderr.write("\t %s\n" % (e,))
def parse_http_response(record):
    """Parse the HTTP response held in ``record.content[1]``.

    Warnings are printed to stderr when the parser reports trailing data or
    an incomplete message.  Returns a ``(status code, mime type, message)``
    triple, where the mime type is the first ``Content-Type`` header value
    with any ``;``-separated parameters removed, or ``None`` when the header
    is absent, and ``message`` is the parsed message object itself.
    """
    response = ResponseMessage(RequestMessage())
    trailing = response.feed(record.content[1])
    response.close()

    if trailing:
        print >> sys.stderr, 'warning: trailing data in http response for', record.url
    if not response.complete():
        print >> sys.stderr, 'warning: truncated http response for', record.url

    header = response.header
    content_types = [value for key, value in header.headers
                     if key.lower() == 'content-type']
    # Only the first Content-Type header counts; drop "; charset=..." etc.
    mime_type = content_types[0].split(';')[0] if content_types else None

    return header.code, mime_type, response
def iter_zip(self):
    """Generate the bytes of a zip archive built from the WARC's responses.

    ``self`` is handed to ``ZipFile`` as the output file object, and whatever
    has accumulated in ``self.buffer`` is yielded and cleared after each
    stored response and once more after the archive is closed (presumably
    ``self.write`` appends to ``self.buffer`` -- not visible here).

    For every HTTP response record the body is stored under
    ``self.url_to_filename(record.url)`` with the record date as its
    timestamp, and ``self.files`` maps that filename to the URL.  Read
    errors are collected in ``self.errors``.  A ``files.txt`` index is always
    written; ``errors.txt`` is written only when errors were collected.
    """
    # Label used in error messages.  The original formatted an undefined
    # ``name`` here, which raised NameError on the first read error.
    source_name = getattr(self.archive, "name", "<warc>")

    with ZipFile(self, "w") as outzip:
        for (offset, record, errors) in self.archive.read_records(limit=None):
            if (record and record.type == WarcRecord.RESPONSE
                    and re.sub(r'\s+', '', record.content[0])
                    == ResponseMessage.CONTENT_TYPE):
                message = ResponseMessage(RequestMessage())
                message.feed(record.content[1])
                message.close()

                filename = self.url_to_filename(record.url)
                # record.date is sliced at fixed positions as
                # YYYY-MM-DDTHH:MM:SS into the 6-tuple ZipInfo expects.
                stamp = record.date
                date_time = (int(stamp[0:4]), int(stamp[5:7]),
                             int(stamp[8:10]), int(stamp[11:13]),
                             int(stamp[14:16]), int(stamp[17:19]))
                outzip.writestr(ZipInfo(filename, date_time),
                                message.get_body())
                self.files[filename] = record.url

                # Hand over what the zip writer has produced so far.
                for chunk in self.buffer:
                    yield chunk
                self.buffer = []
            elif errors:
                self.errors.append("warc errors at %s:%d" %
                                   (source_name, offset if offset else 0))
                for e in errors:
                    self.errors.append(e)

        outzip.writestr(
            "files.txt", "\n".join(
                ["%s -> %s" % (v, k) for k, v in self.files.items()]))
        if len(self.errors) > 0:
            outzip.writestr("errors.txt", "\n".join(self.errors))

    # Closing the ZipFile writes the central directory; flush that too.
    for chunk in self.buffer:
        yield chunk
    self.buffer = []
def parse_http_response(record): """Parses the payload of an HTTP 'response' record, returning code, content type and body. Adapted from github's internetarchive/warctools hanzo/warcfilter.py, commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.""" message = ResponseMessage(RequestMessage()) remainder = message.feed(record.content[1]) message.close() if remainder or not message.complete(): if remainder: print 'trailing data in http response for', record.url if not message.complete(): print 'truncated http response for', record.url header = message.header mime_type = [v for k, v in header.headers if k.lower() == b'content-type'] if mime_type: mime_type = mime_type[0].split(b';')[0] else: mime_type = None return header.code, mime_type, message.get_body()
def _init_from_warc_record(self, warc_record):
    """Populate this reply from the HTTP response stored in ``warc_record``.

    Opens the reply read-only and unbuffered, sets its URL from the record,
    copies every parsed response header onto it, sets the status code and
    reason phrase attributes, runs the redirect check, stores the body in
    ``self._data``, and schedules the ``metaDataChanged``, ``readyRead`` and
    ``finished`` signals through zero-delay single-shot timers.
    """
    self._warc_record = warc_record
    self.open(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)
    self.setUrl(QUrl(str_to_qstring(self._warc_record.url)))

    # NOTE(review): the parser is fed but never close()d here -- confirm
    # that get_body() is complete without signalling end of input.
    response = ResponseMessage(RequestMessage())
    response.feed(self._warc_record.content[1])
    header = response.header

    for name, value in header.headers:
        self.setRawHeader(name, value)
    self.setAttribute(QNetworkRequest.HttpStatusCodeAttribute, header.code)
    self.setAttribute(QNetworkRequest.HttpReasonPhraseAttribute,
                      header.phrase)
    self._check_for_redirect(header.code)

    # Signals are deferred via timers rather than emitted synchronously.
    QTimer.singleShot(0, lambda: self.metaDataChanged.emit())
    self._data = response.get_body()
    QTimer.singleShot(0, lambda: self.readyRead.emit())
    QTimer.singleShot(0, lambda: self.finished.emit())
def _http_status_of(record, message, ignore_headers):
    """Return the HTTP status code of a response record, or None if unknown.

    ``message`` is the parser already built while decoding, if any.  When
    there is none and the record content type is an HTTP response, the
    payload is parsed here purely to read its status code; the record itself
    is not modified.
    """
    if message is None:
        content_type, content = record.content
        if content_type != ResponseMessage.CONTENT_TYPE:
            return None
        message = ResponseMessage(RequestMessage(),
                                  ignore_headers=ignore_headers)
        message.feed(content)
        message.close()
    return getattr(message.header, "code", None)


def process(record, previous_record, out, options, found_hrefs):
    """Optionally decode a record's HTTP payload, then write it out.

    Decoding (``options.decode_http``): for a response record whose content
    type is an HTTP request or response message, the payload is parsed.  A
    clean parse replaces the record content with the decoded message and,
    when ``found_hrefs`` is not None and the status is 200, adds every
    ``JSON_HREF_RE`` match (sliced ``[12:-2]``) to ``found_hrefs``.  A failed
    parse prints a diagnostic to stderr and leaves the content untouched.

    Writing: without ``options.strip_404s`` the record is always written.
    With it, request records are held back (the caller passes them in again
    as ``previous_record``), a response with status 404 drops both itself and
    its request, any other response writes ``previous_record`` followed by
    the response, and all other record types are written directly.

    Raises:
        RuntimeError: a non-404 response is being written under
            ``strip_404s`` and ``previous_record`` is missing or is not a
            request record.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()
    # Bound unconditionally: the strip_404s branch below needs the parsed
    # response even when decoding is off or the record was not decodable
    # (previously an UnboundLocalError / AttributeError on None).
    message = None

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content
        if content_type == ResponseMessage.CONTENT_TYPE:
            # technically, a http request needs to know the request to be parsed
            # because responses to head requests don't have a body.
            # we assume we don't store 'head' responses, and plough on
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()
            if not leftover and message.complete():
                content = message.get_decoded_message()
                if found_hrefs is not None and message.header.code == 200:
                    found_hrefs.update(
                        match[12:-2]
                        for match in JSON_HREF_RE.findall(content))
                record.content = content_type, content
            else:
                error = []
                if leftover:
                    error.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    error.append("incomplete message (at %s, %s)" %
                                 (message.mode, message.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(
                    error)

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # We don't write out a request until we confirm its associated
        # response is not 404.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    if _http_status_of(record, message, ignore_headers) == 404:
        # If 404, don't write out either the request or the response.
        return

    if previous_record is None:
        raise RuntimeError(
            "Need to write out previous record as well, but it isn't present"
        )
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" %
                           (previous_record.type, ))
    # Note that if a request is made multiple times, we will only write out
    # the last attempt at it.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)