def process(record, out, options):
    """Write ``record`` to ``out``, optionally decoding its HTTP payload first.

    When ``options.decode_http`` is set and ``record`` is a response record
    whose content type matches ``ResponseMessage.CONTENT_TYPE`` or
    ``RequestMessage.CONTENT_TYPE``, the payload is fed through the matching
    parser.  If the parser consumes everything and reports the message as
    complete, the record content is replaced by ``get_decoded_message()``;
    otherwise a diagnostic is printed to stderr and the content is left
    untouched.  The record is written in every case, passing
    ``options.gzip`` through to ``write_to``.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content

        parser = None
        if content_type == ResponseMessage.CONTENT_TYPE:
            # Strictly, parsing an HTTP response requires knowing its
            # request, because responses to HEAD requests carry no body.
            # We assume HEAD responses are never stored and plough on.
            parser = ResponseMessage(RequestMessage(),
                                     ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            parser = RequestMessage(ignore_headers=ignore_headers)

        if parser:
            leftover = parser.feed(content)
            parser.close()
            if leftover or not parser.complete():
                problems = []
                if leftover:
                    problems.append("%d bytes unparsed" % len(leftover))
                if not parser.complete():
                    problems.append("incomplete message (at %s, %s)"
                                    % (parser.mode, parser.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(problems)
            else:
                record.content = content_type, parser.get_decoded_message()

    record.write_to(out, gzip=options.gzip)
def parse_http_response_charset(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type, declared character set and body.

    The result is the 4-tuple ``(status code, mime type, charset, body)``.
    ``mime type`` is the first Content-Type header value up to its first
    ``;`` (``None`` when the header is absent).  ``charset`` is the
    lower-cased value of that header's ``charset`` parameter, without any
    surrounding quotes or following parameters (``None`` when not
    declared).  Trailing or truncated data is reported on stdout but does
    not stop parsing.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.
    """
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder:
        print('%s %s' % ('trailing data in http response for', record.url))
    if not message.complete():
        print('%s %s' % ('truncated http response for', record.url))

    header = message.header
    content_types = [v for k, v in header.headers
                     if k.lower() == b'content-type']
    mime_type = None
    charset = None
    if content_types:
        value = content_types[0]
        # Stop at quotes, ';' and whitespace: the parameter value may be a
        # quoted string and may be followed by further parameters, neither
        # of which belongs in the charset name.
        match = re.search(r'''charset\s*=\s*["']?([^\s;"']+)''', value, re.I)
        if match:
            charset = match.group(1).lower()
        mime_type = value.split(b';')[0]
    return header.code, mime_type, charset, message.get_body()
def process(record, out, options):
    """Copy ``record`` to ``out``, decoding an embedded HTTP message on request.

    Decoding only happens when ``options.decode_http`` is true and the
    record type is ``WarcRecord.RESPONSE``.  The payload is parsed as an HTTP
    response or request according to its content type; a fully consumed,
    complete message has its content swapped for ``get_decoded_message()``,
    anything else produces an error line on stderr and leaves the content
    alone.  ``record.write_to`` is always called, with ``options.gzip``.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    wants_decoding = options.decode_http and record.type == WarcRecord.RESPONSE
    if wants_decoding:
        content_type, payload = record.content
        http_message = None

        if content_type == ResponseMessage.CONTENT_TYPE:
            # A response really needs its request to be parsed correctly
            # (responses to HEAD requests have no body); we assume HEAD
            # responses are not stored and carry on regardless.
            http_message = ResponseMessage(
                RequestMessage(), ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            http_message = RequestMessage(ignore_headers=ignore_headers)

        if http_message:
            unparsed = http_message.feed(payload)
            http_message.close()

            if not unparsed and http_message.complete():
                decoded = http_message.get_decoded_message()
                record.content = content_type, decoded
            else:
                complaints = []
                if unparsed:
                    complaints.append("%d bytes unparsed" % len(unparsed))
                if not http_message.complete():
                    position = (http_message.mode, http_message.header.mode)
                    complaints.append(
                        "incomplete message (at %s, %s)" % position)
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(complaints)

    record.write_to(out, gzip=options.gzip)
def process(record, previous_record, out, options, found_hrefs):
    """Decode, scan and conditionally write one WARC record.

    Args:
        record: the record being processed.
        previous_record: the record read immediately before ``record``;
            only consulted when ``options.strip_404s`` is set.
        out: destination handed to ``write_to``.
        options: object providing ``wget_workaround``, ``decode_http``,
            ``strip_404s`` and ``gzip``.
        found_hrefs: object with an ``update`` method that receives the
            hrefs found in decoded 200 responses, or ``None`` to skip the
            scan.

    With ``options.strip_404s`` set, request records are held back and
    written (as ``previous_record``) together with their response, and only
    when that response is not a 404.  The status is taken from the message
    parsed for ``decode_http`` when there is one; otherwise the response is
    parsed here just to read its status.  A response whose payload is not an
    HTTP response has no status and is kept.

    Raises:
        RuntimeError: with ``options.strip_404s`` set, when a response that
            is to be kept is not directly preceded by a request record.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # Bound unconditionally: the strip_404s logic below inspects it, and it
    # would otherwise be undefined whenever decode_http did not parse this
    # record.
    message = None

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content
        if content_type == ResponseMessage.CONTENT_TYPE:
            # Strictly, parsing an HTTP response requires knowing its
            # request, because responses to HEAD requests have no body.
            # We assume HEAD responses are not stored, and plough on.
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()
            if not leftover and message.complete():
                content = message.get_decoded_message()
                if found_hrefs is not None and message.header.code == 200:
                    # Each match is trimmed by 12 leading and 2 trailing
                    # characters before being recorded.
                    found_hrefs.update(
                        match[12:-2]
                        for match in JSON_HREF_RE.findall(content))
                record.content = content_type, content
            else:
                error = []
                if leftover:
                    error.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    error.append("incomplete message (at %s, %s)"
                                 % (message.mode, message.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(error)

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # A request is not written until its associated response has been
        # confirmed not to be a 404.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    if message is None:
        # decode_http did not parse this record; parse it now purely to
        # learn the status code, leaving the record content as it is.
        content_type, content = record.content
        if content_type == ResponseMessage.CONTENT_TYPE:
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
            message.feed(content)
            message.close()
    status = message.header.code if message is not None else None

    if status == 404:
        # If 404, don't write out either the request or the response.
        return

    if previous_record is None:
        raise RuntimeError("Need to write out previous record as well, but it isn't present")
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" % (previous_record.type,))
    # Note that if a request is made multiple times, only the last attempt
    # at it is written out.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)
def parse_http_response(record):
    """Parse the HTTP response stored in ``record``.

    Returns a ``(status code, mime type, message)`` tuple.  ``mime type`` is
    the first Content-Type header value up to its first ``;``, or ``None``
    when the header is absent; ``message`` is the ``ResponseMessage`` used
    for parsing.  Trailing or truncated data is reported as a warning on
    stderr rather than raised.
    """
    response = ResponseMessage(RequestMessage())
    leftover = response.feed(record.content[1])
    response.close()

    if leftover:
        print >> sys.stderr, 'warning: trailing data in http response for', record.url
    if not response.complete():
        print >> sys.stderr, 'warning: truncated http response for', record.url

    header = response.header
    content_types = [value for name, value in header.headers
                     if name.lower() == 'content-type']
    mime_type = content_types[0].split(';')[0] if content_types else None
    return header.code, mime_type, response
def parse_http_response(record):
    """Parse the HTTP response carried by ``record``.

    The payload (``record.content[1]``) is fed to a ``ResponseMessage``.
    Trailing or truncated data is reported on stdout and parsing carries on.
    Returns ``(status code, mime type, message)``, where ``mime type`` is the
    part of the first Content-Type header before any ``;`` and is ``None``
    if the response has no such header.
    """
    parsed = ResponseMessage(RequestMessage())
    unconsumed = parsed.feed(record.content[1])
    parsed.close()

    if unconsumed:
        print('trailing data in http response for %s' % record.url)
    if not parsed.complete():
        print('truncated http response for %s' % record.url)

    header = parsed.header
    mime_type = None
    declared = [v for k, v in header.headers if k.lower() == 'content-type']
    if declared:
        # Drop any parameters (e.g. charset) that follow the media type.
        mime_type = declared[0].split(';')[0]
    return header.code, mime_type, parsed
def parse_http_response_charset(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type and body.

    Despite the name, no character set is returned: the result is the
    3-tuple ``(status code, mime type, body)``.  ``mime type`` is the first
    Content-Type header value up to its first ``;``, or ``None`` when the
    header is absent.  A truncated response is reported on stdout and
    parsing continues.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.

    Raises:
        Exception: if data is left over after the response has been parsed.
    """
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder:
        raise Exception('trailing data in http response for ' + str(record.url))
    if not message.complete():
        # Truncation is tolerated: report it and return what was parsed.
        print('truncated http response for ' + str(record.url))

    header = message.header
    content_types = [v for k, v in header.headers
                     if k.lower() == b'content-type']
    if content_types:
        # Take the media type only.  Indexing the split result works for
        # zero, one or several parameters, whereas unpacking it into two
        # names raised ValueError unless there was exactly one ';'.
        mime_type = content_types[0].split(b';')[0]
    else:
        mime_type = None
    return header.code, mime_type, message.get_body()
def parse_http_response(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type and body.

    The content type is the first Content-Type header value up to its first
    ``;``, or ``None`` when the header is absent.  Trailing or truncated
    data is reported on stdout and parsing carries on.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.
    """
    response = ResponseMessage(RequestMessage())
    leftover = response.feed(record.content[1])
    response.close()

    if leftover:
        print('%s %s' % ('trailing data in http response for', record.url))
    if not response.complete():
        print('%s %s' % ('truncated http response for', record.url))

    header = response.header
    content_types = [value for name, value in header.headers
                     if name.lower() == b'content-type']
    mime_type = content_types[0].split(b';')[0] if content_types else None
    return header.code, mime_type, response.get_body()
def process(record, previous_record, out, options, found_hrefs):
    """Decode, scan and conditionally write one WARC record.

    Args:
        record: the record being processed.
        previous_record: the record read immediately before ``record``;
            only consulted when ``options.strip_404s`` is set.
        out: destination handed to ``write_to``.
        options: object providing ``wget_workaround``, ``decode_http``,
            ``strip_404s`` and ``gzip``.
        found_hrefs: object with an ``update`` method that receives the
            hrefs found in decoded 200 responses, or ``None`` to skip the
            scan.

    With ``options.strip_404s`` set, request records are held back and
    written (as ``previous_record``) together with their response, and only
    when that response is not a 404.  The status comes from the message
    parsed for ``decode_http`` if there is one; otherwise the response is
    parsed here solely to read its status.  A response whose payload is not
    an HTTP response has no status and is kept.

    Raises:
        RuntimeError: with ``options.strip_404s`` set, when a response that
            is to be kept is not directly preceded by a request record.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # Always bound, so the strip_404s logic can test it even when
    # decode_http is off or the payload is not HTTP; without this the name
    # would be undefined (or None) at the status check below.
    message = None

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content
        if content_type == ResponseMessage.CONTENT_TYPE:
            # Strictly, parsing an HTTP response requires knowing its
            # request, because responses to HEAD requests have no body.
            # We assume HEAD responses are not stored, and plough on.
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()
            if not leftover and message.complete():
                content = message.get_decoded_message()
                if found_hrefs is not None and message.header.code == 200:
                    # Each match is trimmed by 12 leading and 2 trailing
                    # characters before being recorded.
                    found_hrefs.update(
                        match[12:-2]
                        for match in JSON_HREF_RE.findall(content))
                record.content = content_type, content
            else:
                error = []
                if leftover:
                    error.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    error.append("incomplete message (at %s, %s)"
                                 % (message.mode, message.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(error)

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # A request is not written until its associated response has been
        # confirmed not to be a 404.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    if message is None:
        # Nothing was parsed above; parse now purely to learn the status
        # code, leaving the record content as it is.
        content_type, content = record.content
        if content_type == ResponseMessage.CONTENT_TYPE:
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
            message.feed(content)
            message.close()
    status = message.header.code if message is not None else None

    if status == 404:
        # If 404, don't write out either the request or the response.
        return

    if previous_record is None:
        raise RuntimeError(
            "Need to write out previous record as well, but it isn't present")
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" % (previous_record.type, ))
    # Note that if a request is made multiple times, only the last attempt
    # at it is written out.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)