Example #1
0
def process(record, out, options):
    """Write *record* to *out*, decoding its HTTP payload first if asked to.

    When ``options.decode_http`` is set and the record is a
    ``WarcRecord.RESPONSE``, the payload is fed to the HTTP parser that
    matches its content type.  A payload that parses completely with no
    bytes left over is replaced by the decoded message; otherwise the
    problems are reported on stderr and the record is left as it was.

    ``options.wget_workaround`` selects ``WGET_IGNORE_HEADERS`` as the
    ``ignore_headers`` argument handed to the parsers, and ``options.gzip``
    is forwarded to ``record.write_to``.
    """
    skipped_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, payload = record.content

        parser = None
        if content_type == ResponseMessage.CONTENT_TYPE:
            # Strictly speaking a response can only be parsed with knowledge
            # of its request, because replies to HEAD carry no body.  HEAD
            # responses are assumed not to be stored, so a blank request
            # stands in for the real one.
            parser = ResponseMessage(RequestMessage(),
                                     ignore_headers=skipped_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            parser = RequestMessage(ignore_headers=skipped_headers)

        if parser:
            remainder = parser.feed(payload)
            parser.close()

            if remainder or not parser.complete():
                # Collect every reason the payload could not be decoded.
                problems = []
                if remainder:
                    problems.append("%d bytes unparsed" % len(remainder))
                if not parser.complete():
                    problems.append("incomplete message (at %s, %s)" %
                                    (parser.mode, parser.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(
                    problems)
            else:
                record.content = content_type, parser.get_decoded_message()

    record.write_to(out, gzip=options.gzip)
Example #2
0
def process(record, out, options):
    """Emit *record* to *out*, with optional HTTP decoding of responses.

    Only records of type ``WarcRecord.RESPONSE`` are considered for decoding,
    and only when ``options.decode_http`` is set.  A cleanly parsed payload
    replaces the record's content; a payload that leaves unparsed bytes or
    an incomplete message is reported on stderr and written unchanged.
    ``options.wget_workaround`` picks ``WGET_IGNORE_HEADERS`` as the parsers'
    ``ignore_headers``; ``options.gzip`` is passed through to ``write_to``.
    """
    if options.wget_workaround:
        ignore_headers = WGET_IGNORE_HEADERS
    else:
        ignore_headers = ()

    wants_decode = options.decode_http and record.type == WarcRecord.RESPONSE
    if wants_decode:
        content_type, body = record.content
        http_message = None
        if content_type == ResponseMessage.CONTENT_TYPE:
            # A response parser formally needs its request (HEAD replies have
            # no body); we assume HEAD responses are never stored and supply
            # an empty request instead.
            placeholder_request = RequestMessage()
            http_message = ResponseMessage(placeholder_request, ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            http_message = RequestMessage(ignore_headers=ignore_headers)

        if http_message:
            unparsed = http_message.feed(body)
            http_message.close()
            if not unparsed and http_message.complete():
                decoded = http_message.get_decoded_message()
                record.content = content_type, decoded
            else:
                failures = []
                if unparsed:
                    failures.append("%d bytes unparsed"%len(unparsed))
                if not http_message.complete():
                    where = (http_message.mode, http_message.header.mode)
                    failures.append("incomplete message (at %s, %s)"%where)
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(failures)

    record.write_to(out, gzip=options.gzip)
def process(record, previous_record, out, options, found_hrefs):
    """Optionally decode *record*'s HTTP payload, then write it to *out*.

    Args:
        record: The WARC record to process.
        previous_record: The record seen immediately before *record*.  With
            ``options.strip_404s`` it is written together with a non-404
            response and must then be a ``WarcRecord.REQUEST``.
        out: Destination handed to ``write_to``.
        options: Object exposing ``wget_workaround``, ``decode_http``,
            ``strip_404s`` and ``gzip``.
        found_hrefs: Collector whose ``update`` receives the hrefs matched by
            ``JSON_HREF_RE`` in decoded status-200 bodies, or ``None`` to
            skip collection.

    Raises:
        RuntimeError: With ``options.strip_404s``, when a non-404 response
            has no preceding record or the preceding record is not a request.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # Bound unconditionally: the strip_404s logic below consults the parsed
    # status even when decoding is disabled or no parser matched the payload.
    # Previously that path failed with UnboundLocalError / AttributeError.
    message = None
    status = None

    # Stripping 404s needs the response status, so parse for that option too;
    # the record content is only rewritten when decode_http is set.
    needs_parse = options.decode_http or options.strip_404s
    if needs_parse and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content

        if content_type == ResponseMessage.CONTENT_TYPE:
            # technically, a http request needs to know the request to be parsed
            # because responses to head requests don't have a body.
            # we assume we don't store 'head' responses, and plough on
            message = ResponseMessage(RequestMessage(), ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()
            # NOTE(review): a request-typed payload's header may not expose
            # ``code``; getattr keeps that case from raising.
            status = getattr(message.header, "code", None)

            if not leftover and message.complete():
                if options.decode_http:
                    content = message.get_decoded_message()

                    if found_hrefs is not None and status == 200:
                        found_hrefs.update(
                            match[12:-2] for match in JSON_HREF_RE.findall(content))

                    record.content = content_type, content
            else:
                errors = []
                if leftover:
                    errors.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    errors.append("incomplete message (at %s, %s)" %
                                  (message.mode, message.header.mode))
                # Same text the old print statement emitted, but valid on
                # both Python 2 and Python 3.
                sys.stderr.write("errors decoding http in record %s %s\n" %
                                 (record.id, ",".join(errors)))

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # We don't write out a request until we confirm its associated
        # response is not 404.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    if status == 404:
        # If 404, don't write out either the request or the response.
        return

    if previous_record is None:
        raise RuntimeError("Need to write out previous record as well, but it isn't present")
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" % (previous_record.type,))
    # Note that if a request is made multiple times, we will only write out
    # the last attempt at it.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)
Example #4
0
def process(record, previous_record, out, options, found_hrefs):
    """Optionally decode *record*'s HTTP payload, then write it to *out*.

    Args:
        record: The WARC record to process.
        previous_record: The record seen immediately before *record*.  With
            ``options.strip_404s`` it is written together with a non-404
            response and must then be a ``WarcRecord.REQUEST``.
        out: Destination handed to ``write_to``.
        options: Object exposing ``wget_workaround``, ``decode_http``,
            ``strip_404s`` and ``gzip``.
        found_hrefs: Collector whose ``update`` receives the hrefs matched by
            ``JSON_HREF_RE`` in decoded status-200 bodies, or ``None`` to
            skip collection.

    Raises:
        RuntimeError: With ``options.strip_404s``, when a non-404 response
            has no preceding record or the preceding record is not a request.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # Bound unconditionally: the strip_404s logic below consults the parsed
    # status even when decoding is disabled or no parser matched the payload.
    # Previously that path failed with UnboundLocalError / AttributeError.
    message = None
    status = None

    # Stripping 404s needs the response status, so parse for that option too;
    # the record content is only rewritten when decode_http is set.
    needs_parse = options.decode_http or options.strip_404s
    if needs_parse and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content

        if content_type == ResponseMessage.CONTENT_TYPE:
            # technically, a http request needs to know the request to be parsed
            # because responses to head requests don't have a body.
            # we assume we don't store 'head' responses, and plough on
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()
            # NOTE(review): a request-typed payload's header may not expose
            # ``code``; getattr keeps that case from raising.
            status = getattr(message.header, "code", None)

            if not leftover and message.complete():
                if options.decode_http:
                    content = message.get_decoded_message()

                    if found_hrefs is not None and status == 200:
                        found_hrefs.update(
                            match[12:-2]
                            for match in JSON_HREF_RE.findall(content))

                    record.content = content_type, content
            else:
                errors = []
                if leftover:
                    errors.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    errors.append("incomplete message (at %s, %s)" %
                                  (message.mode, message.header.mode))
                # Same text the old print statement emitted, but valid on
                # both Python 2 and Python 3.
                sys.stderr.write("errors decoding http in record %s %s\n" %
                                 (record.id, ",".join(errors)))

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # We don't write out a request until we confirm its associated
        # response is not 404.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    if status == 404:
        # If 404, don't write out either the request or the response.
        return

    if previous_record is None:
        raise RuntimeError(
            "Need to write out previous record as well, but it isn't present"
        )
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" %
                           (previous_record.type, ))
    # Note that if a request is made multiple times, we will only write out
    # the last attempt at it.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)