Exemple #1
0
class HTTPResponse(object):

    DOC_TYPE_TEXT_OR_HTML = 'DOC_TYPE_TEXT_OR_HTML'
    DOC_TYPE_SWF = 'DOC_TYPE_SWF'
    DOC_TYPE_PDF = 'DOC_TYPE_PDF'
    DOC_TYPE_IMAGE = 'DOC_TYPE_IMAGE'
    DOC_TYPE_OTHER = 'DOC_TYPE_OTHER'

    def __init__(self,
                 code,
                 read,
                 headers,
                 geturl,
                 original_url,
                 msg='OK',
                 _id=None,
                 time=0.2,
                 alias=None,
                 charset=None):
        '''
        Build a HTTP response. The body is stored as received and is only
        decoded when it is requested for the first time.

        :param code: HTTP code
        :param read: HTTP body text; typically a string
        :param headers: HTTP headers, must be a Headers instance
        :param geturl: URL object instance
        :param original_url: URL object instance
        :param msg: HTTP message
        :param _id: Optional response identifier
        :param time: The time between the request and the response
        :param alias: Alias for the response, this contains a hash that helps
                      the backend sqlite find http_responses faster by indexing
                      by this attr.
        :param charset: Response's encoding; obligatory when `read` is unicode
        :raise TypeError: When `geturl` or `original_url` are not URL
                          instances, or `headers` is not a Headers instance.
        '''
        # (value, required type, error message) for each validated parameter
        ctor_type_checks = (
            (geturl, URL,
             'Invalid type %s for HTTPResponse ctor param geturl.'),
            (original_url, URL,
             'Invalid type %s for HTTPResponse ctor param original_url.'),
            (headers, Headers,
             'Invalid type %s for HTTPResponse ctor param headers.'),
        )
        for value, required_type, error_fmt in ctor_type_checks:
            if not isinstance(value, required_type):
                raise TypeError(error_fmt % type(value))

        # Identification and status
        self.id = _id
        self.set_code(code)
        self._msg = msg
        self._time = time
        self._alias = alias
        # From cache defaults to False
        self._from_cache = False

        # Headers: `_info` keeps what was received; `_headers`,
        # `_content_type` and `_doc_type` are filled in by set_headers()
        # the first time they are needed.
        self._info = headers
        self._headers = None
        self._content_type = None
        self._doc_type = None

        # Body: `_raw_body` is what was received; `_body`, `_dom` and
        # `_clear_text_body` are computed on demand.
        self._charset = charset
        self._raw_body = read
        self._body = None
        self._dom = None
        self._clear_text_body = None

        # The URL that we really GET'ed
        self._uri = original_url
        self._realurl = original_url.uri2url()
        # The URL where we were redirected to (equal to original_url
        # when no redirect)
        self._redirected_url = geturl
        self._redirected_uri = geturl.uri2url()

    @classmethod
    def from_httplib_resp(cls, httplibresp, original_url=None):
        '''
        Factory function. Build a HTTPResponse object from a
        httplib.HTTPResponse instance.

        :param httplibresp: httplib.HTTPResponse instance
        :param original_url: Optional 'url_object' instance. When it is not
                             given, the URL reported by the response is used
                             in its place.

        :return: A HTTPResponse instance
        '''
        status_code = httplibresp.code
        reason = httplibresp.msg
        received_headers = httplibresp.info()
        raw_body = httplibresp.read()
        headers_inst = Headers(received_headers.items())

        if original_url:
            # Reuse the encoding of the requested URL for the final one
            final_url = URL(httplibresp.geturl(), original_url.encoding)
            final_url = final_url.url_decode()
        else:
            final_url = original_url = URL(httplibresp.geturl())

        return cls(status_code,
                   raw_body,
                   headers_inst,
                   final_url,
                   original_url,
                   reason,
                   charset=getattr(httplibresp, 'encoding', None))

    @classmethod
    def from_dict(cls, unserialized_dict):
        '''
        * msgpack is MUCH faster than cPickle,
        * msgpack can't serialize python objects,
        * I have to create a dict representation of HTTPResponse to serialize it,
        * and a from_dict to have the object back

        The 'code', 'msg', 'headers', 'body', 'time', 'id' and 'uri' keys are
        required. An optional 'charset' key is forwarded to the constructor:
        a body which is already decoded (unicode) can't be handled by
        _charset_handling() without it.

        :param unserialized_dict: A dict just as returned by to_dict()
        :return: A HTTPResponse instance; the 'uri' value is used both as the
                 original and as the final URL.
        :raise KeyError: When one of the required keys is missing.
        '''
        udict = unserialized_dict

        code, msg, hdrs = udict['code'], udict['msg'], udict['headers']
        body, _time, _id = udict['body'], udict['time'], udict['id']
        # Dicts without this key get charset=None, the constructor default
        charset = udict.get('charset')

        headers_inst = Headers(hdrs.items())
        url = URL(udict['uri'])

        return cls(code,
                   body,
                   headers_inst,
                   url,
                   url,
                   msg=msg,
                   _id=_id,
                   time=_time,
                   charset=charset)

    def to_dict(self):
        '''
        :return: A dict that represents the current object and is serializable
                 by the json or msgpack modules.
        '''
        serializable_dict = {}
        sdict = serializable_dict

        # Note: The Headers() object can be serialized by msgpack because it
        #       inherits from dict() and doesn't mangle it too much
        sdict['code'], sdict['msg'], sdict['headers'] = (self.get_code(),
                                                         self.get_msg(),
                                                         self.get_headers())
        sdict['body'], sdict['time'], sdict['id'] = (self.get_body(),
                                                     self.get_wait_time(),
                                                     self.get_id())

        sdict['uri'] = self.get_uri().url_string

        return serializable_dict

    def __contains__(self, string_to_test):
        '''
        Determine if the `string_to_test` is contained by the HTTP response
        body. Makes `string_to_test in http_response` work.

        :param string_to_test: String to look for in the body
        :return: The result of `string_to_test in self.body`
        '''
        return string_to_test in self.body

    def __eq__(self, other):
        return self.id == other.id and self._code == other._code and \
               self.headers == other.headers and self.body == other.body and \
               self._uri == other._uri

    def __repr__(self):

        vals = {
            'code': self.get_code(),
            'url': str(self.get_url()),
            'id': self.id and ' | id:%s' % self.id or '',
            'fcache': self._from_cache and ' | fromCache:True' or ''
        }
        return '<HTTPResponse | %(code)s | %(url)s%(id)s%(fcache)s>' % vals

    def set_id(self, _id):
        '''
        :param _id: The new value for the `id` attribute of this response.
        '''
        self.id = _id

    def get_id(self):
        '''
        :return: The response identifier kept in the `id` attribute.
        '''
        return self.id

    def set_code(self, code):
        '''
        :param code: The HTTP code to store for this response.
        '''
        self._code = code

    def get_code(self):
        '''
        :return: The HTTP code of this response.
        '''
        return self._code

    def get_body(self):
        if self._body is None:
            self._body, self._charset = self._charset_handling()
            # Free 'raw_body'
            self._raw_body = None
        return self._body

    def set_body(self, body):
        '''
        Setter for body.

        @body: A string that represents the body of the HTTP response
        '''
        self._body = None
        self._raw_body = body

    body = property(get_body, set_body)

    def get_clear_text_body(self):
        '''
        :return: A clear text representation of the HTTP response body.
        '''
        clear_text_body = self._clear_text_body

        if clear_text_body is None:

            # Calculate the clear text body
            dom = self.get_dom()
            if dom is not None:
                clear_text_body = ''.join(dom.itertext())
            else:
                clear_text_body = ANY_TAG_MATCH.sub('', self.get_body())

            self._clear_text_body = clear_text_body

        return clear_text_body

    def set_dom(self, dom_inst):
        '''
        Store an already calculated DOM so get_dom() doesn't have to parse
        the body again.

        This setter is part of the performance improvement mentioned in
        get_dom() and sgmlParser._parse(). It is called from
        sgmlParser._parse(); without it the DOM would be calculated twice
        when the code runs:
            sgmlParser( http_response )
            ...
            http_response.get_dom()

        We still need to figure out how to solve the other issue, which
        should aim to avoid the double DOM generation when:
            http_response.get_dom()
            ...
            sgmlParser( http_response )

        :param dom_inst: The DOM to be returned by get_dom()
        :return: None
        '''
        self._dom = dom_inst

    def get_dom(self):
        '''
        I don't want to calculate the DOM for all responses, only for those
        which are needed. This method will first calculate the DOM, and then
        save it for upcoming calls.

        @see: TODO: Potential performance improvement in sgmlParser._parse()
                    for ideas on how to reduce CPU usage.

        :return: The DOM, or None if the HTML normalization failed.
        '''
        if self._dom is None:
            try:
                parser = etree.HTMLParser(recover=True)
                self._dom = etree.fromstring(self.body, parser)
            except Exception:
                msg = ('The HTTP body for "%s" could NOT be parsed by lxml.' %
                       self.get_url())
                om.out.debug(msg)
        return self._dom

    def get_charset(self):
        if not self._charset:
            self._body, self._charset = self._charset_handling()
            # Free 'raw_body'
            self._raw_body = None
        return self._charset

    def set_charset(self, charset):
        '''
        :param charset: The charset to store for this response. A body that
                        was already decoded is not decoded again.
        '''
        self._charset = charset

    charset = property(get_charset, set_charset)

    def set_redir_url(self, ru):
        '''
        :param ru: The new redirect URL. The constructor initializes this
                   value with its `geturl` parameter.
        '''
        self._redirected_url = ru

    def get_redir_url(self):
        '''
        :return: The redirect URL: the `geturl` constructor parameter unless
                 it was changed with set_redir_url().
        '''
        return self._redirected_url

    def set_redir_uri(self, ru):
        '''
        :param ru: The new redirect URI. The constructor initializes this
                   value with `geturl.uri2url()`.
        '''
        self._redirected_uri = ru

    def get_redir_uri(self):
        '''
        :return: The redirect URI: `geturl.uri2url()` as calculated by the
                 constructor unless it was changed with set_redir_uri().
        '''
        return self._redirected_uri

    def get_headers(self):
        if self._headers is None:
            self.headers = self._info
            assert self._headers is not None
        return self._headers

    def set_headers(self, headers):
        '''
        Sets the headers and also analyzes them in order to get the response
        mime type (text/html , application/pdf, etc).

        :param headers: The headers dict.
        '''
        # Fix lowercase in header names from HTTPMessage
        if isinstance(headers, httplib.HTTPMessage):
            self._headers = Headers()
            for header in headers.headers:
                key, value = header.split(':', 1)
                self._headers[key.strip()] = value.strip()
        else:
            self._headers = headers

        # Set the type, for easy access.
        self._doc_type = HTTPResponse.DOC_TYPE_OTHER
        find_word = lambda w: content_type.find(w) != -1

        content_type_hvalue, _ = self._headers.iget('content-type', None)

        # we need exactly content type but not charset
        if content_type_hvalue is not None:
            try:
                self._content_type = content_type_hvalue.split(';', 1)[0]
            except:
                msg = 'Invalid Content-Type value "%s" sent in HTTP response.'
                om.out.debug(msg % (content_type_hvalue, ))
            else:
                content_type = self._content_type.lower()

                # Set the doc_type
                if content_type.count('image'):
                    self._doc_type = HTTPResponse.DOC_TYPE_IMAGE

                elif content_type.count('pdf'):
                    self._doc_type = HTTPResponse.DOC_TYPE_PDF

                elif content_type.count('x-shockwave-flash'):
                    self._doc_type = HTTPResponse.DOC_TYPE_SWF

                elif any(
                        imap(find_word,
                             ('text', 'html', 'xml', 'txt', 'javascript'))):
                    self._doc_type = HTTPResponse.DOC_TYPE_TEXT_OR_HTML

    headers = property(get_headers, set_headers)

    def get_lower_case_headers(self):
        '''
        Build a copy of the headers where only the header names are
        modified. If the original headers were:
            {'Abc-Def': 'F00N3s'}
        This will return:
            {'abc-def': 'F00N3s'}

        :return: A new Headers instance with lower case names.
        '''
        lowered = {}
        for name, value in self.headers.iteritems():
            lowered[name.lower()] = value

        return Headers(lowered.items())

    def set_url(self, url):
        '''
        Set the URL of this response. Only the value returned by get_url()
        is updated (with `url.uri2url()`), the URI is left untouched.

        :param url: URL object instance
        :raise TypeError: When `url` is not a URL instance.

        >>> url = URL('http://www.google.com')
        >>> r = HTTPResponse(200, '' , Headers(), url, url)
        >>> r.set_url('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URL of a HTTPResponse object must be of url.URL type.
        >>> r.set_url(url)
        >>> r.get_url() == url
        True
        '''
        if not isinstance(url, URL):
            raise TypeError('The URL of a HTTPResponse object must be of '
                            'url.URL type.')

        self._realurl = url.uri2url()

    def get_url(self):
        '''
        :return: The URL of this response: the result of calling uri2url()
                 on the URL object received by the constructor, set_url()
                 or set_uri().
        '''
        return self._realurl

    def set_uri(self, uri):
        '''
        Set the URI of this response. The value returned by get_url() is
        updated too, using `uri.uri2url()`.

        :param uri: URL object instance
        :raise TypeError: When `uri` is not a URL instance.

        >>> uri = URL('http://www.google.com/')
        >>> r = HTTPResponse(200, '' , Headers(), uri, uri)
        >>> r.set_uri('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URI of a HTTPResponse object must be of url.URL type.
        >>> r.set_uri(uri)
        >>> r.get_uri() == uri
        True

        '''
        if not isinstance(uri, URL):
            raise TypeError('The URI of a HTTPResponse object must be of '
                            'url.URL type.')

        self._uri = uri
        self._realurl = uri.uri2url()

    def get_uri(self):
        '''
        :return: The URI of this response: the `original_url` constructor
                 parameter unless it was changed with set_uri().
        '''
        return self._uri

    def was_redirected(self):
        '''
        :return: True when the URI of this response is different from the
                 redirect URI.

        NOTE(review): the constructor keeps the complete `original_url` in
        `_uri` but `geturl.uri2url()` in `_redirected_uri`. If uri2url()
        removes a part of the URL (presumably the query string - confirm)
        this also returns True for requests which were not redirected.
        '''
        return self._uri != self._redirected_uri

    def set_from_cache(self, fcache):
        '''
        :param fcache: True if this response was obtained from the
                       local cache. The constructor sets it to False.
        '''
        self._from_cache = fcache

    def get_from_cache(self):
        '''
        :return: True if this response was obtained from the local cache,
                 as informed with set_from_cache().
        '''
        return self._from_cache

    def set_wait_time(self, t):
        '''
        :param t: The time between the request and the response.
        '''
        self._time = t

    def get_wait_time(self):
        '''
        :return: The time between the request and the response.
        '''
        return self._time

    def set_alias(self, alias):
        '''
        :param alias: The alias for this response (see the constructor
                      documentation for its purpose).
        '''
        self._alias = alias

    def get_alias(self):
        '''
        :return: The alias of this response.
        '''
        return self._alias

    def info(self):
        '''
        :return: The headers object exactly as it was received by the
                 constructor.
        '''
        return self._info

    def get_status_line(self):
        '''
        :return: The status-line of the response: 'HTTP/1.1', the code and
                 the message separated by SP and finished with CRLF.
        '''
        status_line_parts = ('HTTP/1.1', SP, str(self._code), SP,
                             self._msg, CRLF)
        return ''.join(status_line_parts)

    def get_msg(self):
        '''
        :return: The HTTP message received by the constructor ('OK' by
                 default).
        '''
        return self._msg

    def _charset_handling(self):
        '''
        Decode the body based on the header (or metadata) encoding.
        The implemented algorithm follows the encoding detection logic
        used by FF:

            1) First try to find a charset using the following search criteria:
                a) Look in the 'content-type' HTTP header. Example:
                    content-type: text/html; charset=iso-8859-1
                b) Look in the 'meta' HTML header. Example:
                    <meta .* content="text/html; charset=utf-8" />
                c) Determine the charset using the chardet module (TODO)
                d) Use the DEFAULT_CHARSET

            2) Try to decode the body using the found charset. If it fails,
            then force it to use the DEFAULT_CHARSET

        Finally return the unicode (decoded) body and the used charset.

        Note: If the body is already a unicode string return it as it is.
        Bodies of responses without a content-type header, or which are not
        text or html, are returned without decoding.

        :return: A (body, charset) tuple.
        '''
        def search_ignoring_case(compiled_re, text):
            # The second positional argument of a compiled pattern's search()
            # is the start position, not the flags: passing re.I there makes
            # the search start at offset 2 and doesn't ignore the case. Use
            # a case insensitive version of the same pattern instead.
            # Assumes CHARSET_EXTRACT_RE / CHARSET_META_RE are compiled `re`
            # patterns - TODO confirm.
            insensitive_re = re.compile(compiled_re.pattern,
                                        compiled_re.flags | re.IGNORECASE)
            return insensitive_re.search(text)

        lcase_headers = self.get_lower_case_headers()
        charset = self._charset
        rawbody = self._raw_body

        # Only try to decode <str> strings. Skip <unicode> strings
        if type(rawbody) is unicode:
            _body = rawbody
            assert charset is not None, (
                "HTTPResponse objects containing "
                "unicode body must have an associated charset")
        elif 'content-type' not in lcase_headers:
            _body = rawbody
            charset = DEFAULT_CHARSET

            if len(_body):
                msg = "The remote web server failed to send the 'content-type'"\
                      " header in HTTP response with id %s" % self.id
                om.out.debug(msg)

        elif not self.is_text_or_html():
            # Not text, save as it is.
            _body = rawbody
            charset = charset or DEFAULT_CHARSET
        else:
            # Figure out charset to work with
            if not charset:
                # Start with the headers
                charset_mo = search_ignoring_case(
                    CHARSET_EXTRACT_RE, lcase_headers['content-type'])
                if charset_mo:
                    # Seems like the response's headers contain a charset
                    charset = charset_mo.groups()[0].lower().strip()
                else:
                    # Continue with the body's meta tag
                    charset_mo = search_ignoring_case(CHARSET_META_RE,
                                                      rawbody)
                    if charset_mo:
                        charset = charset_mo.groups()[0].lower().strip()
                    else:
                        charset = DEFAULT_CHARSET

            # Now that we have the charset, we use it!
            # The return value of the decode function is a unicode string.
            try:
                _body = smart_unicode(rawbody,
                                      charset,
                                      errors=ESCAPED_CHAR,
                                      on_error_guess=False)
            except LookupError:
                # Warn about a buggy charset; the message has to show the
                # charset which is really used from now on.
                msg = ('Charset LookupError: unknown charset: %s; '
                       'ignored and set to default: %s' %
                       (charset, DEFAULT_CHARSET))
                om.out.debug(msg)
                # Forcing it to use the default
                charset = DEFAULT_CHARSET
                _body = smart_unicode(rawbody,
                                      charset,
                                      errors=ESCAPED_CHAR,
                                      on_error_guess=False)

        return _body, charset

    @property
    def content_type(self):
        '''
        The content type of the response: the value of the content-type
        header up to the first ';' (the charset is not included).

        While no content type is known the headers received by the
        constructor are processed again on each access.

        :return: The content type, or '' when it isn't available.
        '''
        if self._content_type is None:
            self.headers = self._info
        return self._content_type or ''

    @property
    def doc_type(self):
        '''
        The document type is calculated by set_headers(); the first access
        triggers the processing of the headers received by the constructor
        when that didn't happen before.

        :return: One of the DOC_TYPE_* constants.
        '''
        if self._doc_type is None:
            self.headers = self._info
            assert self._doc_type is not None
        return self._doc_type

    def is_text_or_html(self):
        '''
        :return: True if this response is text or html, i.e. its document
                 type is DOC_TYPE_TEXT_OR_HTML.
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_TEXT_OR_HTML

    def is_pdf(self):
        '''
        :return: True if this response is a PDF file, i.e. its document
                 type is DOC_TYPE_PDF.
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_PDF

    def is_swf(self):
        '''
        :return: True if this response is a SWF file, i.e. its document
                 type is DOC_TYPE_SWF.
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_SWF

    def is_image(self):
        '''
        :return: True if this response is an image file, i.e. its document
                 type is DOC_TYPE_IMAGE.
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_IMAGE

    def dump_response_head(self):
        '''
        Join the status line and the headers. A unicode result is encoded
        using the charset of the response.

        :return: A string with the status line (version, code and message)
                 followed by the headers, for example:
            HTTP/1.1 200 OK
            Header1: Value1
            Header2: Value2
        '''
        dump_head = "%s%s" % (self.get_status_line(), self.dump_headers())
        if type(dump_head) is unicode:
            dump_head = dump_head.encode(self.charset)
        return dump_head

    def dump(self):
        '''
        Return a DETAILED str representation of this HTTP response object:
        the response head, a CRLF and the body.

        A unicode body is encoded with DEFAULT_CHARSET (not with the charset
        of the response), replacing the characters which can't be encoded.

        :return: The string representation of the response.
        '''
        body = self.body
        # Images, pdf and binary responses in general are never decoded
        # to unicode
        if isinstance(body, unicode):
            body = body.encode(DEFAULT_CHARSET, 'replace')
        return "%s%s%s" % (self.dump_response_head(), CRLF, body)

    def dump_headers(self):
        '''
        :return: a str representation of the headers.
        '''
        if self.headers:
            return CRLF.join(h + ': ' + hv
                             for h, hv in self.headers.items()) + CRLF
        else:
            return ''

    def copy(self):
        '''
        :return: A deep copy of this response, created with copy.deepcopy().
        '''
        return copy.deepcopy(self)
Exemple #2
0
def HTTPRequestParser(head, postdata):
    '''
    This function parses HTTP Requests from a string to a FuzzableRequest.

    :param head: The head of the request.
    :param postdata: The post data of the request
    :return: A FuzzableRequest object with all the corresponding information
        that was sent in head and postdata
    :raise w3afException: When the head is empty, the first line doesn't
        have at least three tokens, a header line has no ':' or the URI is
        rejected with a ValueError.

    NOTE(review): the code visible here stops after building the URL object:
    `postdata` is not used and nothing is returned. The creation of the
    FuzzableRequest presumably follows and is missing - confirm.

    :author: Andres Riancho ([email protected])

    '''
    # Parse the request head, the strip() helps us deal with the \r (if any).
    # The emptiness check is done after stripping: a line which only contains
    # "\r" (like the one that ends the head) must not become a header.
    splitted_head = [h.strip() for h in head.split('\n')]
    splitted_head = [h for h in splitted_head if h]

    if not splitted_head:
        msg = 'The HTTP request is invalid.'
        raise w3afException(msg)

    # Get method, uri, version
    method_uri_version = splitted_head[0]
    first_line = method_uri_version.split(' ')
    if len(first_line) < 3:
        msg = 'The HTTP request has an invalid <method> <uri> <version> token: "'
        msg += method_uri_version + '".'
        raise w3afException(msg)

    if len(first_line) == 3:
        # Ok, we have something like "GET /foo HTTP/1.0". This is the best
        # case for us!
        method, uri, version = first_line
    else:
        # GET /hello world.html HTTP/1.0
        # Mostly because we are permissive... we are going to try to parse
        # the request...
        method = first_line[0]
        version = first_line[-1]
        uri = ' '.join(first_line[1:-1])

    check_version_syntax(version)

    # If we got here, we have a nice method, uri, version first line
    # Now we parse the headers (easy!) and finally we send the request
    headers_str = splitted_head[1:]
    headers_inst = Headers()
    for header in headers_str:
        one_splitted_header = header.split(':', 1)
        if len(one_splitted_header) == 1:
            msg = 'The HTTP request has an invalid header: "%s".'
            raise w3afException(msg % header)

        header_name = one_splitted_header[0].strip()
        header_value = one_splitted_header[1].strip()
        if header_name in headers_inst:
            # Repeated headers are merged in one comma separated value
            headers_inst[header_name] += ', ' + header_value
        else:
            headers_inst[header_name] = header_value

    host, _ = headers_inst.iget('host', None)

    try:
        uri = URL(check_uri_syntax(uri, host))
    except ValueError as ve:
        raise w3afException(str(ve))
Exemple #3
0
class HTTPResponse(object):

    DOC_TYPE_TEXT_OR_HTML = 'DOC_TYPE_TEXT_OR_HTML'
    DOC_TYPE_SWF = 'DOC_TYPE_SWF'
    DOC_TYPE_PDF = 'DOC_TYPE_PDF'
    DOC_TYPE_IMAGE = 'DOC_TYPE_IMAGE'
    DOC_TYPE_OTHER = 'DOC_TYPE_OTHER'

    def __init__(self, code, read, headers, geturl, original_url,
                 msg='OK', _id=None, time=0.2, alias=None, charset=None):
        '''
        :param code: HTTP code
        :param read: HTTP body text; typically a string
        :param headers: HTTP headers, typically a dict or a httplib.HTTPMessage
        :param geturl: URL object instance
        :param original_url: URL object instance
        :param msg: HTTP message
        :param id: Optional response identifier
        :param time: The time between the request and the response
        :param alias: Alias for the response, this contains a hash that helps
                      the backend sqlite find http_responses faster by indexing
                      by this attr.
        :param charset: Response's encoding; obligatory when `read` is unicode
        '''
        if not isinstance(geturl, URL):
            raise TypeError('Invalid type %s for HTTPResponse ctor param geturl.'
                            % type(geturl))

        if not isinstance(original_url, URL):
            raise TypeError('Invalid type %s for HTTPResponse ctor param original_url.'
                            % type(original_url))

        if not isinstance(headers, Headers):
            raise TypeError('Invalid type %s for HTTPResponse ctor param headers.'
                            % type(headers))
        
        if not isinstance(read, basestring):
            raise TypeError('Invalid type %s for HTTPResponse ctor param read.'
                            % type(read))

        self._charset = charset
        self._headers = None
        self._body = None
        self._raw_body = read
        self._content_type = None
        self._dom = None
        self._clear_text_body = None
        # A unique id identifier for the response
        self.id = _id
        # From cache defaults to False
        self._from_cache = False
        # Set the info
        self._info = headers
        # Set code
        self.set_code(code)

        # Set the URL variables
        # The URL that we really GET'ed
        self._realurl = original_url.uri2url()
        self._uri = original_url
        # The URL where we were redirected to (equal to original_url
        # when no redirect)
        self._redirected_url = geturl
        self._redirected_uri = geturl.uri2url()

        # Set the rest
        self._msg = msg
        self._time = time
        self._alias = alias
        self._doc_type = None
        
        # Internal lock
        self._body_lock = threading.RLock()

    @classmethod
    def from_httplib_resp(cls, httplibresp, original_url=None):
        '''
        Factory function. Build a HTTPResponse object from a httplib.HTTPResponse
        instance
    
        :param httplibresp: httplib.HTTPResponse instance
        :param original_url: Optional 'url_object' instance.
    
        :return: A HTTPResponse instance
        '''
        resp = httplibresp
        code, msg, hdrs, body = (resp.code, resp.msg, resp.info(), resp.read())
        hdrs = Headers(hdrs.items())
    
        if original_url:
            url_inst = URL(resp.geturl(), original_url.encoding)
            url_inst = url_inst.url_decode()
        else:
            url_inst = original_url = URL(resp.geturl())
    
        charset = getattr(resp, 'encoding', None)
        return cls(code, body, hdrs, url_inst, original_url,
                   msg, charset=charset)

    @classmethod    
    def from_dict(cls, unserialized_dict):
        '''
        * msgpack is MUCH faster than cPickle,
        * msgpack can't serialize python objects,
        * I have to create a dict representation of HTTPResponse to serialize it,
        * and a from_dict to have the object back
        
        :param unserialized_dict: A dict just as returned by to_dict()
        '''
        udict = unserialized_dict
        
        code, msg, hdrs = udict['code'], udict['msg'], udict['headers']
        body, _time, _id = udict['body'], udict['time'], udict['id']
        
        headers_inst = Headers(hdrs.items())
        url = URL(udict['uri'])
    
        return cls(code, body, headers_inst, url, url, msg=msg, _id=_id,
                   time=_time)

    def to_dict(self):
        '''
        :return: A dict that represents the current object and is serializable
                 by the json or msgpack modules.
        '''
        serializable_dict = {}
        sdict = serializable_dict
        
        # Note: The Headers() object can be serialized by msgpack because it
        #       inherits from dict() and doesn't mangle it too much
        sdict['code'], sdict['msg'], sdict['headers'] = (self.get_code(),
                                                         self.get_msg(),
                                                         self.get_headers())
        sdict['body'], sdict['time'], sdict['id'] = (self.get_body(),
                                                     self.get_wait_time(),
                                                     self.get_id())
        
        sdict['uri'] = self.get_uri().url_string
    
        return serializable_dict

    def __contains__(self, string_to_test):
        '''
        Determine if the `string_to_test` is contained by the HTTP response
        body.

        :param string_to_test: String to look for in the body
        '''
        return string_to_test in self.body
    
    def __eq__(self, other):
        return self.id == other.id and self._code == other._code and \
               self.headers == other.headers and self.body == other.body and \
               self._uri == other._uri

    def __repr__(self):

        vals = {
            'code': self.get_code(),
            'url': str(self.get_url()),
            'id': self.id and ' | id:%s' % self.id or '',
            'fcache': self._from_cache and ' | fromCache:True' or ''
        }
        return '<HTTPResponse | %(code)s | %(url)s%(id)s%(fcache)s>' % vals

    def set_id(self, _id):
        self.id = _id

    def get_id(self):
        return self.id

    def set_code(self, code):
        self._code = code

    def get_code(self):
        return self._code

    def get_body(self):
        with self._body_lock:
            if self._body is None:
                self._body, self._charset = self._charset_handling()
                # Free 'raw_body'
                self._raw_body = None
            return self._body

    def set_body(self, body):
        '''
        Setter for body.

        @body: A string that represents the body of the HTTP response
        '''
        if not isinstance(body, basestring):
            msg = 'Invalid type %s for set_body parameter body.'
            raise TypeError(msg % type(body))
            
        self._body = None
        self._raw_body = body
    
    body = property(get_body, set_body)

    def get_clear_text_body(self):
        '''
        :return: A clear text representation of the HTTP response body.
        '''
        clear_text_body = self._clear_text_body

        if clear_text_body is None:
            
            # Calculate the clear text body
            dom = self.get_dom()
            if dom is not None:
                clear_text_body = ''.join(dom.itertext())
            else:
                clear_text_body = ANY_TAG_MATCH.sub('', self.get_body())
            
            self._clear_text_body = clear_text_body

        return clear_text_body

    def set_dom(self, dom_inst):
        '''
        This setter is part of a performance improvement I'm talking about in
        get_dom() and sgmlParser._parse().

        Without this set_dom() which is called from sgmlParser._parse() when the
        code runs:
            sgmlParser( http_response )
            ...
            http_response.get_dom()

        The DOM is calculated twice.

        We still need to figure out how to solve the other issue which should
        aim to avoid the double DOM generation when:
            http_response.get_dom()
            ...
            sgmlParser( http_response )

        :return: None
        '''
        self._dom = dom_inst

    def get_dom(self):
        '''
        I don't want to calculate the DOM for all responses, only for those
        which are needed. This method will first calculate the DOM, and then
        save it for upcoming calls.

        @see: TODO: Potential performance improvement in sgmlParser._parse()
                    for ideas on how to reduce CPU usage.

        :return: The DOM, or None if the HTML normalization failed.
        '''
        if self._dom is None:
            try:
                parser = etree.HTMLParser(recover=True)
                self._dom = etree.fromstring(self.body, parser)
            except Exception:
                msg = ('The HTTP body for "%s" could NOT be parsed by lxml.'
                       % self.get_url())
                om.out.debug(msg)
        return self._dom

    def get_charset(self):
        if not self._charset:
            self._body, self._charset = self._charset_handling()
            # Free 'raw_body'
            self._raw_body = None
        return self._charset

    def set_charset(self, charset):
        self._charset = charset
    
    charset = property(get_charset, set_charset)
    
    def set_redir_url(self, ru):
        self._redirected_url = ru

    def get_redir_url(self):
        '''
        :return: The redirected URL previously stored with set_redir_url().
        '''
        return self._redirected_url

    def set_redir_uri(self, ru):
        '''
        :param ru: The value to store as the redirected URI
                   (`self._redirected_uri`); was_redirected() compares it
                   against the response URI.
        '''
        self._redirected_uri = ru

    def get_redir_uri(self):
        '''
        :return: The redirected URI previously stored with set_redir_uri().
        '''
        return self._redirected_uri

    def get_headers(self):
        if self._headers is None:
            self.headers = self._info
            assert self._headers is not None
        return self._headers

    def set_headers(self, headers):
        '''
        Sets the headers and also analyzes them in order to get the response
        mime type (text/html , application/pdf, etc).

        Besides storing the headers in `self._headers`, this method updates
        `self._doc_type` (one of the DOC_TYPE_* constants) and, when a
        content-type header is present, `self._content_type` (the header
        value without parameters such as the charset).

        :param headers: The headers object. When it is a httplib.HTTPMessage
                        the raw header lines are parsed again in order to
                        keep the original case of the header names.
        :return: None
        '''
        # Fix lowercase in header names from HTTPMessage
        if isinstance(headers, httplib.HTTPMessage):
            self._headers = Headers()
            last_name = None
            last_value = None

            for raw_line in headers.headers:
                # HTTPMessage.headers is the list of raw lines, so folded
                # (continuation) lines show up as separate items which start
                # with whitespace. They belong to the previous header and
                # must not be parsed as a "name: value" pair.
                if raw_line[:1] in (' ', '\t'):
                    if last_name is not None:
                        folded = '%s %s' % (last_value, raw_line.strip())
                        last_value = folded.strip()
                        self._headers[last_name] = last_value
                    continue

                name, separator, value = raw_line.partition(':')
                if not separator:
                    # Malformed line without a colon: ignore it instead of
                    # failing to process the whole response
                    continue

                last_name = name.strip()
                last_value = value.strip()
                self._headers[last_name] = last_value
        else:
            self._headers = headers

        # Set the type, for easy access.
        self._doc_type = HTTPResponse.DOC_TYPE_OTHER

        content_type_hvalue, _ = self._headers.iget('content-type', None)

        if content_type_hvalue is None:
            return

        # we need exactly content type but not charset
        try:
            self._content_type = content_type_hvalue.split(';', 1)[0]
        except Exception:
            # Not a bare except, so KeyboardInterrupt and SystemExit are
            # allowed to propagate
            msg = 'Invalid Content-Type value "%s" sent in HTTP response.'
            om.out.debug(msg % (content_type_hvalue,))
            return

        content_type = self._content_type.lower()

        # Set the doc_type
        if 'image' in content_type:
            self._doc_type = HTTPResponse.DOC_TYPE_IMAGE

        elif 'pdf' in content_type:
            self._doc_type = HTTPResponse.DOC_TYPE_PDF

        elif 'x-shockwave-flash' in content_type:
            self._doc_type = HTTPResponse.DOC_TYPE_SWF

        elif any(word in content_type for word in
                 ('text', 'html', 'xml', 'txt', 'javascript')):
            self._doc_type = HTTPResponse.DOC_TYPE_TEXT_OR_HTML

    headers = property(get_headers, set_headers)
    
    def get_lower_case_headers(self):
        '''
        Build a copy of the headers where only the header names are changed
        to lower case. If the original headers were:
            {'Abc-Def': 'F00N3s'}
        This will return:
            {'abc-def': 'F00N3s'}

        :return: A new Headers instance; the values are left untouched.
        '''
        lowered = {}

        for name, value in self.headers.iteritems():
            lowered[name.lower()] = value

        return Headers(lowered.items())

    def set_url(self, url):
        '''
        Set the URL of this response; the result of url.uri2url() is what
        gets stored.

        >>> url = URL('http://www.google.com')
        >>> r = HTTPResponse(200, '' , Headers(), url, url)
        >>> r.set_url('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URL of a HTTPResponse object must be of url.URL type.
        >>> r.set_url(url)
        >>> r.get_url() == url
        True

        :param url: A URL instance.
        :raises TypeError: If `url` is not a URL instance.
        '''
        if isinstance(url, URL):
            self._realurl = url.uri2url()
            return

        raise TypeError('The URL of a HTTPResponse object must be of '
                        'url.URL type.')

    def get_url(self):
        '''
        :return: The URL of this response (`self._realurl`), which set_url()
                 and set_uri() fill with the result of uri2url().
        '''
        return self._realurl

    def set_uri(self, uri):
        '''
        Set the URI of this response. The URL of the response is updated too,
        using the result of uri.uri2url().

        >>> uri = URL('http://www.google.com/')
        >>> r = HTTPResponse(200, '' , Headers(), uri, uri)
        >>> r.set_uri('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URI of a HTTPResponse object must be of url.URL type.
        >>> r.set_uri(uri)
        >>> r.get_uri() == uri
        True

        :param uri: A URL instance.
        :raises TypeError: If `uri` is not a URL instance.
        '''
        if isinstance(uri, URL):
            self._uri = uri
            self._realurl = uri.uri2url()
            return

        raise TypeError('The URI of a HTTPResponse object must be of '
                        'url.URL type.')

    def get_uri(self):
        '''
        :return: The URI of this response, as stored by set_uri().
        '''
        return self._uri

    def was_redirected(self):
        return self._uri != self._redirected_uri

    def set_from_cache(self, fcache):
        '''
        Flag this response as coming (or not) from the local cache.

        :param fcache: True if this response was obtained from the
                       local cache.
        '''
        self._from_cache = fcache

    def get_from_cache(self):
        '''
        :return: True if this response was obtained from the local cache;
                 this is the value stored by set_from_cache().
        '''
        return self._from_cache

    def set_wait_time(self, t):
        '''
        :param t: The wait time to store in `self._time`. Presumably the time
                  between the request and the response, like the `time`
                  parameter of the constructor -- TODO confirm.
        '''
        self._time = t

    def get_wait_time(self):
        '''
        :return: The wait time stored in `self._time` (see set_wait_time()).
        '''
        return self._time

    def set_alias(self, alias):
        '''
        :param alias: The alias to store for this response; see the `alias`
                      parameter of the constructor for its purpose.
        '''
        self._alias = alias

    def get_alias(self):
        '''
        :return: The alias stored for this response (see set_alias()).
        '''
        return self._alias

    def info(self):
        '''
        :return: `self._info`, the same object that get_headers() feeds to
                 the `headers` setter when the headers were not processed
                 yet.
        '''
        return self._info

    def get_status_line(self):
        '''
        :return: The status-line of the response: the 'HTTP/1.1' literal, the
                 response code and the response message separated by SP and
                 terminated by CRLF.
        '''
        fields = ['HTTP/1.1', str(self._code), self._msg]
        return SP.join(fields) + CRLF

    def get_msg(self):
        '''
        :return: The message of the response (`self._msg`), the same value
                 get_status_line() puts after the response code.
        '''
        return self._msg

    def _charset_handling(self):
        '''
        Decode the body based on the header (or metadata) encoding.
        The implemented algorithm follows the encoding detection logic
        used by FF:

            1) First try to find a charset using the following search criteria:
                a) Look in the 'content-type' HTTP header. Example:
                    content-type: text/html; charset=iso-8859-1
                b) Look in the 'meta' HTML header. Example:
                    <meta .* content="text/html; charset=utf-8" />
                c) Determine the charset using the chardet module (TODO)
                d) Use the DEFAULT_CHARSET

            2) Try to decode the body using the found charset. If it fails,
            then force it to use the DEFAULT_CHARSET

        Finally return the unicode (decoded) body and the used charset.

        Note: If the body is already a unicode string return it as it is.
        Bodies of responses without a content-type header, or which are not
        text or html, are returned without being decoded.

        :return: A (body, charset) tuple.
        '''
        lcase_headers = self.get_lower_case_headers()
        charset = self._charset
        rawbody = self._raw_body

        # Only try to decode <str> strings. Skip <unicode> strings
        if type(rawbody) is unicode:
            _body = rawbody
            assert charset is not None, ("HTTPResponse objects containing "
                                         "unicode body must have an associated "
                                         "charset")
        elif 'content-type' not in lcase_headers:
            _body = rawbody
            charset = DEFAULT_CHARSET

            if len(_body):
                msg = "The remote web server failed to send the 'content-type'"\
                      " header in HTTP response with id %s" % self.id
                om.out.debug(msg)

        elif not self.is_text_or_html():
            # Not text, save as it is.
            _body = rawbody
            charset = charset or DEFAULT_CHARSET
        else:
            # Figure out charset to work with
            if not charset:
                charset = self.guess_charset(rawbody, lcase_headers)

            # Now that we have the charset, we use it!
            # The return value of the decode function is a unicode string.
            try:
                _body = smart_unicode(
                    rawbody,
                    charset,
                    errors=ESCAPED_CHAR,
                    on_error_guess=False
                )
            except LookupError:
                # Warn about a buggy charset. The message has to show the
                # charset which is used as a replacement (DEFAULT_CHARSET),
                # not the charset the response was created with.
                msg = ('Charset LookupError: unknown charset: %s; '
                       'ignored and set to default: %s' %
                       (charset, DEFAULT_CHARSET))
                om.out.debug(msg)
                # Forcing it to use the default
                charset = DEFAULT_CHARSET
                _body = smart_unicode(
                    rawbody,
                    charset,
                    errors=ESCAPED_CHAR,
                    on_error_guess=False
                )

        return _body, charset

    def guess_charset(self, rawbody, headers):
        '''
        Guess the charset of the response using, in this order:
            1) The charset declared in the 'content-type' header
            2) The charset declared in a meta tag of the body
            3) DEFAULT_CHARSET

        :param rawbody: The raw (not decoded) body of the response.
        :param headers: The response headers; they are accessed with the
                        lower case 'content-type' key, which must exist.
        :return: The charset name. Charsets extracted from the header or the
                 body are returned in lower case and without surrounding
                 spaces.
        '''
        candidates = (
            # Start with the headers
            (CHARSET_EXTRACT_RE, headers['content-type']),
            # Continue with the body's meta tag
            (CHARSET_META_RE, rawbody),
        )

        for compiled_re, text in candidates:
            # The second positional argument of a compiled pattern's search()
            # is the start position, NOT the flags: passing re.I there made
            # the search start at offset 2 and did not ignore the case. Go
            # through re.search() to really add the IGNORECASE flag.
            charset_mo = re.search(compiled_re.pattern,
                                   text,
                                   compiled_re.flags | re.IGNORECASE)
            if charset_mo:
                return charset_mo.group(1).lower().strip()

        return DEFAULT_CHARSET

    @property
    def content_type(self):
        '''
        The content type of the response
        '''
        if self._content_type is None:
            self.headers = self._info
        return self._content_type or ''

    @property
    def doc_type(self):
        if self._doc_type is None:
            self.headers = self._info
            assert self._doc_type is not None
        return self._doc_type

    def is_text_or_html(self):
        '''
        :return: True if this response is text or html
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_TEXT_OR_HTML

    def is_pdf(self):
        '''
        :return: True if this response is a PDF file
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_PDF

    def is_swf(self):
        '''
        :return: True if this response is a SWF file
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_SWF

    def is_image(self):
        '''
        :return: True if this response is an image file
        '''
        return self.doc_type == HTTPResponse.DOC_TYPE_IMAGE

    def dump_response_head(self):
        '''
        Dump the status line followed by the headers.

        :return: A string with:
            HTTP/1.1 /login.html 200
            Header1: Value1
            Header2: Value2
        If the result is a unicode object it is encoded using the charset
        of the response.
        '''
        status_line = self.get_status_line()
        head = "%s%s" % (status_line, self.dump_headers())

        if type(head) is not unicode:
            return head

        return head.encode(self.charset)

    def dump(self):
        '''
        Return a DETAILED str representation of this HTTP response object:
        the response head, an empty line and the body.
        '''
        payload = self.body

        # Images, pdf and binary responses in general are never decoded
        # to unicode, so only unicode bodies need to be encoded here
        if isinstance(payload, unicode):
            payload = payload.encode(DEFAULT_CHARSET, 'replace')

        head = self.dump_response_head()
        return "%s%s%s" % (head, CRLF, payload)

    def dump_headers(self):
        '''
        :return: a str representation of the headers.
        '''
        if self.headers:
            return CRLF.join(h + ': ' + hv for h, hv in self.headers.items()) + CRLF
        else:
            return ''

    def copy(self):
        '''
        :return: A deep copy of this response, created with copy.deepcopy().
        '''
        return copy.deepcopy(self)

    def __getstate__(self):
        state = self.__dict__.copy()
        state.pop('_body_lock')
        return state
    
    def __setstate__(self, state):
        '''
        Restore the instance attributes from `state` and create a new
        threading.RLock for '_body_lock', the attribute __getstate__()
        removes from the state.

        :param state: The attribute dict, as returned by __getstate__().
        '''
        self.__dict__ = state
        # The lock is not part of the state, each restored instance gets a
        # new one
        self._body_lock = threading.RLock()