Example #1
0
    def get_all_headers(self):
        """
        Build the headers which will be sent to the wire.

        The user specified headers (self._headers) are merged with the ones
        returned by get_post_data_headers(). Header names are matched in a
        case insensitive way; when a header is present in both sources the
        value from get_post_data_headers() wins and the header name which was
        stored first is kept.

        :return: A new Headers instance holding the merged headers.
        """
        merged = Headers()

        user_items = self._headers.items()
        container_items = self.get_post_data_headers().items()

        # The order of the chain() arguments matters: the data container
        # headers come last, so they overwrite the fuzzable request headers.
        #
        # Do not change this without care. A request loaded from spider_man,
        # saved with dump() and loaded again failed when this overwriting
        # was not done (the multipart boundary was incorrect).
        for name, value in chain(user_items, container_items):
            # Case insensitive lookup of an already stored header
            _, existing_name = merged.iget(name, None)
            target_name = name if existing_name is None else existing_name
            merged[target_name] = value

        return merged
Example #2
0
    def get_all_headers(self):
        """
        Build the headers which will be sent to the wire.

        The user specified headers (self._headers) are merged with the ones
        returned by get_post_data_headers(). Header names are matched in a
        case insensitive way; when a header is present in both sources the
        value from get_post_data_headers() wins and the header name which was
        stored first is kept.

        :return: A new Headers instance holding the merged headers.
        """
        merged = Headers()

        user_items = self._headers.items()
        container_items = self.get_post_data_headers().items()

        # The order of the chain() arguments matters: the data container
        # headers come last, so they overwrite the fuzzable request headers.
        #
        # Do not change this without care. A request loaded from spider_man,
        # saved with dump() and loaded again failed when this overwriting
        # was not done (the multipart boundary was incorrect).
        for name, value in chain(user_items, container_items):
            # Case insensitive lookup of an already stored header
            _, existing_name = merged.iget(name, None)
            target_name = name if existing_name is None else existing_name
            merged[target_name] = value

        return merged
Example #3
0
    def test_headers_iget(self):
        """
        iget() finds the header regardless of the case used in the lookup
        and returns both the value and the header name as it was stored.
        """
        headers = Headers([('Abc', 'b')])

        found_value, stored_name = headers.iget('abc')

        self.assertEqual((found_value, stored_name), ('b', 'Abc'))
Example #4
0
File: factory.py Project: EnDe/w3af
def dc_from_hdrs_post(headers, post_data):
    """
    Find the data container which best represents the given post-data.

    Each class in POST_DATA_CONTAINERS is tried in order; the first one
    whose from_postdata() does not raise ValueError / TypeError is used.
    When none of them accepts the input a debug message is written and a
    PlainContainer is returned.

    :param headers: HTTP request headers, most importantly containing the
                    content-type info. None is treated as empty Headers.
    :param post_data: The HTTP request post-data as a string
    :return: The best-match from POST_DATA_CONTAINERS to hold the information
             in self._post_data @ FuzzableRequest
    """
    effective_headers = Headers() if headers is None else headers

    for container_klass in POST_DATA_CONTAINERS:
        try:
            return container_klass.from_postdata(effective_headers, post_data)
        except (ValueError, TypeError):
            # This container can not hold the post-data, try the next one
            continue

    # No specific container matched: log it and use the generic one
    content_type, _ = effective_headers.iget('content-type', 'None')
    msg = 'Unknown post-data. Content-type: "%s" and/or post-data "%s"'
    om.out.debug(msg % (content_type, post_data[:50]))

    return PlainContainer.from_postdata(effective_headers, post_data)
Example #5
0
def dc_from_hdrs_post(headers, post_data):
    """
    Find the data container which best represents the given post-data.

    Each class in POST_DATA_CONTAINERS is tried in order; the first one
    whose from_postdata() does not raise ValueError / TypeError is used.
    When none of them accepts the input a debug message is written and a
    PlainContainer is returned.

    :param headers: HTTP request headers, most importantly containing the
                    content-type info. None is treated as empty Headers.
    :param post_data: The HTTP request post-data as a string
    :return: The best-match from POST_DATA_CONTAINERS to hold the information
             in self._post_data @ FuzzableRequest
    """
    effective_headers = Headers() if headers is None else headers

    for container_klass in POST_DATA_CONTAINERS:
        try:
            return container_klass.from_postdata(effective_headers, post_data)
        except (ValueError, TypeError):
            # This container can not hold the post-data, try the next one
            continue

    # No specific container matched: log it and use the generic one
    content_type, _ = effective_headers.iget('content-type', 'None')
    msg = 'Unknown post-data. Content-type: "%s" and/or post-data "%s"'
    om.out.debug(msg % (content_type, post_data[:50]))

    return PlainContainer.from_postdata(effective_headers, post_data)
Example #6
0
    def get_headers(self):
        """
        Build the request headers for this operation.

        The headers come from the request built by
        _bravado_construct_request(). The Content-Type, which is used later
        to decide how to serialize the body, is set from
        get_consuming_content_type() when it returns a value, and falls back
        to DEFAULT_CONTENT_TYPE when no content type is set but the operation
        has parameters.

        :return: A Headers instance
        """
        spec_headers = self._bravado_construct_request()['headers']
        headers = Headers(spec_headers.items())

        # The 'consumes' content type, when available, takes precedence
        consumed_type = self.get_consuming_content_type()
        if consumed_type is not None:
            headers['Content-Type'] = consumed_type

        current_type, _ = headers.iget('content-type', None)
        needs_default = current_type is None and self.parameters

        if needs_default:
            # Some specification documents have an empty consumes section,
            # either because the operation doesn't receive anything or
            # because the specification is wrong.
            #
            # When there are parameters we serialize them as JSON, which is
            # a safe default
            headers['Content-Type'] = self.DEFAULT_CONTENT_TYPE

        return headers
Example #7
0
    def get_headers(self):
        """
        Build the request headers for this operation.

        The headers come from the request built by
        _bravado_construct_request(). When they don't include a content type
        it is taken from the first entry of the operation's consumes list, or
        set to DEFAULT_CONTENT_TYPE if that list is empty and the operation
        has parameters. The content type is used later to know how to
        serialize the body.

        :return: A Headers instance
        """
        spec_headers = self._bravado_construct_request()['headers']
        headers = Headers(spec_headers.items())

        current_type, _ = headers.iget('content-type', None)
        if current_type is not None:
            # The request already defines how the body is serialized
            return headers

        if self.operation.consumes:
            # The REST API endpoint might support more than one content-type
            # for consuming it. Only the first one is used: in 99% of the
            # cases a vulnerability found using one content-type will be
            # present in the others, and very few vulnerabilities are only
            # exploitable with one specific content-type.
            headers['Content-Type'] = self.operation.consumes[0]

        elif self.parameters:
            # Some specification documents have an empty consumes section,
            # either because the operation doesn't receive anything or
            # because the specification is wrong.
            #
            # When there are parameters we serialize them as JSON, which is
            # a safe default
            headers['Content-Type'] = self.DEFAULT_CONTENT_TYPE

        return headers
Example #8
0
    def test_headers_iget(self):
        """
        iget() finds the header regardless of the case used in the lookup
        and returns both the value and the header name as it was stored.
        """
        headers = Headers([('Abc', 'b')])

        found_value, stored_name = headers.iget('abc')

        self.assertEqual((found_value, stored_name), ('b', 'Abc'))
Example #9
0
    def get_headers(self):
        """
        Build the request headers for this operation.

        The headers come from the request built by
        _bravado_construct_request(). The Content-Type, which is used later
        to decide how to serialize the body, is set from
        get_consuming_content_type() when it returns a value, and falls back
        to DEFAULT_CONTENT_TYPE when no content type is set but the operation
        has parameters.

        :return: A Headers instance
        """
        spec_headers = self._bravado_construct_request()['headers']
        headers = Headers(spec_headers.items())

        # The 'consumes' content type, when available, takes precedence
        consumed_type = self.get_consuming_content_type()
        if consumed_type is not None:
            headers['Content-Type'] = consumed_type

        current_type, _ = headers.iget('content-type', None)
        needs_default = current_type is None and self.parameters

        if needs_default:
            # Some specification documents have an empty consumes section,
            # either because the operation doesn't receive anything or
            # because the specification is wrong.
            #
            # When there are parameters we serialize them as JSON, which is
            # a safe default
            headers['Content-Type'] = self.DEFAULT_CONTENT_TYPE

        return headers
    def get_all_headers(self):
        """
        Build the headers which will be sent to the wire.

        The user specified headers (self._headers) are merged with the ones
        returned by get_post_data_headers(). Header names are matched in a
        case insensitive way and the first value stored for a header is kept,
        which means that user specified headers take precedence.

        :return: A new Headers instance holding the merged headers.
        """
        merged = Headers()

        user_items = self._headers.items()
        container_items = self.get_post_data_headers().items()

        for name, value in chain(user_items, container_items):
            # Only store headers which were not defined before; the user
            # specified headers are first in the chain so they are never
            # replaced by the data container ones
            existing_value, _ = merged.iget(name, None)
            if existing_value is None:
                merged[name] = value

        return merged
Example #11
0
    def get_all_headers(self):
        """
        Build the headers which will be sent to the wire.

        The user specified headers (self._headers) are merged with the ones
        returned by get_post_data_headers(). Header names are matched in a
        case insensitive way and the first value stored for a header is kept,
        which means that user specified headers take precedence.

        :return: A new Headers instance holding the merged headers.
        """
        merged = Headers()

        user_items = self._headers.items()
        container_items = self.get_post_data_headers().items()

        for name, value in chain(user_items, container_items):
            # Only store headers which were not defined before; the user
            # specified headers are first in the chain so they are never
            # replaced by the data container ones
            existing_value, _ = merged.iget(name, None)
            if existing_value is None:
                merged[name] = value

        return merged
Example #12
0
class HTTPResponse(DiskItem):

    # Document type categories stored in self._doc_type; set_headers()
    # chooses one of them based on the response content type
    DOC_TYPE_TEXT_OR_HTML = 'DOC_TYPE_TEXT_OR_HTML'
    DOC_TYPE_SWF = 'DOC_TYPE_SWF'
    DOC_TYPE_PDF = 'DOC_TYPE_PDF'
    DOC_TYPE_IMAGE = 'DOC_TYPE_IMAGE'
    DOC_TYPE_OTHER = 'DOC_TYPE_OTHER'

    # Names of the attributes stored on each HTTPResponse instance
    __slots__ = ('_code', '_charset', '_headers', '_body', '_raw_body',
                 '_binary_response', '_content_type', '_dom', 'id',
                 '_from_cache', '_info', '_realurl', '_uri', '_redirected_url',
                 '_redirected_uri', '_msg', '_time', '_alias', '_doc_type',
                 '_body_lock', '_debugging_id')

    def __init__(self,
                 code,
                 read,
                 headers,
                 geturl,
                 original_url,
                 msg='OK',
                 _id=None,
                 time=DEFAULT_WAIT_TIME,
                 alias=None,
                 charset=None,
                 binary_response=False,
                 set_body=False,
                 debugging_id=None):
        """
        :param code: HTTP code
        :param read: HTTP body text; must be a string (str or unicode)
        :param headers: HTTP headers, must be a Headers instance
        :param geturl: URL object instance, the URL we were redirected to
                       (equal to original_url when there was no redirect)
        :param original_url: URL object instance, the URL that was requested
        :param msg: HTTP message
        :param _id: Optional response identifier
        :param time: The time between the request and the response
        :param alias: Alias for the response, this contains a hash that helps
                      the backend sqlite find http_responses faster by indexing
                      by this attr.
        :param charset: Response's encoding; obligatory when `read` is unicode
        :param binary_response: When True the raw body is kept in memory after
                                the body was decoded
        :param set_body: When True and `read` is unicode, `read` is stored as
                         the already decoded body (used by from_dict)
        :param debugging_id: Optional identifier stored in the response and
                             returned by get_debugging_id()
        :raises TypeError: When geturl, original_url, headers or read have
                           an unexpected type
        """
        if not isinstance(geturl, URL):
            raise TypeError('Invalid type %s for HTTPResponse ctor param geturl.'
                            % type(geturl))

        if not isinstance(original_url, URL):
            raise TypeError('Invalid type %s for HTTPResponse ctor param'
                            ' original_url.' % type(original_url))

        if not isinstance(headers, Headers):
            raise TypeError('Invalid type %s for HTTPResponse ctor param'
                            ' headers.' % type(headers))

        if not isinstance(read, basestring):
            raise TypeError('Invalid type %s for HTTPResponse ctor param read.'
                            % type(read))

        # Body
        #
        # The set_body + unicode case is used for deserialization via
        # from_dict(): the body was already decoded during to_dict() in the
        # get_body() call, so we avoid analyzing it for charset data again.
        already_decoded = set_body and isinstance(read, unicode)
        self._raw_body = read
        self._body = read if already_decoded else None
        self._charset = charset
        self._binary_response = binary_response

        # Headers: the received object is kept in _info, the attributes
        # derived from it are filled the first time they are requested
        self._info = headers
        self._headers = None
        self._content_type = None
        self._doc_type = None

        self._dom = None

        # A unique id identifier for the response
        self.id = _id

        # From cache defaults to False
        self._from_cache = False

        self._code = None
        self.set_code(code)

        # The URL that we really GET'ed
        self._uri = original_url
        self._realurl = original_url.uri2url()

        # The URL where we were redirected to (equal to original_url
        # when no redirect)
        self._redirected_uri = geturl
        self._redirected_url = geturl.uri2url()

        # Set the rest
        self._msg = smart_unicode(msg)
        self._time = time
        self._alias = alias
        self._debugging_id = debugging_id

        # Internal lock, used while decoding the body
        self._body_lock = threading.RLock()

    @classmethod
    def from_httplib_resp(cls,
                          httplibresp,
                          original_url=None,
                          binary_response=False):
        """
        Factory function. Build a HTTPResponse object from a
        httplib.HTTPResponse instance

        :param httplibresp: httplib.HTTPResponse instance
        :param original_url: Optional 'url_object' instance. When it is not
                             provided the URL returned by httplibresp.geturl()
                             is used instead.
        :param binary_response: Passed without changes to the constructor

        :return: A HTTPResponse instance
        """
        status_code = httplibresp.code
        reason = httplibresp.msg
        raw_headers = httplibresp.info()
        body = httplibresp.read()

        headers = Headers(raw_headers.items())

        if original_url:
            final_url = URL(httplibresp.geturl(), original_url.encoding)
            final_url = final_url.url_decode()
        else:
            final_url = URL(httplibresp.geturl())
            original_url = final_url

        # get_wait_time is defined in the keep alive http response object
        if hasattr(httplibresp, 'get_wait_time'):
            wait_time = httplibresp.get_wait_time()
        else:
            wait_time = DEFAULT_WAIT_TIME

        if isinstance(httplibresp, urllib2.HTTPError):
            # This is possible because in errors.py I do:
            # err = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, resp)
            charset_source = httplibresp.fp
        else:
            # The encoding attribute is only set on CachedResponse instances
            charset_source = httplibresp

        charset = getattr(charset_source, 'encoding', None)

        return cls(status_code,
                   body,
                   headers,
                   final_url,
                   original_url,
                   reason,
                   charset=charset,
                   time=wait_time,
                   binary_response=binary_response)

    @classmethod
    def from_dict(cls, unserialized_dict):
        """
        Build a HTTPResponse from the dict representation created by
        to_dict().

        The dict representation exists because:
            * msgpack is MUCH faster than cPickle,
            * msgpack can't serialize python objects

        The same URL is used as requested and redirected URL, and the body
        is stored as already decoded (set_body=True).

        :param unserialized_dict: A dict just as returned by to_dict()
        :return: A HTTPResponse instance
        """
        data = unserialized_dict

        status_code = data['code']
        reason = data['msg']
        raw_headers = data['headers']
        decoded_body = data['body']
        body_charset = data['charset']
        wait_time = data['time']
        response_id = data['id']
        response_url = URL(data['uri'])
        dbg_id = data['debugging_id']

        return cls(status_code,
                   decoded_body,
                   Headers(raw_headers.items()),
                   response_url,
                   response_url,
                   msg=reason,
                   _id=response_id,
                   time=wait_time,
                   charset=body_charset,
                   set_body=True,
                   debugging_id=dbg_id)

    def to_dict(self):
        """
        :return: A dict that represents the current object and is serializable
                 by the json or msgpack modules. from_dict() builds the
                 object back from it.
        """
        serialized = {}

        # The headers are stored as a plain dict
        serialized['headers'] = dict(self.get_headers())
        serialized['code'] = self.get_code()
        serialized['msg'] = self.get_msg()
        serialized['body'] = self.get_body()
        serialized['time'] = self.get_wait_time()
        serialized['id'] = self.get_id()
        serialized['charset'] = self.get_charset()
        serialized['uri'] = self.get_uri().url_string
        serialized['debugging_id'] = self._debugging_id

        return serialized

    def get_eq_attrs(self):
        """
        :return: A tuple with the names of instance attributes. It contains
                 every name in __slots__ except '_dom' and '_body_lock'.

        NOTE(review): presumably consumed by the DiskItem base class; the
        __eq__ defined in this class does not use it. Confirm against
        DiskItem.
        """
        return ('_code', '_charset', '_headers', '_body', '_raw_body',
                '_binary_response', '_content_type', 'id', '_from_cache',
                '_info', '_realurl', '_uri', '_redirected_url',
                '_redirected_uri', '_msg', '_time', '_alias', '_doc_type',
                '_debugging_id')

    def __contains__(self, string_to_test):
        """
        Support for `string in response`: check if the HTTP response body
        contains `string_to_test`.

        :param string_to_test: String to look for in the body
        :return: True if the string is found in the body
        """
        haystack = self.body
        return string_to_test in haystack

    def __eq__(self, other):
        """
        Two responses are equal when their id, HTTP code, headers, body and
        URI are equal. The remaining attributes are not compared.

        Reading the headers and body properties triggers their lazy
        initialization (get_headers / get_body) on both objects.

        :param other: The object to compare with; it needs to provide the
                      id, _code, headers, body and _uri attributes.
        """
        return (self.id == other.id and self._code == other._code
                and self.headers == other.headers and self.body == other.body
                and self._uri == other._uri)

    def __repr__(self):
        """
        :return: A short representation with the HTTP code and URL. The id
                 and the from-cache flag are only included when they are set.
        """
        id_part = ' | id:%s' % self.id if self.id else ''
        cache_part = ' | fcache:True' if self._from_cache else ''

        vals = {
            'code': self.get_code(),
            'url': str(self.get_url()),
            'id': id_part,
            'fcache': cache_part
        }
        return '<HTTPResponse | %(code)s | %(url)s%(id)s%(fcache)s>' % vals

    def set_id(self, _id):
        """
        :param _id: The identifier to store for this response
        """
        self.id = _id

    def get_id(self):
        """
        :return: The response identifier (None when it was never set)
        """
        return self.id

    def set_debugging_id(self, debugging_id):
        """
        :param debugging_id: The debugging identifier to store in the response
        """
        self._debugging_id = debugging_id

    def get_debugging_id(self):
        """
        :return: The debugging identifier stored in the response (None when
                 it was never set)
        """
        return self._debugging_id

    def set_code(self, code):
        """
        :param code: The HTTP response code; stored without validation
        """
        self._code = code

    def get_code(self):
        """
        :return: The HTTP response code
        """
        return self._code

    @staticmethod
    def _quick_hash(text):
        """
        Cheap, non-cryptographic fingerprint of a string.

        :param text: The string to hash
        :return: A string made of hash(text) followed by the adler32
                 checksum of text
        """
        builtin_hash = hash(text)
        checksum = zlib.adler32(text)
        return '%s%s' % (builtin_hash, checksum)

    def get_body_hash(self):
        """
        :return: The _quick_hash() of the response body, after converting it
                 with smart_str_ignore()
        """
        return self._quick_hash(smart_str_ignore(self.get_body()))

    def get_hash(self, exclude_headers=None):
        """
        Hash the response head (status line and headers) together with the
        body.

        :param exclude_headers: Header names passed to dump_response_head()
                                to be left out of the hash. Defaults to an
                                empty list.
        :return: The _quick_hash() of the dumped head followed by the body
        """
        # `[] or exclude_headers` always evaluated to exclude_headers, so the
        # None default was never replaced by an empty list
        exclude_headers = exclude_headers or []

        headers = self.dump_response_head(exclude_headers=exclude_headers)
        body = smart_str_ignore(self.get_body())

        dump = '%s%s' % (headers, body)

        return self._quick_hash(dump)

    def get_body(self):
        """
        Return the decoded body, decoding it the first time it is requested.

        The result of _charset_handling() is stored in _body and _charset.
        Unless the response was created with binary_response=True, the raw
        body is released after decoding to save memory.

        :return: The response body as returned by _charset_handling()
        """
        if self._body is not None:
            return self._body

        with self._body_lock:
            # Check again now that we own the lock: another thread might have
            # decoded the body (and released the raw body) while we were
            # waiting. Decoding a second time would work on a raw body which
            # is already None.
            if self._body is None:
                self._body, self._charset = self._charset_handling()

                # The user wants the raw body, without any modifications /
                # decoding? If not, release it.
                if not self._binary_response:
                    self._raw_body = None

            return self._body

    def set_body(self, body):
        """
        Setter for body.

        The string is stored as the raw body and the decoded body is
        discarded, so it is decoded again in the next call to get_body().

        :param body: A string that represents the body of the HTTP response
        :raises TypeError: When body is not a string
        """
        if not isinstance(body, basestring):
            raise TypeError('Invalid type %s for set_body parameter body.'
                            % type(body))

        # Drop the decoded version, it no longer matches the raw body
        self._body = None
        self._raw_body = body

    # Attribute style access to the body: reading calls get_body() and
    # assigning calls set_body()
    body = property(get_body, set_body)

    def get_raw_body(self):
        """
        Return the raw body as it came from the wire.

        This is useful when we want to parse binary files such as images and
        DS_Store.

        IMPORTANT! Because we want to save some memory the raw body will be set
                   to None after the first call to get_body() or
                   get_charset(), so please use binary_response in your
                   requests in order to let us know that you want the raw body

        :return: The raw body as it came from the wire, or None if it was
                 already released
        """
        return self._raw_body

    def get_clear_text_body(self):
        """
        Shortcut to get the clear text body from the document parser.

        :return: The parser's clear text body, or an empty unicode string
                 when there is no parser for this response
        """
        parser = self.get_parser()

        if parser is None:
            return u''

        return parser.get_clear_text_body()

    def get_parser(self):
        """
        Shortcut to get the parser for this response from the document
        parser cache.

        :return: A DocumentParser instance, or None when the cache raises
                 BaseFrameworkException (no suitable parser was found)
        """
        try:
            parser = parser_cache.dpc.get_document_parser_for(self)
        except BaseFrameworkException:
            # Failed to find a suitable parser for the document
            return None

        return parser

    def get_charset(self):
        """
        Return the charset of the response, finding it out the first time it
        is requested.

        When the charset is not known yet the body is decoded using
        _charset_handling(), which stores both the decoded body and the
        charset. Unless the response was created with binary_response=True,
        the raw body is released after decoding to save memory.

        :return: The charset used for the response body
        """
        if self._charset:
            return self._charset

        with self._body_lock:
            # Check again now that we own the lock: another thread might have
            # decoded the body (and released the raw body) while we were
            # waiting. Decoding a second time would work on a raw body which
            # is already None.
            if not self._charset:
                self._body, self._charset = self._charset_handling()

                # The user wants the raw body, without any modifications /
                # decoding? If not, release it.
                if not self._binary_response:
                    self._raw_body = None

        return self._charset

    def set_charset(self, charset):
        """
        :param charset: The charset to store for this response. The body is
                        not decoded again by this call.
        """
        self._charset = charset

    # Attribute style access to the charset: reading calls get_charset() and
    # assigning calls set_charset()
    charset = property(get_charset, set_charset)

    def set_redir_url(self, ru):
        """
        :param ru: The URL where we were redirected to; stored without
                   validation
        """
        self._redirected_url = ru

    def get_redir_url(self):
        """
        :return: The URL where we were redirected to. The constructor sets it
                 to geturl.uri2url().
        """
        return self._redirected_url

    def set_redir_uri(self, ru):
        """
        :param ru: The URI where we were redirected to; stored without
                   validation
        """
        self._redirected_uri = ru

    def get_redir_uri(self):
        """
        :return: The URI where we were redirected to. The constructor sets it
                 to the geturl parameter.
        """
        return self._redirected_uri

    def get_headers(self):
        """
        Return the response headers, initializing them the first time they
        are requested.

        :return: The headers stored by set_headers()
        """
        if self._headers is None:
            # Assigning to the headers property calls set_headers(), which
            # stores the headers received in the constructor and also sets
            # the content type and document type
            self.headers = self._info
            assert self._headers is not None
        return self._headers

    def set_headers(self, headers):
        """
        Sets the headers and also analyzes them in order to get the response
        mime type (text/html , application/pdf, etc).

        The content type and the document type are derived only from the
        headers received in this call: values set by a previous call are
        replaced, even when the new headers have no (or an invalid) content
        type. In that case the content type is None and the document type is
        DOC_TYPE_OTHER.

        :param headers: The headers dict. A httplib.HTTPMessage is converted
                        to a Headers instance, any other object is stored
                        as received.
        """
        # Fix lowercase in header names from HTTPMessage
        if isinstance(headers, httplib.HTTPMessage):
            self._headers = Headers()
            for header in headers.headers:
                key, value = header.split(':', 1)
                self._headers[key.strip()] = value.strip()
        else:
            self._headers = headers

        content_type_hvalue, _ = self._headers.iget(CONTENT_TYPE, None)

        # Work on local variables and assign the attributes once, at the
        # end. This avoids keeping stale values from a previous call and
        # never exposes an intermediate _doc_type to other threads.
        content_type = None
        doc_type = None

        # we need exactly content type but not charset
        if content_type_hvalue is not None:
            try:
                content_type = content_type_hvalue.split(';', 1)[0]
                content_type = content_type.strip().lower()
            except Exception:
                # The header value is not a string we can parse
                content_type = None
                msg = 'Invalid Content-Type value "%s" sent in HTTP response.'
                om.out.debug(msg % (content_type_hvalue, ))
            else:
                text_words = ('text', 'html', 'xml', 'txt', 'javascript')

                if 'image' in content_type:
                    doc_type = HTTPResponse.DOC_TYPE_IMAGE

                elif 'pdf' in content_type:
                    doc_type = HTTPResponse.DOC_TYPE_PDF

                elif 'x-shockwave-flash' in content_type:
                    doc_type = HTTPResponse.DOC_TYPE_SWF

                elif any(word in content_type for word in text_words):
                    doc_type = HTTPResponse.DOC_TYPE_TEXT_OR_HTML

        self._content_type = content_type

        # None means that none of the previous if statements matched.
        #
        # Note that the attribute is only assigned here, with its final
        # value: setting it to DOC_TYPE_OTHER before the if statements
        # triggered a race condition with threads asking if the _doc_type
        # was != None and that raised all types of errors.
        if doc_type is None:
            doc_type = HTTPResponse.DOC_TYPE_OTHER

        self._doc_type = doc_type

    # Attribute style access to the headers: reading calls get_headers() and
    # assigning calls set_headers()
    headers = property(get_headers, set_headers)

    def get_lower_case_headers(self):
        """
        Return a copy of the headers with lower case header names.

        If the original headers were:
            {'Abc-Def': 'F00N3s'}
        This will return:
            {'abc-def': 'F00N3s'}

        Only the header names change, the values are kept as they are.

        :return: A new Headers instance
        """
        lowered = []

        for name, value in self.headers.iteritems():
            lowered.append((name.lower(), value))

        return Headers(lowered)

    def set_url(self, url):
        """
        Store url.uri2url() as the URL of this response.

        >>> url = URL('http://www.google.com')
        >>> r = HTTPResponse(200, '' , Headers(), url, url)
        >>> r.set_url('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URL of a HTTPResponse object must be of url.URL type.
        >>> r.set_url(url)
        >>> r.get_url() == url
        True

        :param url: URL object instance
        :raises TypeError: When url is not a URL instance
        """
        if not isinstance(url, URL):
            error = 'The URL of a HTTPResponse object must be of url.URL type.'
            raise TypeError(error)

        self._realurl = url.uri2url()

    def get_url(self):
        """
        :return: The URL of this response, which is the result of calling
                 uri2url() on the URI
        """
        return self._realurl

    def get_host(self):
        """
        :return: The result of get_domain() on the response URL
        """
        return self.get_url().get_domain()

    def set_uri(self, uri):
        """
        Store the URI of this response; the URL returned by get_url() is
        updated to uri.uri2url().

        >>> uri = URL('http://www.google.com/')
        >>> r = HTTPResponse(200, '' , Headers(), uri, uri)
        >>> r.set_uri('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URI of a HTTPResponse object must be of url.URL type.
        >>> r.set_uri(uri)
        >>> r.get_uri() == uri
        True

        :param uri: URL object instance
        :raises TypeError: When uri is not a URL instance
        """
        if not isinstance(uri, URL):
            error = 'The URI of a HTTPResponse object must be of url.URL type.'
            raise TypeError(error)

        self._uri = uri
        self._realurl = uri.uri2url()

    def get_uri(self):
        """
        :return: The URI of this response (the original_url received by the
                 constructor, unless set_uri() changed it)
        """
        return self._uri

    def was_redirected(self):
        """
        :return: True when the URI of the response is different from the URI
                 where we were redirected to
        """
        return self._uri != self._redirected_uri

    def set_from_cache(self, fcache):
        """
        Flag the response as coming (or not) from the local cache. The
        constructor sets the flag to False.

        :param fcache: True if this response was obtained from the
                       local cache.
        """
        self._from_cache = fcache

    def get_from_cache(self):
        """
        :return: True if this response was obtained from the local cache
                 (the value stored by set_from_cache, False by default).
        """
        return self._from_cache

    def set_wait_time(self, t):
        """
        :param t: The time between the request and the response
        """
        self._time = t

    def get_wait_time(self):
        """
        :return: The time between the request and the response
        """
        return self._time

    def set_alias(self, alias):
        """
        :param alias: Alias for the response, a hash that helps the backend
                      sqlite find http_responses faster
        """
        self._alias = alias

    def get_alias(self):
        """
        :return: The alias stored for this response (None when it was
                 never set)
        """
        return self._alias

    def info(self):
        """
        :return: The headers object exactly as it was received by the
                 constructor
        """
        return self._info

    def get_status_line(self):
        """
        Return status-line of response.

        :return: The STATUS_LINE template filled with the HTTP code and the
                 HTTP message
        """
        return STATUS_LINE % (self._code, self._msg)

    def get_msg(self):
        """
        :return: The HTTP message, as converted by smart_unicode() in the
                 constructor
        """
        return self._msg

    def _charset_handling(self):
        """
        Decode the body based on the header (or metadata) encoding.
        The implemented algorithm follows the encoding detection logic
        used by FF:

            1) First try to find a charset using the following search criteria:
                a) Look in the CONTENT_TYPE HTTP header. Example:
                    content-type: text/html; charset=iso-8859-1
                b) Look in the 'meta' HTML header. Example:
                    <meta .* content="text/html; charset=utf-8" />
                c) Determine the charset using the chardet module (TODO)
                d) Use the DEFAULT_CHARSET

            2) Try to decode the body using the found charset. If it fails,
            then force it to use the DEFAULT_CHARSET

        Finally return the unicode (decoded) body and the used charset.

        Note: If the body is already a unicode string return it as it is.

        Only responses which are text or html (see is_text_or_html) are
        decoded; when there is no content type header, or the content type
        is not text, the raw body is returned without changes.

        This method does not modify the instance, the caller is responsible
        for storing the result.

        :return: A (body, charset) tuple
        """
        charset = self._charset
        raw_body = self._raw_body
        headers = self.get_headers()
        content_type, _ = headers.iget(CONTENT_TYPE, None)

        # Only try to decode <str> strings. Skip <unicode> strings
        if type(raw_body) is unicode:
            _body = raw_body
            assert charset is not None, (
                "HTTPResponse objects containing "
                "unicode body must have an associated "
                "charset")
        elif content_type is None:
            # No content type: keep the raw body and use the default charset,
            # even if a charset was provided
            _body = raw_body
            charset = DEFAULT_CHARSET

            if _body:
                msg = ('The remote web server failed to send the CONTENT_TYPE'
                       ' header in HTTP response with id %s')
                om.out.debug(msg % self.id)

        elif not self.is_text_or_html():
            # Not text, save as it is.
            _body = raw_body
            charset = charset or DEFAULT_CHARSET
        else:
            # Figure out charset to work with
            if not charset:
                charset = self.guess_charset(raw_body, headers)

            # Now that we have the charset, we use it!
            # The return value of the decode function is a unicode string.
            try:
                _body = smart_unicode(raw_body,
                                      charset,
                                      errors=ESCAPED_CHAR,
                                      on_error_guess=False)
            except LookupError:
                # Warn about a buggy charset
                msg = ('Charset LookupError: unknown charset: %s; '
                       'ignored and set to default: %s' %
                       (charset, DEFAULT_CHARSET))
                om.out.debug(msg)

                # Forcing it to use the default
                charset = DEFAULT_CHARSET
                _body = smart_unicode(raw_body,
                                      charset,
                                      errors=ESCAPED_CHAR,
                                      on_error_guess=False)

        return _body, charset

    def guess_charset(self, raw_body, headers):
        """
        Guess the charset of the response body.

        The charset is searched in this order:
            1) The CONTENT_TYPE header, using CHARSET_EXTRACT_RE
            2) The body's meta tag, using CHARSET_META_RE
            3) DEFAULT_CHARSET

        Both regular expressions are applied ignoring case.

        :param raw_body: The response body as it came from the wire
        :param headers: The response headers (Headers instance)
        :return: The charset name; lower case and stripped when it was
                 extracted from the response
        """
        def search_ignoring_case(compiled_re, text):
            # The second positional argument of a compiled pattern's
            # search() is the start position, not the flags, so passing
            # re.I there only skipped the first two characters of the
            # text. The flag needs to be part of the compiled expression.
            flags = compiled_re.flags | re.IGNORECASE
            return re.search(compiled_re.pattern, text, flags)

        # Start with the headers
        content_type, _ = headers.iget(CONTENT_TYPE, None)

        charset_mo = None
        if content_type is not None:
            charset_mo = search_ignoring_case(CHARSET_EXTRACT_RE, content_type)

        if charset_mo is None and raw_body:
            # Continue with the body's meta tag
            charset_mo = search_ignoring_case(CHARSET_META_RE, raw_body)

        if charset_mo is None:
            return DEFAULT_CHARSET

        return charset_mo.group(1).lower().strip()

    @property
    def content_type(self):
        """
        :return: The content type of the response, or an empty string when
                 no content type is known.
        """
        parsed = self._content_type

        if parsed is None:
            # Assigning to the headers property is what fills _content_type
            self.headers = self._info
            parsed = self._content_type

        return parsed if parsed else ''

    @property
    def doc_type(self):
        """
        :return: The document type of the response. When it is not known
                 yet the headers are (re)assigned from self._info, which is
                 expected to set it.
        """
        if self._doc_type is not None:
            return self._doc_type

        self.headers = self._info
        assert self._doc_type is not None
        return self._doc_type

    def is_text_or_html(self):
        """
        :return: True when the document type of this response is
                 HTTPResponse.DOC_TYPE_TEXT_OR_HTML
        """
        expected = HTTPResponse.DOC_TYPE_TEXT_OR_HTML
        return self.doc_type == expected

    def is_pdf(self):
        """
        :return: True when the document type of this response is
                 HTTPResponse.DOC_TYPE_PDF
        """
        expected = HTTPResponse.DOC_TYPE_PDF
        return self.doc_type == expected

    def is_swf(self):
        """
        :return: True when the document type of this response is
                 HTTPResponse.DOC_TYPE_SWF
        """
        expected = HTTPResponse.DOC_TYPE_SWF
        return self.doc_type == expected

    def is_image(self):
        """
        :return: True when the document type of this response is
                 HTTPResponse.DOC_TYPE_IMAGE
        """
        expected = HTTPResponse.DOC_TYPE_IMAGE
        return self.doc_type == expected

    def dump_response_head(self, exclude_headers=None):
        """
        Build the head of the response: the status line immediately followed
        by the dumped headers.

        :param exclude_headers: Header names which are forwarded to
                                dump_headers() in order to be left out,
                                defaults to an empty list
        :return: A byte-string; when the head is a unicode string it is
                 encoded using the response charset and the 'replace'
                 error handler
        """
        if not exclude_headers:
            exclude_headers = []

        head = '%s%s' % (self.get_status_line(),
                         self.dump_headers(exclude_headers=exclude_headers))

        if not isinstance(head, unicode):
            return head

        return head.encode(self.charset, 'replace')

    def dump(self):
        """
        :return: A detailed str representation of this HTTP response: the
                 response head, a CRLF and the body.
        """
        payload = self.body

        # Only unicode bodies need encoding, bodies which were never decoded
        # (images, pdf and binary responses in general) are used as they are
        if isinstance(payload, unicode):
            payload = payload.encode(self.charset, 'replace')

        head = self.dump_response_head()
        return '%s%s%s' % (head, CRLF, payload)

    def dump_headers(self, exclude_headers=None):
        """
        :param exclude_headers: Names of the headers which should not be
                                included in the output. The comparison is
                                case insensitive.
        :return: a str representation of the headers: one "name: value" per
                 header, CRLF separated and CRLF terminated. An empty string
                 is returned when there are no headers.
        """
        # Header names are lower-cased before being compared, do the same
        # with the names received from the caller; otherwise an entry such
        # as 'Content-Length' would never match anything.
        excluded = set(name.lower() for name in (exclude_headers or []))

        if not self.headers:
            return ''

        return CRLF.join('%s: %s' % (h, hv)
                         for (h, hv) in self.headers.items()
                         if h.lower() not in excluded) + CRLF

    def get_redirect_destination(self):
        """
        :return: The result of joining the response URL with the value of
                 the first of the 'location' / 'uri' headers which yields a
                 valid URL, None when there is no such header.
        """
        headers = self.get_lower_case_headers()

        for name in ('location', 'uri'):
            if name not in headers:
                continue

            raw_value = headers[name].strip()

            try:
                return self.get_url().url_join(raw_value)
            except ValueError:
                # No special invalid URL handling required, try the next
                # header
                continue

        return None

    def does_redirect_outside_target(self):
        """
        :return: True when there is a redirect destination and its protocol
                 or its domain differ from the ones of the originally
                 requested URL, False otherwise
        """
        destination = self.get_redirect_destination()

        if destination is None:
            return False

        # A protocol change is enough, the domain is not even looked at
        if self.get_url().get_protocol() != destination.get_protocol():
            return True

        domain_changed = self.get_url().get_domain() != destination.get_domain()
        return bool(domain_changed)

    def copy(self):
        """
        :return: A deep copy of this object, created with copy.deepcopy()
        """
        duplicate = copy.deepcopy(self)
        return duplicate

    def __getstate__(self):
        """
        :return: A dict with the value of every attribute named in
                 __slots__ except for '_body_lock', which is re-created by
                 __setstate__
        """
        pickled = {}

        for slot_name in self.__slots__:
            pickled[slot_name] = getattr(self, slot_name)

        del pickled['_body_lock']
        return pickled

    def __setstate__(self, state):
        """
        Restore every attribute stored in `state` and create a new lock,
        since the lock is not part of the state built by __getstate__.

        :param state: A dict mapping attribute names to their values
        """
        for attr_name, attr_value in state.iteritems():
            setattr(self, attr_name, attr_value)

        self._body_lock = threading.RLock()
Example #13
0
def http_request_parser(head, postdata):
    """
    This function parses HTTP Requests from a string to a FuzzableRequest.

    :param head: The head of the request.
    :param postdata: The post data of the request
    :return: A FuzzableRequest object with all the corresponding information
             that was sent in head and postdata
    :raises BaseFrameworkException: When the head is empty, the request line
                                    has less than three tokens, a header has
                                    no ":" separator or the URI is invalid.

    NOTE(review): the code available here ends right after the URI is
    validated, the creation of the FuzzableRequest (and the use of
    `postdata`) is not part of it.

    :author: Andres Riancho ([email protected])
    """
    # Parse the request head, the strip() helps us deal with the \r (if any).
    #
    # The emptiness check is done AFTER stripping: a head which ends with
    # the usual blank line ("...\r\n\r\n") contains a '\r' line which is not
    # empty before strip() and used to be reported as an invalid header.
    split_head = [line.strip() for line in head.split('\n')]
    split_head = [line for line in split_head if line]

    if not split_head:
        msg = 'The HTTP request is invalid.'
        raise BaseFrameworkException(msg)

    # Get method, uri, version
    method_uri_version = split_head[0]
    first_line = method_uri_version.split(' ')

    if len(first_line) < 3:
        msg = 'The HTTP request has an invalid <method> <uri> <version>: "%s"'
        raise BaseFrameworkException(msg % method_uri_version)

    elif len(first_line) == 3:
        # Ok, we have something like "GET /foo HTTP/1.0". This is the best case
        # for us!
        method, uri, version = first_line

    else:
        # GET /hello world.html HTTP/1.0
        # Mostly because we are permissive... we are going to try to parse
        # the request: everything between the method and the version is
        # considered to be the URI
        method = first_line[0]
        version = first_line[-1]
        uri = ' '.join(first_line[1:-1])

    check_version_syntax(version)

    # If we got here, we have a nice method, uri, version first line
    # Now we parse the headers (easy!) and finally we send the request
    headers_inst = Headers()

    for header in split_head[1:]:
        one_split_header = header.split(':', 1)
        if len(one_split_header) == 1:
            msg = 'The HTTP request has an invalid header: "%s".'
            raise BaseFrameworkException(msg % header)

        header_name = one_split_header[0].strip()
        header_value = one_split_header[1].strip()

        # Repeated headers are merged into one comma separated value
        if header_name in headers_inst:
            headers_inst[header_name] += ', ' + header_value
        else:
            headers_inst[header_name] = header_value

    host, _ = headers_inst.iget('host', None)

    try:
        uri = URL(check_uri_syntax(uri, host))
    except ValueError as ve:
        raise BaseFrameworkException(str(ve))
Example #14
0
def http_request_parser(head, postdata):
    """
    This function parses HTTP Requests from a string to a FuzzableRequest.

    :param head: The head of the request.
    :param postdata: The post data of the request
    :return: A FuzzableRequest object with all the corresponding information
             that was sent in head and postdata
    :raises BaseFrameworkException: When the head is empty, the request line
                                    has less than three tokens, a header has
                                    no ":" separator or the URI is invalid.

    NOTE(review): the code available here ends right after the URI is
    validated, the creation of the FuzzableRequest (and the use of
    `postdata`) is not part of it.

    :author: Andres Riancho ([email protected])
    """
    # Parse the request head, the strip() helps us deal with the \r (if any).
    #
    # The emptiness check is done AFTER stripping: a head which ends with
    # the usual blank line ("...\r\n\r\n") contains a "\r" line which is not
    # empty before strip() and used to be reported as an invalid header.
    split_head = [line.strip() for line in head.split("\n")]
    split_head = [line for line in split_head if line]

    if not split_head:
        msg = "The HTTP request is invalid."
        raise BaseFrameworkException(msg)

    # Get method, uri, version
    method_uri_version = split_head[0]
    first_line = method_uri_version.split(" ")

    if len(first_line) < 3:
        msg = 'The HTTP request has an invalid <method> <uri> <version>: "%s"'
        raise BaseFrameworkException(msg % method_uri_version)

    elif len(first_line) == 3:
        # Ok, we have something like "GET /foo HTTP/1.0". This is the best case
        # for us!
        method, uri, version = first_line

    else:
        # GET /hello world.html HTTP/1.0
        # Mostly because we are permissive... we are going to try to parse
        # the request: everything between the method and the version is
        # considered to be the URI
        method = first_line[0]
        version = first_line[-1]
        uri = " ".join(first_line[1:-1])

    check_version_syntax(version)

    # If we got here, we have a nice method, uri, version first line
    # Now we parse the headers (easy!) and finally we send the request
    headers_inst = Headers()

    for header in split_head[1:]:
        one_split_header = header.split(":", 1)
        if len(one_split_header) == 1:
            msg = 'The HTTP request has an invalid header: "%s".'
            raise BaseFrameworkException(msg % header)

        header_name = one_split_header[0].strip()
        header_value = one_split_header[1].strip()

        # Repeated headers are merged into one comma separated value
        if header_name in headers_inst:
            headers_inst[header_name] += ", " + header_value
        else:
            headers_inst[header_name] = header_value

    host, _ = headers_inst.iget("host", None)

    try:
        uri = URL(check_uri_syntax(uri, host))
    except ValueError as ve:
        raise BaseFrameworkException(str(ve))
Example #15
0
class HTTPResponse(object):
    """
    An HTTP response: code, message, headers, body and the URLs involved.

    The body received in the constructor is stored as it is and decoded
    lazily: the first call to get_body() or get_charset() runs
    _charset_handling(), stores the result and releases the raw body.
    """

    DOC_TYPE_TEXT_OR_HTML = 'DOC_TYPE_TEXT_OR_HTML'
    DOC_TYPE_SWF = 'DOC_TYPE_SWF'
    DOC_TYPE_PDF = 'DOC_TYPE_PDF'
    DOC_TYPE_IMAGE = 'DOC_TYPE_IMAGE'
    DOC_TYPE_OTHER = 'DOC_TYPE_OTHER'

    __slots__ = ('_code',
                 '_charset',
                 '_headers',
                 '_body',
                 '_raw_body',
                 '_content_type',
                 '_dom',
                 'id',
                 '_from_cache',
                 '_info',
                 '_realurl',
                 '_uri',
                 '_redirected_url',
                 '_redirected_uri',
                 '_msg',
                 '_time',
                 '_alias',
                 '_doc_type',
                 '_body_lock')

    def __init__(self, code, read, headers, geturl, original_url,
                 msg='OK', _id=None, time=DEFAULT_WAIT_TIME, alias=None,
                 charset=None):
        """
        :param code: HTTP code
        :param read: HTTP body text; typically a string
        :param headers: HTTP headers, a Headers instance
        :param geturl: URL object instance
        :param original_url: URL object instance
        :param msg: HTTP message
        :param _id: Optional response identifier
        :param time: The time between the request and the response
        :param alias: Alias for the response, this contains a hash that helps
                      the backend sqlite find http_responses faster by indexing
                      by this attr.
        :param charset: Response's encoding; obligatory when `read` is unicode
        :raises TypeError: When geturl / original_url are not URL instances,
                           headers is not a Headers instance or read is not
                           a string.
        """
        if not isinstance(geturl, URL):
            error = 'Invalid type %s for HTTPResponse ctor param geturl.'
            raise TypeError(error % type(geturl))

        if not isinstance(original_url, URL):
            error = 'Invalid type %s for HTTPResponse ctor param original_url.'
            raise TypeError(error % type(original_url))

        if not isinstance(headers, Headers):
            error = 'Invalid type %s for HTTPResponse ctor param headers.'
            raise TypeError(error % type(headers))

        if not isinstance(read, basestring):
            raise TypeError('Invalid type %s for HTTPResponse ctor param read.'
                            % type(read))

        self._charset = charset
        # _headers, _content_type and _doc_type are filled by set_headers()
        # the first time they are requested
        self._headers = None
        # The decoded body, filled when the body or charset are requested
        self._body = None
        self._raw_body = read
        self._content_type = None
        self._dom = None
        # A unique id identifier for the response
        self.id = _id
        # From cache defaults to False
        self._from_cache = False
        # Set the info
        self._info = headers
        # Set code
        self.set_code(code)

        # Set the URL variables
        # The URL that we really GET'ed
        self._realurl = original_url.uri2url()
        self._uri = original_url
        # The URL where we were redirected to (equal to original_url
        # when no redirect)
        self._redirected_url = geturl
        self._redirected_uri = geturl.uri2url()

        # Set the rest
        self._msg = smart_unicode(msg)
        self._time = time
        self._alias = alias
        self._doc_type = None

        # Internal lock, protects the lazy decoding of the body
        self._body_lock = threading.RLock()

    @classmethod
    def from_httplib_resp(cls, httplibresp, original_url=None):
        """
        Factory function. Build a HTTPResponse object from a
        httplib.HTTPResponse instance

        :param httplibresp: httplib.HTTPResponse instance
        :param original_url: Optional 'url_object' instance.

        :return: A HTTPResponse instance
        """
        resp = httplibresp
        code, msg, hdrs, body = (resp.code, resp.msg, resp.info(), resp.read())
        hdrs = Headers(hdrs.items())

        if original_url:
            url_inst = URL(resp.geturl(), original_url.encoding)
            url_inst = url_inst.url_decode()
        else:
            url_inst = original_url = URL(resp.geturl())

        httplib_time = DEFAULT_WAIT_TIME
        if hasattr(httplibresp, 'get_wait_time'):
            # This is defined in the keep alive http response object
            httplib_time = httplibresp.get_wait_time()

        if isinstance(resp, urllib2.HTTPError):
            # This is possible because in errors.py I do:
            # err = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, resp)
            charset = getattr(resp.fp, 'encoding', None)
        else:
            # The encoding attribute is only set on CachedResponse instances
            charset = getattr(resp, 'encoding', None)

        return cls(code, body, hdrs, url_inst, original_url,
                   msg, charset=charset, time=httplib_time)

    @classmethod
    def from_dict(cls, unserialized_dict):
        """
        * msgpack is MUCH faster than cPickle,
        * msgpack can't serialize python objects,
        * I have to create a dict representation of HTTPResponse to serialize it
        * and a from_dict to have the object back

        :param unserialized_dict: A dict just as returned by to_dict(). The
                                  'charset' key is optional in order to
                                  accept dicts which do not contain it.
        :return: A HTTPResponse instance
        """
        udict = unserialized_dict

        code, msg, hdrs = udict['code'], udict['msg'], udict['headers']
        body, _time, _id = udict['body'], udict['time'], udict['id']

        # The body stored by to_dict() is the decoded one, without the
        # charset a unicode body can't be handled by _charset_handling()
        charset = udict.get('charset')

        headers_inst = Headers(hdrs.items())
        url = URL(udict['uri'])

        return cls(code, body, headers_inst, url, url, msg=msg, _id=_id,
                   time=_time, charset=charset)

    def to_dict(self):
        """
        :return: A dict that represents the current object and is serializable
                 by the json or msgpack modules.
        """
        # Note: The Headers() object can be serialized by msgpack because it
        #       inherits from dict() and doesn't mangle it too much
        serializable_dict = {
            'code': self.get_code(),
            'msg': self.get_msg(),
            'headers': dict(self.get_headers()),
            'body': self.get_body(),
            'time': self.get_wait_time(),
            'id': self.get_id(),
            'uri': self.get_uri().url_string,
        }

        # Requested after the body, which means that it was already set by
        # get_body(). Required by from_dict() to rebuild the response.
        serializable_dict['charset'] = self.get_charset()

        return serializable_dict

    def __contains__(self, string_to_test):
        """
        Determine if the `string_to_test` is contained by the HTTP response
        body.

        :param string_to_test: String to look for in the body
        """
        return string_to_test in self.body

    def __eq__(self, other):
        """
        Two responses are equal when their id, code, headers, body and URI
        are equal.
        """
        if not isinstance(other, HTTPResponse):
            # Let python decide, instead of failing with an AttributeError
            return NotImplemented

        return (self.id == other.id and
                self._code == other._code and
                self.headers == other.headers and
                self.body == other.body and
                self._uri == other._uri)

    def __ne__(self, other):
        """
        Python 2 does not derive != from __eq__, without this method
        `a != b` would not agree with `a == b`.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __repr__(self):
        vals = {
            'code': self.get_code(),
            'url': str(self.get_url()),
            'id': ' | id:%s' % self.id if self.id else '',
            'fcache': ' | fcache:True' if self._from_cache else ''
        }
        return '<HTTPResponse | %(code)s | %(url)s%(id)s%(fcache)s>' % vals

    def set_id(self, _id):
        self.id = _id

    def get_id(self):
        return self.id

    def set_code(self, code):
        self._code = code

    def get_code(self):
        return self._code

    def get_body(self):
        """
        :return: The body of the response. The first call runs
                 _charset_handling(), stores the body and charset it
                 returns and releases the raw body.
        """
        with self._body_lock:
            if self._body is None:
                self._body, self._charset = self._charset_handling()
                # Free 'raw_body'
                self._raw_body = None
            return self._body

    def set_body(self, body):
        """
        Setter for body.

        :param body: A string that represents the body of the HTTP response
        :raises TypeError: When body is not a string
        """
        if not isinstance(body, basestring):
            msg = 'Invalid type %s for set_body parameter body.'
            raise TypeError(msg % type(body))

        # Both attributes need to change together, otherwise get_body()
        # might run between the two assignments
        with self._body_lock:
            self._body = None
            self._raw_body = body

    body = property(get_body, set_body)

    def get_clear_text_body(self):
        """
        Just a shortcut to get the clear text body
        :return: A unicode string
        """
        parser = self.get_parser()
        if parser is not None:
            return parser.get_clear_text_body()

        return u''

    def get_parser(self):
        """
        Just a shortcut to get the parser for this response, we get this from
        the document parser cache.

        :return: A DocumentParser instance or None
        """
        try:
            return parser_cache.dpc.get_document_parser_for(self)
        except BaseFrameworkException:
            # Failed to find a suitable parser for the document
            return None

    def get_charset(self):
        """
        :return: The charset of the response. When it is not set yet
                 _charset_handling() is run, just like get_body() does.
        """
        # Use the same lock as get_body(): both methods read the raw body,
        # store the decoded one and release the raw body.
        with self._body_lock:
            if not self._charset:
                self._body, self._charset = self._charset_handling()
                # Free 'raw_body'
                self._raw_body = None
            return self._charset

    def set_charset(self, charset):
        self._charset = charset

    charset = property(get_charset, set_charset)

    def set_redir_url(self, ru):
        self._redirected_url = ru

    def get_redir_url(self):
        return self._redirected_url

    def set_redir_uri(self, ru):
        self._redirected_uri = ru

    def get_redir_uri(self):
        return self._redirected_uri

    def get_headers(self):
        """
        :return: The response headers, which are set from self._info the
                 first time they are requested.
        """
        if self._headers is None:
            self.headers = self._info
            assert self._headers is not None
        return self._headers

    def set_headers(self, headers):
        """
        Sets the headers and also analyzes them in order to get the response
        mime type (text/html , application/pdf, etc).

        :param headers: The headers dict.
        """
        # Fix lowercase in header names from HTTPMessage
        if isinstance(headers, httplib.HTTPMessage):
            parsed_headers = Headers()
            for header in headers.headers:
                key, value = header.split(':', 1)
                parsed_headers[key.strip()] = value.strip()
            self._headers = parsed_headers
        else:
            self._headers = headers

        content_type_hvalue, _ = self._headers.iget(CONTENT_TYPE, None)

        # we need exactly content type but not charset
        if content_type_hvalue is not None:
            try:
                content_type = content_type_hvalue.split(';', 1)[0]
                content_type = content_type.strip().lower()
            except Exception:
                msg = 'Invalid Content-Type value "%s" sent in HTTP response.'
                om.out.debug(msg % (content_type_hvalue,))
            else:
                self._content_type = content_type

                # Set the doc_type
                if 'image' in content_type:
                    self._doc_type = HTTPResponse.DOC_TYPE_IMAGE

                elif 'pdf' in content_type:
                    self._doc_type = HTTPResponse.DOC_TYPE_PDF

                elif 'x-shockwave-flash' in content_type:
                    self._doc_type = HTTPResponse.DOC_TYPE_SWF

                elif any(word in content_type for word in
                         ('text', 'html', 'xml', 'txt', 'javascript')):
                    self._doc_type = HTTPResponse.DOC_TYPE_TEXT_OR_HTML

        # Check if the doc type is still None, that would mean that none of the
        # previous if statements matched.
        #
        # Note that I'm doing this here and not before the other if statements
        # because that triggered a race condition with threads asking if the
        # _doc_type was != None (which it was because I was setting it to
        # DOC_TYPE_OTHER) and that raised all types of errors.
        if self._doc_type is None:
            self._doc_type = HTTPResponse.DOC_TYPE_OTHER

    headers = property(get_headers, set_headers)

    # NOTE(review): `memoized` is defined elsewhere; presumably the result
    # is cached, which would make it stale when the headers are changed
    # after the first call -- TODO confirm
    @memoized
    def get_lower_case_headers(self):
        """
        If the original headers were:
            {'Abc-Def': 'F00N3s'}
        This will return:
            {'abc-def': 'F00N3s'}

        The only thing that changes is the header name.
        """
        return Headers([(k.lower(), v) for k, v in self.headers.iteritems()])

    def set_url(self, url):
        """
        >>> url = URL('http://www.google.com')
        >>> r = HTTPResponse(200, '' , Headers(), url, url)
        >>> r.set_url('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URL of a HTTPResponse object must be of url.URL type.
        >>> r.set_url(url)
        >>> r.get_url() == url
        True
        """
        if not isinstance(url, URL):
            raise TypeError('The URL of a HTTPResponse object must be of '
                            'url.URL type.')

        self._realurl = url.uri2url()

    def get_url(self):
        return self._realurl

    def get_host(self):
        return self.get_url().get_domain()

    def set_uri(self, uri):
        """
        >>> uri = URL('http://www.google.com/')
        >>> r = HTTPResponse(200, '' , Headers(), uri, uri)
        >>> r.set_uri('http://www.google.com/')
        Traceback (most recent call last):
          ...
        TypeError: The URI of a HTTPResponse object must be of url.URL type.
        >>> r.set_uri(uri)
        >>> r.get_uri() == uri
        True

        """
        if not isinstance(uri, URL):
            raise TypeError('The URI of a HTTPResponse object must be of '
                            'url.URL type.')

        self._uri = uri
        self._realurl = uri.uri2url()

    def get_uri(self):
        return self._uri

    def was_redirected(self):
        return self._uri != self._redirected_uri

    def set_from_cache(self, fcache):
        """
        :param fcache: True if this response was obtained from the
        local cache.
        """
        self._from_cache = fcache

    def get_from_cache(self):
        """
        :return: True if this response was obtained from the local cache.
        """
        return self._from_cache

    def set_wait_time(self, t):
        self._time = t

    def get_wait_time(self):
        return self._time

    def set_alias(self, alias):
        self._alias = alias

    def get_alias(self):
        return self._alias

    def info(self):
        return self._info

    def get_status_line(self):
        """
        Return status-line of response.
        """
        return STATUS_LINE % (self._code, self._msg)

    def get_msg(self):
        return self._msg

    def _charset_handling(self):
        """
        Decode the body based on the header (or metadata) encoding.
        The implemented algorithm follows the encoding detection logic
        used by FF:

            1) First try to find a charset using the following search criteria:
                a) Look in the CONTENT_TYPE HTTP header. Example:
                    content-type: text/html; charset=iso-8859-1
                b) Look in the 'meta' HTML header. Example:
                    <meta .* content="text/html; charset=utf-8" />
                c) Determine the charset using the chardet module (TODO)
                d) Use the DEFAULT_CHARSET

            2) Try to decode the body using the found charset. If it fails,
            then force it to use the DEFAULT_CHARSET

        Finally return the unicode (decoded) body and the used charset.

        Note: If the body is already a unicode string return it as it is.

        :return: A (body, charset) tuple. The body is only decoded for
                 responses with a content type which is text or html.
        """
        headers = self.get_headers()
        content_type, _ = headers.iget(CONTENT_TYPE, None)
        charset = self._charset
        rawbody = self._raw_body

        # Only try to decode <str> strings. Skip <unicode> strings
        if isinstance(rawbody, unicode):
            _body = rawbody
            assert charset is not None, ("HTTPResponse objects containing "
                                         "unicode body must have an associated "
                                         "charset")
        elif content_type is None:
            _body = rawbody
            charset = DEFAULT_CHARSET

            if _body:
                msg = ('The remote web server failed to send the CONTENT_TYPE'
                       ' header in HTTP response with id %s')
                om.out.debug(msg % self.id)

        elif not self.is_text_or_html():
            # Not text, save as it is.
            _body = rawbody
            charset = charset or DEFAULT_CHARSET
        else:
            # Figure out charset to work with
            if not charset:
                charset = self.guess_charset(rawbody, headers)

            # Now that we have the charset, we use it!
            # The return value of the decode function is a unicode string.
            try:
                _body = smart_unicode(rawbody,
                                      charset,
                                      errors=ESCAPED_CHAR,
                                      on_error_guess=False)
            except LookupError:
                # Warn about a buggy charset
                msg = ('Charset LookupError: unknown charset: %s; '
                       'ignored and set to default: %s' %
                       (charset, DEFAULT_CHARSET))
                om.out.debug(msg)

                # Forcing it to use the default
                charset = DEFAULT_CHARSET
                _body = smart_unicode(rawbody,
                                      charset,
                                      errors=ESCAPED_CHAR,
                                      on_error_guess=False)

        return _body, charset

    def guess_charset(self, rawbody, headers):
        """
        Guess the charset of a response using its headers and its body.

        The lookup order is:
            1) The charset found by CHARSET_EXTRACT_RE in the CONTENT_TYPE
               header value
            2) The charset found by CHARSET_META_RE in the raw body
            3) DEFAULT_CHARSET

        :param rawbody: The (undecoded) response body
        :param headers: The object holding the response headers, it must
                        provide a case insensitive iget() method
        :return: The lower-cased and stripped charset name, or
                 DEFAULT_CHARSET when nothing was found
        """
        # Start with the headers
        content_type, _ = headers.iget(CONTENT_TYPE, None)

        # The second positional argument of a compiled pattern's search()
        # is the start position, NOT a flags value: passing re.I there made
        # the search silently skip the first two characters of the string.
        # If case insensitive matching is required the flag has to be set
        # where CHARSET_EXTRACT_RE / CHARSET_META_RE are compiled.
        charset_mo = None
        if content_type is not None:
            charset_mo = CHARSET_EXTRACT_RE.search(content_type)

        if charset_mo is None:
            # Continue with the body's meta tag
            charset_mo = CHARSET_META_RE.search(rawbody)

        if charset_mo is None:
            return DEFAULT_CHARSET

        return charset_mo.groups()[0].lower().strip()

    @property
    def content_type(self):
        """
        The content type of the response, an empty string when it is not
        known.
        """
        if self._content_type is None:
            # Assigning to the headers property is what fills _content_type
            self.headers = self._info
        return self._content_type or ''

    @property
    def doc_type(self):
        """
        The document type of the response: one of the DOC_TYPE_* values.
        """
        if self._doc_type is None:
            self.headers = self._info
            assert self._doc_type is not None
        return self._doc_type

    def is_text_or_html(self):
        """
        :return: True if this response is text or html
        """
        return self.doc_type == HTTPResponse.DOC_TYPE_TEXT_OR_HTML

    def is_pdf(self):
        """
        :return: True if this response is a PDF file
        """
        return self.doc_type == HTTPResponse.DOC_TYPE_PDF

    def is_swf(self):
        """
        :return: True if this response is a SWF file
        """
        return self.doc_type == HTTPResponse.DOC_TYPE_SWF

    def is_image(self):
        """
        :return: True if this response is an image file
        """
        return self.doc_type == HTTPResponse.DOC_TYPE_IMAGE

    def dump_response_head(self):
        """
        :return: A byte-string, as we would send to the wire, containing
                 the status line immediately followed by the dumped
                 headers, one "name: value" per line.
        """
        status_line = self.get_status_line()
        dumped_headers = self.dump_headers()

        dump_head = '%s%s' % (status_line, dumped_headers)

        if isinstance(dump_head, unicode):
            dump_head = dump_head.encode(self.charset, 'replace')

        return dump_head

    def dump(self):
        """
        Return a DETAILED str representation of this HTTP response object.
        """
        body = self.body

        # Images, pdf and binary responses in general are never decoded
        # to unicode
        if isinstance(body, unicode):
            body = body.encode(self.charset, 'replace')

        return "%s%s%s" % (self.dump_response_head(), CRLF, body)

    def dump_headers(self):
        """
        :return: a str representation of the headers: one "name: value" per
                 header, CRLF separated and CRLF terminated. An empty string
                 is returned when there are no headers.
        """
        if not self.headers:
            return ''

        # String formatting (instead of +) also works when a header value
        # is not a string
        return CRLF.join('%s: %s' % (h, hv)
                         for h, hv in self.headers.items()) + CRLF

    def copy(self):
        """
        :return: A deep copy of this response
        """
        return copy.deepcopy(self)

    def __getstate__(self):
        """
        :return: A dict with the value of every attribute named in
                 __slots__ except for '_body_lock', which is re-created by
                 __setstate__
        """
        state = {k: getattr(self, k) for k in self.__slots__}
        state.pop('_body_lock')
        return state

    def __setstate__(self, state):
        """
        Restore every attribute stored in `state` and create a new lock.

        :param state: A dict just as returned by __getstate__
        """
        for k, v in state.iteritems():
            setattr(self, k, v)

        self._body_lock = threading.RLock()
Example #16
0
def http_response_parser(head, postdata):
    """
    This function parses HTTP Responses from a string to an HTTPResponse object.

    :param head: The head of the response: the "<version> <code> <message>"
                 line followed by one "Name: value" header per line. Lines
                 are separated by LF, an optional CR is stripped. Empty and
                 whitespace-only lines are ignored.
    :param postdata: The post data of the response, passed untouched to
                     HTTPResponse
    :return: An HTTPResponse object with all the corresponding information
             that was sent in headers and post-data. A dummy URL is used for
             the URL arguments of HTTPResponse since the response itself
             does not contain one.
    :raises BaseFrameworkException: When the head is empty, the first line
                                    is not "<version> <code> [<message>]",
                                    the code is not an integer or a header
                                    line has no ":" separator. Errors raised
                                    by check_version_syntax() are propagated.
    """
    # Parse the response head, the strip() helps us deal with the \r (if any)
    #
    # The empty-line filter is applied *after* strip(): a line holding only
    # "\r" (the blank line which terminates the head in "...\r\n\r\n") is
    # empty once stripped and must not be parsed as a header.
    stripped_lines = (line.strip() for line in head.split('\n'))
    split_head = [line for line in stripped_lines if line]

    if not split_head:
        msg = 'The HTTP response is invalid.'
        raise BaseFrameworkException(msg)

    # Get version code message
    version_code_message = split_head[0]
    first_line = version_code_message.split(' ', 2)

    if len(first_line) == 3:
        # We have something like "HTTP/1.1 200 OK"
        version, code, message = first_line

    elif len(first_line) == 2:
        # We have something like "HTTP/1.1 503"
        version, code = first_line
        message = ''

    else:
        msg = 'The HTTP response has an invalid <version> <code> <message>: "%s"'
        raise BaseFrameworkException(msg % version_code_message)

    try:
        code = int(code)
    except ValueError:
        raise BaseFrameworkException('Invalid HTTP response code %s' % code)

    check_version_syntax(version)

    # If we got here, we have a nice version code message first line
    # Now we parse the headers (easy!) and finally we create the response
    headers_inst = Headers()

    for header in split_head[1:]:
        # Split on the first ":" only, header values may contain more of them
        header_name, separator, header_value = header.partition(':')

        if not separator:
            msg = ('The HTTP response has an invalid header which does not'
                   ' contain the ":" separator: "%s"')
            raise BaseFrameworkException(msg % header)

        header_name = header_name.strip()
        header_value = header_value.strip()

        if header_name in headers_inst:
            # Handle duplicated headers
            headers_inst[header_name] += ', ' + header_value
        else:
            headers_inst[header_name] = header_value

    # The parsed data has no information about the URL which generated this
    # response, so a placeholder is used
    dummy_url = URL('http://w3af.com')

    return HTTPResponse(code, postdata, headers_inst, dummy_url, dummy_url)