Esempio n. 1
0
    def test_appendlist(self):
        h1 = Headers({'header1': 'value1'})
        h1.appendlist('header1', 'value3')
        self.assertEqual(h1.getlist('header1'), ['value1', 'value3'])

        h1 = Headers()
        h1.appendlist('header1', 'value1')
        h1.appendlist('header1', 'value3')
        self.assertEqual(h1.getlist('header1'), ['value1', 'value3'])
Esempio n. 2
0
    def test_appendlist(self):
        h1 = Headers({'header1': 'value1'})
        h1.appendlist('header1', 'value3')
        self.assertEqual(h1.getlist('header1'), ['value1', 'value3'])

        h1 = Headers()
        h1.appendlist('header1', 'value1')
        h1.appendlist('header1', 'value3')
        self.assertEqual(h1.getlist('header1'), ['value1', 'value3'])
Esempio n. 3
0
    def test_appendlist(self):
        h1 = Headers({"header1": "value1"})
        h1.appendlist("header1", "value3")
        self.assertEqual(h1.getlist("header1"), [b"value1", b"value3"])

        h1 = Headers()
        h1.appendlist("header1", "value1")
        h1.appendlist("header1", "value3")
        self.assertEqual(h1.getlist("header1"), [b"value1", b"value3"])
Esempio n. 4
0
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers() # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def extractHeader(self, header):
        key, val = header.split(':', 1)
        val = val.lstrip()
        self.handleHeader(key, val)
        if key.lower() == 'content-length':
            self.length = int(val)

    def lineReceived(self, line):
        try:
            HTTPClient.lineReceived(self, line.rstrip())
        except:
            self.factory.add_invalid_header(line)

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(\
                defer.TimeoutError("Getting %s took longer than %s seconds." % \
                (self.factory.url, self.factory.timeout)))
Esempio n. 5
0
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = b'\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()

        # transport cleanup needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()

        self.factory.noPage(\
                defer.TimeoutError("Getting %s took longer than %s seconds." % \
                (self.factory.url, self.factory.timeout)))
Esempio n. 6
0
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = b'\n'

    def connectionMade(self):
        self.headers = Headers() # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()

        # transport cleanup needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()

        self.factory.noPage(\
                defer.TimeoutError("Getting %s took longer than %s seconds." % \
                (self.factory.url, self.factory.timeout)))
Esempio n. 7
0
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers() # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        elif self.length != None and self.length != 0:
            self.factory.noPage(failure.Failure(
                PartialDownloadError(self.factory.status, None, response)))
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(\
                defer.TimeoutError("Getting %s took longer than %s seconds." % \
                (self.factory.url, self.factory.timeout)))
Esempio n. 8
0
    def test_netscape_example_2(self):
        # Second Example transaction sequence:
        #
        # Assume all mappings from above have been cleared.
        #
        # Client receives:
        #
        #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        # Client receives:
        #
        #       Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo
        #
        # When client requests a URL in path "/ammo" on this server, it sends:
        #
        #       Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        #       NOTE: There are two name/value pairs named "PART_NUMBER" due to
        #       the inheritance of the "/" mapping in addition to the "/ammo" mapping.

        c = CookieJar()
        headers = Headers({'Set-Cookie': 'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/'})

        req = Request("http://www.acme.com/")
        res = Response("http://www.acme.com/", headers=headers)

        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/")
        c.add_cookie_header(req)

        self.assertEquals(req.headers.get("Cookie"), "PART_NUMBER=ROCKET_LAUNCHER_0001")

        headers.appendlist("Set-Cookie", "PART_NUMBER=RIDING_ROCKET_0023; path=/ammo")
        res = Response("http://www.acme.com/", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/ammo")
        c.add_cookie_header(req)

        self.assert_(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*"
                               "PART_NUMBER=ROCKET_LAUNCHER_0001",
                               req.headers.get("Cookie")))
Esempio n. 9
0
    def test_session_cookies(self):
        year_plus_one = time.localtime()[0] + 1

        # Check session cookies are deleted properly by
        # CookieJar.clear_session_cookies method

        req = Request('http://www.perlmeister.com/scripts')
        headers = Headers()
        headers.appendlist("Set-Cookie", "s1=session;Path=/scripts")
        headers.appendlist("Set-Cookie", "p1=perm; Domain=.perlmeister.com;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
        headers.appendlist("Set-Cookie", "p2=perm;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
        headers.appendlist("Set-Cookie", "s2=session;Path=/scripts;" "Domain=.perlmeister.com")
        headers.appendlist('Set-Cookie2', 's3=session;Version=1;Discard;Path="/"')
        res = Response('http://www.perlmeister.com/scripts', headers=headers)

        c = CookieJar()
        c.extract_cookies(res, req)
        # How many session/permanent cookies do we have?
        counter = {"session_after": 0,
                   "perm_after": 0,
                   "session_before": 0,
                   "perm_before": 0}
        for cookie in c:
            key = "%s_before" % cookie.value
            counter[key] = counter[key] + 1
        c.clear_session_cookies()
        # How many now?
        for cookie in c:
            key = "%s_after" % cookie.value
            counter[key] = counter[key] + 1

        self.assert_(not (
            # a permanent cookie got lost accidently
            counter["perm_after"] != counter["perm_before"] or
            # a session cookie hasn't been cleared
            counter["session_after"] != 0 or
            # we didn't have session cookies in the first place
            counter["session_before"] == 0))
Esempio n. 10
0
    def test_netscape_misc(self):
        # Some additional Netscape cookies tests.
        c = CookieJar()
        headers = Headers()
        req = Request("http://foo.bar.acme.com/foo")

        # Netscape allows a host part that contains dots
        headers.appendlist("Set-Cookie", "Customer=WILE_E_COYOTE; domain=.acme.com")
        res = Response("http://www.acme.com/foo", headers=headers)
        c.extract_cookies(res, req)

        # and that the domain is the same as the host without adding a leading
        # dot to the domain.  Should not quote even if strange chars are used
        # in the cookie value.
        headers.appendlist("Set-Cookie", "PART_NUMBER=3,4; domain=foo.bar.acme.com")
        res = Response("http://www.acme.com/foo", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://foo.bar.acme.com/foo")
        c.add_cookie_header(req)
        self.assert_(
            "PART_NUMBER=3,4" in req.headers.get("Cookie") and
            "Customer=WILE_E_COYOTE" in req.headers.get("Cookie"))
Esempio n. 11
0
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers() # bucket for response headers

        if self.factory.use_tunnel:
            log.msg("Sending CONNECT", log.DEBUG)
            self.tunnel_started = False
            self.sendCommand("CONNECT", "%s:%s"
                % (self.factory.tunnel_to_host, self.factory.tunnel_to_port))
            self.sendHeaders(only=['Host','Proxy-Connection', 'User-Agent'])
            del self.factory.headers['Proxy-Connection']
        else:
            self.sendEverything()


    def sendCommand(self, command, path):
        if self.factory.use_tunnel and not self.tunnel_started:
            http_version = "1.1"
        else:
            http_version = "1.0"
        self.transport.write('%s %s HTTP/%s\r\n' % (command, path, http_version))

    def sendEverything(self):
        self.sendMethod()
        self.sendHeaders()
        self.sendBody()

    def sendMethod(self):
        # Method command
        self.sendCommand(self.factory.method, self.factory.path)

    def sendHeaders(self, only=None):
        # Note: it's a Headers object, not a dict
        keys = only if only is not None else self.factory.headers.keys()
        for key in keys:
            for value in self.factory.headers.getlist(key):
                self.sendHeader(key, value)
        self.endHeaders()

    def sendBody(self):
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        if self.factory.use_tunnel and not self.tunnel_started: log.msg("LINE: %s" % line)
        if self.factory.use_tunnel and not self.tunnel_started and not line.rstrip():
            # End of headers from the proxy in response to our CONNECT request
            # Skip the call to HTTPClient.lienReceived for now, since otherwise
            # it would switch to row mode.
            self.startTunnel()
        else:
            return HTTPClient.lineReceived(self, line.rstrip())

    def startTunnel(self):
        log.msg("starting Tunnel")

        # We'll get a new batch of headers through the tunnel. This sets us
        # up to capture them.
        self.firstLine = True
        self.tunnel_started = True

        # Switch to SSL
        ctx = ClientContextFactory()
        self.transport.startTLS(ctx, self.factory)

        # And send the normal request:
        self.sendEverything()


    def handleHeader(self, key, value):
        if self.factory.use_tunnel and not self.tunnel_started:
             pass # maybe log headers for CONNECT request?
        else:
             self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        if self.factory.use_tunnel and not self.tunnel_started:
             self.tunnel_status = status
        else:
             self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(\
                defer.TimeoutError("Getting %s took longer than %s seconds." % \
                (self.factory.url, self.factory.timeout)))
Esempio n. 12
0
    def test_netscape_example_1(self):
        #-------------------------------------------------------------------
        # First we check that it works for the original example at
        # http://www.netscape.com/newsref/std/cookie_spec.html

        # Client requests a document, and receives in the response:
        #
        #       Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE
        #
        # Client requests a document, and receives in the response:
        #
        #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        # Client receives:
        #
        #       Set-Cookie: SHIPPING=FEDEX; path=/fo
        #
        # When client requests a URL in path "/" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
        #
        # When client requests a URL in path "/foo" on this server, it sends:
        #
        #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX
        #
        # The last Cookie is buggy, because both specifications say that the
        # most specific cookie must be sent first.  SHIPPING=FEDEX is the
        # most specific and should thus be first.

        year_plus_one = time.localtime()[0] + 1

        c = CookieJar(DefaultCookiePolicy(rfc2965 = True))

        #req = Request("http://1.1.1.1/",
        #              headers={"Host": "www.acme.com:80"})
        req = Request("http://www.acme.com:80/", headers={"Host": "www.acme.com:80"})

        headers = Headers()
        headers['Set-Cookie'] = 'CUSTOMER=WILE_E_COYOTE; path=/ ; expires=Wednesday, 09-Nov-%d 23:12:40 GMT' % year_plus_one
        res = Response("http://www.acme.com/", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/")
        c.add_cookie_header(req)

        self.assertEqual(req.headers.get("Cookie"), "CUSTOMER=WILE_E_COYOTE")
        self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

        headers.appendlist("Set-Cookie", "PART_NUMBER=ROCKET_LAUNCHER_0001; path=/")
        res = Response("http://www.acme.com/", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/foo/bar")
        c.add_cookie_header(req)

        h = req.headers.get("Cookie")
        self.assert_("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                     "CUSTOMER=WILE_E_COYOTE" in h)

        headers.appendlist('Set-Cookie', 'SHIPPING=FEDEX; path=/foo')
        res = Response("http://www.acme.com", headers=headers)
        c.extract_cookies(res, req)

        req = Request("http://www.acme.com/")
        c.add_cookie_header(req)

        h = req.headers.get("Cookie")
        self.assert_("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                     "CUSTOMER=WILE_E_COYOTE" in h and
                     "SHIPPING=FEDEX" not in h)

        req = Request("http://www.acme.com/foo/")
        c.add_cookie_header(req)

        h = req.headers.get("Cookie")
        self.assert_(("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and
                      "CUSTOMER=WILE_E_COYOTE" in h and
                      h.startswith("SHIPPING=FEDEX;")))