Beispiel #1
0
    def _process(self, item):
        """Run the pre-fetch checks for one crawl item and build its report.

        Reads ``item["url"]``, converts it with ``httplib2.iri2uri`` and
        prepares a report dict with the keys ``url``, ``result``,
        ``status_code`` and ``visited``.  The report is returned early, with
        ``result`` set to a message, when:

        * the URI has no scheme or no authority ("Invalid URI"),
        * quoting the URL path raises ``KeyError``
          ("Malformed URL quoting."),
        * the robots check raises ``CrawlError`` (the error as unicode).

        ``Stop`` is raised when ``self.ask_robots`` returns ``None``.

        NOTE(review): the visible code ends after the robots check, so the
        success path falls off the end of the function. The listing is
        presumably truncated -- confirm against the full source.
        """
        url = item["url"]
        log.debug(u"Crawling: %s", url)
        uri = httplib2.iri2uri(url)
        # Only "url" is filled in up front; the other fields start as None.
        report = {"url": url, "result": None, "status_code": None, "visited": None}

        # NOTE(review): not read anywhere in the visible part of the function.
        total_start_time = time.time()

        (scheme, authority, _path, _query, _fragment) = httplib2.parse_uri(uri)
        if scheme is None or authority is None:
            report["result"] = u"Invalid URI"
            return report

        try:
            # This line is copied from robotsparser.py:can_fetch. The quoted
            # value is discarded; the call only serves to surface a KeyError
            # here, where it can be turned into a report entry.
            urllib.quote(urlparse.urlparse(urllib.unquote(url))[2])
        except KeyError:
            report["result"] = u"Malformed URL quoting."
            return report

        try:
            robot_check_result = self.ask_robots(uri, scheme, authority)
            # Graceful stop: a None answer aborts processing via Stop.
            # NOTE(review): if Stop derives from CrawlError it is caught just
            # below instead of propagating -- its definition is not visible.
            if robot_check_result is None:
                raise Stop()
        except CrawlError, e:
            report["result"] = unicode(e)
            return report
Beispiel #2
0
    def _follow_redirect(self, uri, method, body, headers, response, content, max_redirects):
        """Internal function to follow a redirect received by L{request}.

        Resolves a relative Location header against C{uri}, stores a
        permanent (301) redirect of a GET/HEAD request in the cache, removes
        the conditional request headers and re-issues the request through
        C{self.request} with C{max_redirects} reduced by one.

        @param uri: URI of the request that was answered with the redirect.
        @param method: HTTP method of that request.
        @param body: request body, forwarded unchanged.
        @param headers: request headers; 'if-none-match' and
            'if-modified-since' are removed in place.
        @param response: the redirect response, read by key and through
            C{.status}; its 'location', '-x-permanent-redirect-url' and
            'content-location' entries may be set in place.
        @param content: body of the redirect response.
        @param max_redirects: remaining redirect budget; not checked here,
            only decremented for the follow-up request.
        @return: whatever C{self.request} returns for the redirect target.
        @raise httplib2.RedirectMissingLocation: the response has no Location
            header and its status is not 300.
        @raise httplib2.RedirectLimit: the response has no Location header
            and its status is 300.
        """
        # Only the absolute and the defragmented form of the URI are used.
        (_scheme, _authority, absolute_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
        cachekey = defrag_uri if self.cache else None

        # Pick out the location header and basically start from the beginning
        # remembering first to strip the ETag header and decrement our 'depth'
        if 'location' not in response and response.status != 300:
            raise httplib2.RedirectMissingLocation("Redirected but the response is missing a Location: header.", response, content)
        # Fix-up relative redirects (which violate an RFC 2616 MUST)
        if 'location' in response:
            location = response['location']
            (_scheme, authority, _path, _query, _fragment) = httplib2.parse_uri(location)
            if authority is None:
                response['location'] = httplib2.urlparse.urljoin(uri, location)
                logging.debug('Relative redirect: changed [%s] to [%s]', location, response['location'])
        if response.status == 301 and method in ["GET", "HEAD"]:
            # A 301 always carries a Location here: the check above raised
            # for every status other than 300 that lacks one.
            response['-x-permanent-redirect-url'] = response['location']
            if 'content-location' not in response:
                response['content-location'] = absolute_uri
            httplib2._updateCache(headers, response, content, self.cache, cachekey)

        # The follow-up request must not be conditional on the old resource.
        headers.pop('if-none-match', None)
        headers.pop('if-modified-since', None)

        if 'location' not in response:
            # NOTE(review): only reachable for a 300 without Location, so the
            # message is misleading; kept as-is because callers may match it.
            raise httplib2.RedirectLimit("Redirected more times than redirection_limit allows.", response, content)

        location = response['location']
        # A 303 turns every method other than GET/HEAD into a GET.
        if response.status == 303 and method not in ["GET", "HEAD"]:
            redirect_method = "GET"
        else:
            redirect_method = method
        return self.request(location, redirect_method, body=body, headers=headers, max_redirects=max_redirects - 1)
Beispiel #3
0
 def testFromStd66(self):
     """Check httplib2.parse_uri against a fixed table of URIs.

     Each entry pairs the expected (scheme, authority, path, query,
     fragment) tuple with the URI handed to httplib2.parse_uri.
     """
     expectations = (
         (('http', 'example.com', '', None, None),
          "http://example.com"),
         (('https', 'example.com', '', None, None),
          "https://example.com"),
         (('https', 'example.com:8080', '', None, None),
          "https://example.com:8080"),
         (('http', 'example.com', '/', None, None),
          "http://example.com/"),
         (('http', 'example.com', '/path', None, None),
          "http://example.com/path"),
         (('http', 'example.com', '/path', 'a=1&b=2', None),
          "http://example.com/path?a=1&b=2"),
         (('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
          "http://example.com/path?a=1&b=2#fred"),
         # The final case is listed twice, exactly as in the original test.
         (('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
          "http://example.com/path?a=1&b=2#fred"),
     )
     for expected, uri in expectations:
         self.assertEqual(expected, httplib2.parse_uri(uri))
Beispiel #4
0
    def _follow_redirect(self, uri, method, body, headers, response, content,
                         max_redirects):
        """Re-issue a request at the target of a redirect response.

        A relative Location header is resolved against C{uri}. For a 301
        answer to a GET or HEAD request the response is handed to
        C{httplib2._updateCache}. The conditional request headers are removed
        from C{headers} before the follow-up request is made through
        C{self.request} with C{max_redirects} reduced by one.

        Raises C{httplib2.RedirectMissingLocation} when the response has no
        Location header and is not a 300, and C{httplib2.RedirectLimit} when
        there is no Location header to follow.
        """
        (_, _, absolute_uri,
         defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
        cachekey = defrag_uri if self.cache else None

        # Only a 300 response may come without a redirect target.
        if response.status != 300 and "location" not in response:
            raise httplib2.RedirectMissingLocation(
                "Redirected but the response is missing a Location: header.",
                response, content)

        # Relative redirects violate an RFC 2616 MUST; make them absolute.
        if "location" in response:
            target = response['location']
            (_, target_authority, _, _, _) = httplib2.parse_uri(target)
            if target_authority is None:
                response['location'] = httplib2.urlparse.urljoin(uri, target)
                message = u"Relative redirect: changed [%s] to [%s]" % (
                    target, response['location'])
                pywikibot.debug(message, _logger)

        is_get_or_head = method in ["GET", "HEAD"]
        if response.status == 301 and is_get_or_head:
            response['-x-permanent-redirect-url'] = response['location']
            if "content-location" not in response:
                response['content-location'] = absolute_uri
            httplib2._updateCache(headers, response, content, self.cache,
                                  cachekey)

        # Drop the conditional headers before the follow-up request.
        for conditional in ('if-none-match', 'if-modified-since'):
            headers.pop(conditional, None)

        if "location" not in response:
            raise httplib2.RedirectLimit(
                "Redirected more times than redirection_limit allows.",
                response, content)

        # A 303 answer to anything but GET/HEAD is followed with a GET.
        if response.status == 303 and not is_get_or_head:
            redirect_method = "GET"
        else:
            redirect_method = method
        return self.request(response['location'],
                            redirect_method,
                            body=body,
                            headers=headers,
                            max_redirects=max_redirects - 1)
Beispiel #5
0
def test_from_std66():
    """Check httplib2.parse_uri against a table of URIs.

    Every entry maps a URI to the (scheme, authority, path, query, fragment)
    tuple that httplib2.parse_uri is expected to return for it.
    """
    expectations = [
        ('http://example.com',
         ('http', 'example.com', '', None, None)),
        ('https://example.com',
         ('https', 'example.com', '', None, None)),
        ('https://example.com:8080',
         ('https', 'example.com:8080', '', None, None)),
        ('http://example.com/',
         ('http', 'example.com', '/', None, None)),
        ('http://example.com/path',
         ('http', 'example.com', '/path', None, None)),
        ('http://example.com/path?a=1&b=2',
         ('http', 'example.com', '/path', 'a=1&b=2', None)),
        ('http://example.com/path?a=1&b=2#fred',
         ('http', 'example.com', '/path', 'a=1&b=2', 'fred')),
        # The last case appears twice, as in the original table.
        ('http://example.com/path?a=1&b=2#fred',
         ('http', 'example.com', '/path', 'a=1&b=2', 'fred')),
    ]
    for uri, expected in expectations:
        assert httplib2.parse_uri(uri) == expected
Beispiel #6
0
def test_from_std66():
    """Check httplib2.parse_uri on a fixed list of URIs.

    ``uris`` and ``parsed`` are parallel sequences: the tuple at position i
    of ``parsed`` is the (scheme, authority, path, query, fragment) result
    expected for the URI at position i of ``uris``.
    """
    uris = (
        'http://example.com',
        'https://example.com',
        'https://example.com:8080',
        'http://example.com/',
        'http://example.com/path',
        'http://example.com/path?a=1&b=2',
        'http://example.com/path?a=1&b=2#fred',
        'http://example.com/path?a=1&b=2#fred',
    )
    parsed = (
        ('http', 'example.com', '', None, None),
        ('https', 'example.com', '', None, None),
        ('https', 'example.com:8080', '', None, None),
        ('http', 'example.com', '/', None, None),
        ('http', 'example.com', '/path', None, None),
        ('http', 'example.com', '/path', 'a=1&b=2', None),
        ('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
        ('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
    )
    for uri, expected in zip(uris, parsed):
        assert httplib2.parse_uri(uri) == expected
Beispiel #7
0
def test_from_std66():
    """Check that httplib2.parse_uri splits each URI as expected.

    The table lists (uri, expected) pairs, where ``expected`` is the
    (scheme, authority, path, query, fragment) tuple for that URI.
    """
    parse = httplib2.parse_uri
    table = [
        ("http://example.com",
         ("http", "example.com", "", None, None)),
        ("https://example.com",
         ("https", "example.com", "", None, None)),
        ("https://example.com:8080",
         ("https", "example.com:8080", "", None, None)),
        ("http://example.com/",
         ("http", "example.com", "/", None, None)),
        ("http://example.com/path",
         ("http", "example.com", "/path", None, None)),
        ("http://example.com/path?a=1&b=2",
         ("http", "example.com", "/path", "a=1&b=2", None)),
        ("http://example.com/path?a=1&b=2#fred",
         ("http", "example.com", "/path", "a=1&b=2", "fred")),
        # Same case twice, as in the original table.
        ("http://example.com/path?a=1&b=2#fred",
         ("http", "example.com", "/path", "a=1&b=2", "fred")),
    ]
    for uri, expected in table:
        actual = parse(uri)
        assert actual == expected
Beispiel #8
0
def test_from_std66():
    """Compare httplib2.parse_uri output with hand-written expectations.

    Each case is a (uri, parts) pair; ``parts`` is the expected
    (scheme, authority, path, query, fragment) tuple.
    """
    uri_cases = (
        ("http://example.com",
            ("http", "example.com", "", None, None)),
        ("https://example.com",
            ("https", "example.com", "", None, None)),
        ("https://example.com:8080",
            ("https", "example.com:8080", "", None, None)),
        ("http://example.com/",
            ("http", "example.com", "/", None, None)),
        ("http://example.com/path",
            ("http", "example.com", "/path", None, None)),
        ("http://example.com/path?a=1&b=2",
            ("http", "example.com", "/path", "a=1&b=2", None)),
        ("http://example.com/path?a=1&b=2#fred",
            ("http", "example.com", "/path", "a=1&b=2", "fred")),
        # Listed twice in the original table as well.
        ("http://example.com/path?a=1&b=2#fred",
            ("http", "example.com", "/path", "a=1&b=2", "fred")),
    )
    for uri, parts in uri_cases:
        assert httplib2.parse_uri(uri) == parts
Beispiel #9
0
  def OverrideRequest(self, conn, host, absolute_uri, request_uri, method,
                      body, headers, redirections, cachekey):
    """Do the actual request using the connection object.

    Also follow one level of redirects if necessary.

    The request is sent with self._conn_request. It is re-sent when the
    selected authorization asks for it, and again for each authorization
    produced by self._auth_from_challenge while the answer is a 401.
    Redirect responses are followed through self.request; 200/203 answers
    to GET/HEAD are handed to httplib2._updateCache.

    Args:
      conn: connection object passed to self._conn_request; its class is
        forwarded as connection_type when a redirect is followed.
      host: host used to select the authorizations that are in scope.
      absolute_uri: absolute form of the request URI; relative Location
        headers are resolved against it and it is the default
        'content-location'.
      request_uri: URI passed to the connection and to the authorizations.
      method: HTTP method name.
      body: request body; replaced by None when a 302/303 is followed.
      headers: request headers. 'if-none-match', 'if-modified-since' and
        (unless self.forward_authorization_headers is set) 'authorization'
        are deleted in place before a redirect is followed.
      redirections: number of redirects that may still be followed.
      cachekey: cache key passed to httplib2._updateCache.

    Returns:
      A (response, content) tuple. After a followed redirect,
      response.previous holds a copy of the redirect response.

    Raises:
      httplib2.RedirectMissingLocation: a redirect other than 300 has no
        Location header.
      httplib2.RedirectLimit: a redirect should be followed but
        redirections is exhausted.
    """

    # Use the in-scope authorization that sorts first on (depth, auth).
    auths = [(auth.depth(request_uri), auth) for auth in self.authorizations
             if auth.inscope(host, request_uri)]
    auth = sorted(auths)[0][1] if auths else None
    if auth:
      auth.request(method, request_uri, headers, body)

    (response, content) = self._conn_request(conn, request_uri, method, body,
                                             headers)

    # Re-send once when the authorization object asks for it.
    if auth and auth.response(response, body):
      auth.request(method, request_uri, headers, body)
      (response, content) = self._conn_request(conn, request_uri, method,
                                               body, headers)
      response._stale_digest = 1

    if response.status == 401:
      # Try the challenge-derived authorizations until one is accepted; the
      # accepted one is remembered for later requests.
      for authorization in self._auth_from_challenge(
          host, request_uri, headers, response, content):
        authorization.request(method, request_uri, headers, body)
        (response, content) = self._conn_request(conn, request_uri, method,
                                                 body, headers)
        if response.status != 401:
          self.authorizations.append(authorization)
          authorization.response(response, body)
          break

    if (self.follow_all_redirects or (method in ["GET", "HEAD"])
        or response.status == 303):
      if self.follow_redirects and response.status in [300, 301, 302,
                                                       303, 307]:
        # Pick out the location header and basically start from the beginning
        # remembering first to strip the ETag header and decrement our 'depth'
        if redirections:
          if 'location' not in response and response.status != 300:
            raise httplib2.RedirectMissingLocation(
                "Redirected but the response is missing a Location: header.",
                response, content)
          # Fix-up relative redirects (which violate an RFC 2616 MUST)
          if 'location' in response:
            location = response['location']
            (_, authority, _, _, _) = parse_uri(location)
            if authority is None:
              response['location'] = urlparse.urljoin(absolute_uri, location)
          if response.status == 301 and method in ["GET", "HEAD"]:
            response['-x-permanent-redirect-url'] = response['location']
            if 'content-location' not in response:
              response['content-location'] = absolute_uri
            httplib2._updateCache(headers, response, content, self.cache,
                                  cachekey)
          # The follow-up request must not be conditional on the old URI.
          if 'if-none-match' in headers:
            del headers['if-none-match']
          if 'if-modified-since' in headers:
            del headers['if-modified-since']
          if ('authorization' in headers and
              not self.forward_authorization_headers):
            del headers['authorization']
          if 'location' in response:
            location = response['location']
            # Keep a copy of the redirect response for response.previous.
            old_response = copy.deepcopy(response)
            if 'content-location' not in old_response:
              old_response['content-location'] = absolute_uri
            redirect_method = method
            if response.status in [302, 303]:
              # 302/303 are followed with a body-less GET.
              redirect_method = "GET"
              body = None
            (response, content) = self.request(
                location, redirect_method, body=body, headers=headers,
                redirections=redirections-1,
                connection_type=conn.__class__)
            response.previous = old_response
        else:
          raise httplib2.RedirectLimit(
              "Redirected more times than redirection_limit allows.",
              response, content)
      elif response.status in [200, 203] and method in ["GET", "HEAD"]:
        # Don't cache 206's since we aren't going to handle byte range
        # requests
        if 'content-location' not in response:
          response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)

    return (response, content)
    def OverrideRequest(self, conn, host, absolute_uri, request_uri, method,
                        body, headers, redirections, cachekey):
        """Do the actual request using the connection object.

        Also follow one level of redirects if necessary.

        The request is sent with self._conn_request. It is re-sent when the
        selected authorization asks for it, and again for each authorization
        produced by self._auth_from_challenge while the answer is a 401.
        Redirect responses are followed through self.request; 200/203
        answers to GET/HEAD are handed to httplib2._updateCache.

        Args:
          conn: connection object passed to self._conn_request; its class
            is forwarded as connection_type when a redirect is followed.
          host: host used to select the authorizations that are in scope.
          absolute_uri: absolute form of the request URI; relative Location
            headers are resolved against it and it is the default
            'content-location'.
          request_uri: URI passed to the connection and the authorizations.
          method: HTTP method name.
          body: request body; replaced by None when a 302/303 is followed.
          headers: request headers. 'if-none-match', 'if-modified-since'
            and (unless self.forward_authorization_headers is set)
            'authorization' are deleted in place before a redirect is
            followed.
          redirections: number of redirects that may still be followed.
          cachekey: cache key passed to httplib2._updateCache.

        Returns:
          A (response, content) tuple. After a followed redirect,
          response.previous holds a copy of the redirect response.

        Raises:
          httplib2.RedirectMissingLocation: a redirect other than 300 has
            no Location header.
          httplib2.RedirectLimit: a redirect should be followed but
            redirections is exhausted.
        """

        # Use the in-scope authorization that sorts first on (depth, auth).
        auths = [(auth.depth(request_uri), auth)
                 for auth in self.authorizations
                 if auth.inscope(host, request_uri)]
        auth = sorted(auths)[0][1] if auths else None
        if auth:
            auth.request(method, request_uri, headers, body)

        (response, content) = self._conn_request(conn, request_uri, method,
                                                 body, headers)

        # Re-send once when the authorization object asks for it.
        if auth and auth.response(response, body):
            auth.request(method, request_uri, headers, body)
            (response, content) = self._conn_request(
                conn, request_uri, method, body, headers)
            response._stale_digest = 1

        if response.status == 401:
            # Try the challenge-derived authorizations until one is
            # accepted; the accepted one is remembered for later requests.
            for authorization in self._auth_from_challenge(
                    host, request_uri, headers, response, content):
                authorization.request(method, request_uri, headers, body)
                (response, content) = self._conn_request(
                    conn, request_uri, method, body, headers)
                if response.status != 401:
                    self.authorizations.append(authorization)
                    authorization.response(response, body)
                    break

        if (self.follow_all_redirects or (method in ["GET", "HEAD"])
                or response.status == 303):
            if self.follow_redirects and response.status in [
                    300, 301, 302, 303, 307
            ]:
                # Pick out the location header and basically start from the beginning
                # remembering first to strip the ETag header and decrement our 'depth'
                if redirections:
                    if ('location' not in response
                            and response.status != 300):
                        raise httplib2.RedirectMissingLocation(
                            "Redirected but the response is missing a Location: header.",
                            response, content)
                    # Fix-up relative redirects (which violate an RFC 2616 MUST)
                    if 'location' in response:
                        location = response['location']
                        (_, authority, _, _, _) = parse_uri(location)
                        if authority is None:
                            response['location'] = urlparse.urljoin(
                                absolute_uri, location)
                    if response.status == 301 and method in ["GET", "HEAD"]:
                        response['-x-permanent-redirect-url'] = response[
                            'location']
                        if 'content-location' not in response:
                            response['content-location'] = absolute_uri
                        httplib2._updateCache(headers, response, content,
                                              self.cache, cachekey)
                    # The follow-up request must not be conditional on the
                    # old URI.
                    if 'if-none-match' in headers:
                        del headers['if-none-match']
                    if 'if-modified-since' in headers:
                        del headers['if-modified-since']
                    if ('authorization' in headers
                            and not self.forward_authorization_headers):
                        del headers['authorization']
                    if 'location' in response:
                        location = response['location']
                        # Keep a copy of the redirect response for
                        # response.previous.
                        old_response = copy.deepcopy(response)
                        if 'content-location' not in old_response:
                            old_response['content-location'] = absolute_uri
                        redirect_method = method
                        if response.status in [302, 303]:
                            # 302/303 are followed with a body-less GET.
                            redirect_method = "GET"
                            body = None
                        (response, content) = self.request(
                            location,
                            redirect_method,
                            body=body,
                            headers=headers,
                            redirections=redirections - 1,
                            connection_type=conn.__class__)
                        response.previous = old_response
                else:
                    raise httplib2.RedirectLimit(
                        "Redirected more times than redirection_limit allows.",
                        response, content)
            elif response.status in [200, 203] and method in ["GET", "HEAD"]:
                # Don't cache 206's since we aren't going to handle byte range
                # requests
                if 'content-location' not in response:
                    response['content-location'] = absolute_uri
                httplib2._updateCache(headers, response, content, self.cache,
                                      cachekey)

        return (response, content)