Beispiel #1
0
    def __next__(self):
        """Return the next entry of ``self.entries``, passed through ``safequote``.

        The lookup in ``self.entries`` and the increment of ``self.index``
        are performed while holding ``self.condition``; the quoting itself
        runs after the lock has been released.

        Raises:
            StopIteration: if ``self.index`` is past the end of
                ``self.entries``.
        """
        # Using ``with`` guarantees the lock is released on every exit path,
        # including unexpected exceptions raised by the lookup.
        # NOTE(review): assumes self.condition supports the context-manager
        # protocol, as threading.Lock/RLock/Condition do — confirm.
        with self.condition:
            try:
                path = self.entries[self.index]
            except IndexError:
                raise StopIteration

            self.index += 1

        return safequote(path)
Beispiel #2
0
 def __next__(self, base_path=None):
     """Return the next path, passed through ``safequote``.

     Delegates to ``self.next_with_index(base_path)``, which is expected to
     return a two-item result; the first item is discarded and the second
     is quoted and returned.
     """
     _unused_index, next_path = self.next_with_index(base_path)
     quoted_path = safequote(next_path)
     return quoted_path
Beispiel #3
0
    def __init__(
        self,
        url,
        max_pool=1,
        max_retries=5,
        timeout=20,
        ip=None,
        proxy=None,
        proxylist=None,
        redirect=False,
        request_by_hostname=False,
        httpmethod="get",
        data=None,
        scheme=None,
        random_agents=None,
    ):
        """Parse *url* and initialise the request settings on the instance.

        The URL is split into scheme, host, port and base path. When the URL
        carries no scheme, *scheme* is used if given; otherwise the scheme is
        derived from the port via ``self.get_scheme`` and, failing that,
        guessed by probing port 443.

        Args:
            url: Target URL, with or without a ``scheme://`` prefix.
            max_pool: Stored as ``self.max_pool``.
            max_retries: Stored as ``self.max_retries``.
            timeout: Stored as ``self.timeout``.
            ip: Stored as ``self.ip``.
            proxy: Stored as ``self.proxy``.
            proxylist: Stored as ``self.proxylist``.
            redirect: Stored as ``self.redirect``.
            request_by_hostname: Stored as ``self.request_by_hostname``.
            httpmethod: Stored as ``self.httpmethod``.
            data: Stored as ``self.data``.
            scheme: Scheme to assume when *url* does not contain ``://``.
            random_agents: Stored as ``self.random_agents``.

        Raises:
            RequestException: if the scheme is not http/https, or the port
                in the URL is not an integer.
        """
        self.httpmethod = httpmethod
        self.data = data
        self.headers = {}

        parsed = urlparse(url)

        # Without "://" the URL is re-parsed with the caller's scheme, or the
        # "unknown" placeholder so the scheme can be detected further below.
        if "://" not in url:
            parsed = urlparse("{0}://{1}".format(scheme or "unknown", url))

        # Strip a single leading slash from the path
        self.base_path = parsed.path
        if parsed.path.startswith("/"):
            self.base_path = parsed.path[1:]

        # Safe quote all special characters in base_path to prevent from being encoded
        self.base_path = safequote(self.base_path)

        netloc_parts = parsed.netloc.split(":")
        self.host = netloc_parts[0]

        port_for_scheme = {"http": 80, "https": 443, "unknown": 0}

        # Use parsed.scheme in the message: self.scheme is not assigned yet
        # at this point, so referencing it would raise AttributeError instead
        # of the intended RequestException.
        if parsed.scheme not in ("unknown", "https", "http"):
            raise RequestException("Unsupported URI scheme: {0}".format(
                parsed.scheme))

        # If no port specified, set default (80, 443, or 0 for "unknown")
        try:
            self.port = int(netloc_parts[1])
        except IndexError:
            self.port = port_for_scheme[parsed.scheme]
        except ValueError:
            raise RequestException("Invalid port number: {0}".format(
                netloc_parts[1]))

        # If no scheme is found, detect it by port number
        if parsed.scheme != "unknown":
            self.scheme = parsed.scheme
        else:
            self.scheme = self.get_scheme(self.port)

        # If the user neither provide the port nor scheme, guess them based
        # on standard website characteristics
        if not self.scheme:
            self.scheme = "https" if self.get_scheme(
                443) == "https" else "http"
            self.port = port_for_scheme[self.scheme]

        # Set the Host header explicitly.
        # NOTE(review): presumably needed because requests may be sent to an
        # IP address rather than the hostname — confirm against the code that
        # sends the request.
        self.headers["Host"] = self.host

        # Include port in Host header if it's non-standard
        if (self.scheme == "https"
                and self.port != 443) or (self.scheme == "http"
                                          and self.port != 80):
            self.headers["Host"] += ":{0}".format(self.port)

        self.max_retries = max_retries
        self.max_pool = max_pool
        self.timeout = timeout
        self.pool = None
        self.proxy = proxy
        self.proxylist = proxylist
        self.redirect = redirect
        self.random_agents = random_agents
        self.auth = None
        self.request_by_hostname = request_by_hostname
        self.ip = ip
        self.base_url = self.url = "{0}://{1}/".format(
            self.scheme,
            self.headers["Host"],
        )
Beispiel #4
0
    def __init__(
        self,
        url,
        max_pool=1,
        max_retries=5,
        timeout=20,
        ip=None,
        proxy=None,
        proxylist=None,
        redirect=False,
        request_by_hostname=False,
        httpmethod="get",
        data=None,
        scheme=None,
    ):
        """Parse *url*, resolve the target address and set up the session.

        Args:
            url: Target URL, with or without a ``scheme://`` prefix.
            max_pool: Stored as ``self.max_pool``.
            max_retries: Stored as ``self.max_retries``.
            timeout: Stored as ``self.timeout``.
            ip: Address to use instead of resolving the host name.
            proxy: Stored as ``self.proxy``; when set (and *ip* is not),
                the host name is not resolved locally.
            proxylist: Stored as ``self.proxylist``; same effect on name
                resolution as *proxy*.
            redirect: Stored as ``self.redirect``.
            request_by_hostname: When true, ``self.url`` is built from the
                host name rather than the IP address.
            httpmethod: Stored as ``self.httpmethod``.
            data: Stored as ``self.data``.
            scheme: Scheme to assume when *url* does not contain ``://``;
                falls back to ``http`` when not given.

        Raises:
            RequestException: if the scheme is not http/https, the host
                name cannot be resolved, or the port is not an integer.
        """
        self.httpmethod = httpmethod
        self.data = data
        self.headers = {}

        parsed = urlparse(url)

        # If no protocol specified, use the caller's scheme, or http by
        # default (formatting a None scheme would yield the bogus "None://").
        if "://" not in url:
            parsed = urlparse("{0}://{1}".format(scheme or "http", url))

        # Reject unsupported protocols, whether they came from the URL itself
        # or from the `scheme` argument
        if parsed.scheme not in ("https", "http"):
            raise RequestException({
                "message":
                "Unsupported URL scheme: {0}".format(parsed.scheme)
            })

        self.base_path = parsed.path
        if parsed.path.startswith("/"):
            self.base_path = parsed.path[1:]

        # Safe quote all special characters in base_path to prevent from being encoded
        self.base_path = safequote(self.base_path)
        self.protocol = parsed.scheme

        netloc_parts = parsed.netloc.split(":")
        self.host = netloc_parts[0]

        # Always define self.ip so later reads cannot raise AttributeError;
        # it stays falsy when a proxy is expected to resolve the name.
        self.ip = ip

        # Resolve DNS to decrease overhead.
        # A proxy could have a different DNS that would resolve the name.
        # Therefore, resolving the name when using a proxy just to raise an
        # error is pointless.
        if not ip and not proxy and not proxylist:
            try:
                self.ip = socket.gethostbyname(self.host)
            except socket.gaierror:
                # Check if hostname resolves to IPv6 address only
                try:
                    self.ip = socket.getaddrinfo(self.host, None,
                                                 socket.AF_INET6)[0][4][0]
                except socket.gaierror:
                    raise RequestException({"message": "Couldn't resolve DNS"})

        # If no port specified, set default (80, 443)
        try:
            self.port = int(netloc_parts[1])
        except IndexError:
            self.port = 443 if self.protocol == "https" else 80
        except ValueError:
            raise RequestException({
                "message":
                "Invalid port number: {0}".format(netloc_parts[1])
            })

        # Set the Host header, this will be overwritten if the user has already set the header
        self.headers["Host"] = self.host

        # Include port in Host header if it's non-standard
        if (self.protocol == "https"
                and self.port != 443) or (self.protocol == "http"
                                          and self.port != 80):
            self.headers["Host"] += ":{0}".format(self.port)

        self.max_retries = max_retries
        self.max_pool = max_pool
        self.timeout = timeout
        self.pool = None
        self.proxy = proxy
        self.proxylist = proxylist
        self.redirect = redirect
        self.random_agents = None
        self.auth = None
        self.request_by_hostname = request_by_hostname
        self.session = requests.Session()

        # Connect by host name when asked to, or when no IP is available
        # (name resolution is skipped when going through a proxy).
        # NOTE(review): an IPv6 address is inserted without brackets here —
        # confirm whether IPv6 targets need "[addr]" formatting.
        if self.request_by_hostname or not self.ip:
            connect_host = self.host
        else:
            connect_host = self.ip

        self.url = "{0}://{1}:{2}/".format(
            self.protocol,
            connect_host,
            self.port,
        )
        self.base_url = "{0}://{1}:{2}/".format(
            self.protocol,
            self.host,
            self.port,
        )
        self.set_adapter()