Esempio n. 1
0
class TorProxyMiddleware(object):
    '''Middleware that sends every request through the local Tor HTTP proxy.

    After ``max_count`` requests have been routed, a new Tor IP is requested
    from ``TorIpChanger`` before the next request is sent.
    '''
    def __init__(self, crawler: Crawler, max_count: int,
                 allow_reuse_ip_after: int):
        '''Creates a new instance of TorProxyMiddleware

        A first Tor IP is requested immediately, so construction performs
        network I/O through ``TorIpChanger``.

        Keywords arguments:
            crawler -- Crawler whose engine is paused while the IP changes
            max_count -- Maximum IP usage (requests routed per IP)
            allow_reuse_ip_after -- When an IP can be reused (forwarded to
                TorIpChanger as ``reuse_threshold``)
        '''

        self.crawler = crawler
        self.max_count = max_count

        self.tor_ip_changer = TorIpChanger(
            reuse_threshold=allow_reuse_ip_after)
        self.tor_ip_changer.get_new_ip()

        # Number of requests routed since the last IP change.
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        '''Builds the middleware from the crawler settings.

        Settings read:
            TOR_IPROTATOR_ENABLED -- must be true, otherwise NotConfigured
                is raised (default: False)
            TOR_IPROTATOR_CHANGE_AFTER -- requests per IP (default: 1000)
            TOR_IPROTATOR_ALLOW_REUSE_IP_AFTER -- reuse threshold
                (default: 10)
        '''
        settings = crawler.settings
        if not settings.getbool('TOR_IPROTATOR_ENABLED', False):
            raise NotConfigured()

        return cls(
            crawler=crawler,
            max_count=settings.getint('TOR_IPROTATOR_CHANGE_AFTER', 1000),
            allow_reuse_ip_after=settings.getint(
                'TOR_IPROTATOR_ALLOW_REUSE_IP_AFTER', 10))

    def process_request(self, request: Request, spider: Spider) -> None:
        '''Points the request at the local proxy, rotating the IP if due.

        Raises:
            Exception -- if TorIpChanger returns no new IP
        '''
        if self.items_scraped >= self.max_count:
            spider.log('Changing Tor IP...')
            self.items_scraped = 0

            self.crawler.engine.pause()
            try:
                new_ip = self.tor_ip_changer.get_new_ip()
            finally:
                # Always resume the engine: if get_new_ip() raises, leaving
                # the engine paused would stall the whole crawl.
                self.crawler.engine.unpause()

            if not new_ip:
                raise Exception('FatalError: Failed to find a new IP')

            spider.log(f'New Tor IP: {new_ip}')

        # http://127.0.0.1:8118 is the default address for Privoxy
        request.meta['proxy'] = 'http://127.0.0.1:8118'
        self.items_scraped += 1
Esempio n. 2
0
#  Python powered way to get a unique Tor IP

#  Docs:  https://pypi.org/project/toripchanger/

#  pip install toripchanger

# Basic example
from toripchanger import TorIpChanger

# NOTE: every example below rebinds `current_ip`, so after the script runs
# only the IP obtained by the last changer is still available.

# Tor IP reuse is prohibited.
tor_ip_changer_0 = TorIpChanger(reuse_threshold=0)
current_ip = tor_ip_changer_0.get_new_ip()

# Current Tor IP address can be reused after one other IP was used (default setting).
# NOTE(review): local_http_proxy presumably names the HTTP proxy placed in
# front of Tor -- confirm against the toripchanger documentation.
tor_ip_changer_1 = TorIpChanger(local_http_proxy='127.0.0.1:8888')
current_ip = tor_ip_changer_1.get_new_ip()

# Current Tor IP address can be reused after 5 other Tor IPs were used.
tor_ip_changer_5 = TorIpChanger(tor_address="localhost", reuse_threshold=5)
current_ip = tor_ip_changer_5.get_new_ip()
Esempio n. 3
0
class AntiblockDriver():
    """
    General implementation for anti-blocking procedures. The _send_request
    method should be used by subclasses to send a request with anti-blocking
    mechanisms in place. The other methods can be used for cases that require
    more flexibility.

    Supported mechanisms (all driven by the configuration dictionary given to
    the constructor): user-agent rotation, fixed/randomized download delay,
    autothrottle, IP rotation (Tor or a proxy list) and cookie injection.
    """
    def __validate_user_agent_config(self):
        """
        Validate the user-agent configurations, raising an error if necessary

        :raises ValueError: if the user-agent list is not a non-empty list, or
                            if the minimum usage is above the maximum usage
        :raises TypeError:  if the minimum or maximum usage is not a positive
                            integer
        """

        # Validate the list of user-agents
        if not isinstance(self.user_agent_list, list) or \
           not self.user_agent_list:
            raise ValueError(
                ('If user-agent rotation in enabled, a '
                 'non-empty list of user-agents must be supplied.'))

        # Validate the minimum UA usage
        if not isinstance(self.ua_rotate_min_usage, int) or \
           self.ua_rotate_min_usage <= 0:
            raise TypeError(('The minimum user-agent usage should be a '
                             'positive integer'))

        # Validate the maximum UA usage
        if not isinstance(self.ua_rotate_max_usage, int) or \
           self.ua_rotate_max_usage <= 0:
            raise TypeError(('The maximum user-agent usage should be a '
                             'positive integer'))

        # Validate the overall range of possible UA usage values
        if self.ua_rotate_min_usage > self.ua_rotate_max_usage:
            raise ValueError('The maximum user-agent usage should be '
                             'greater than the minimum usage.')

    def __validate_delay_config(self):
        """
        Validate the delay configurations, raising an error if necessary

        :raises ValueError: if the download delay is not a number >= 0
        """

        if not isinstance(self.download_delay, (int, float)) or \
           self.download_delay < 0:
            raise ValueError('The download delay should be a positive number.')

    def __validate_autothrottle_config(self):
        """
        Validate the autothrottle configurations, raising an error if
        necessary

        :raises ValueError: if the start or maximum delay is not a number >= 0
        """

        if not isinstance(self.at_start_delay, (int, float)) or \
                self.at_start_delay < 0:
            raise ValueError('The autothrottle start delay should be a '
                             'positive number.')
        if not isinstance(self.at_max_delay, (int, float)) or \
                self.at_max_delay < 0:
            raise ValueError('The autothrottle maximum delay should be a '
                             'positive number.')

    def __validate_ip_tor_config(self):
        """
        Validate the ip rotation configurations when using tor, raising an
        error if necessary

        :raises ValueError: if either setting is not an integer >= 0
        """

        if not isinstance(self.ip_change_after, int) or \
                self.ip_change_after < 0:
            raise ValueError('The number of times an IP can be used in '
                             'succession should be a positive integer.')
        if not isinstance(self.ip_reuse_after, int) or self.ip_reuse_after < 0:
            raise ValueError('The number of different IPs to be used before '
                             'repeating should be a positive number.')

    def __validate_ip_proxy_config(self):
        """
        Validate the ip rotation configurations when using proxies, raising an
        error if necessary

        :raises ValueError: if the proxy list is not a non-empty list
        """

        # An empty list is rejected here: rotating over zero proxies would
        # otherwise fail with a ZeroDivisionError on the first request.
        if not isinstance(self.proxy_list, list) or not self.proxy_list:
            raise ValueError('A valid list of proxies must be supplied.')

    def __validate_cookie_config(self):
        """
        Validate the cookie injection configurations, raising an error if
        necessary

        :raises ValueError: if the cookies setting is not a list
        """

        if not isinstance(self.cookies, list):
            raise ValueError('A valid list of cookies must be supplied.')

    def __setup_ip_rotation(self,
                            antiblock_config: Optional[Dict[str, Any]] = None):
        """
        Setup the configurations for the ip rotation

        :param antiblock_config: Dictionary of configuration parameters; the
                                 'iprotator_type' key is required and must be
                                 either 'tor' or 'proxy'

        :raises KeyError:   if 'iprotator_type' is missing
        :raises ValueError: if the rotation type is unknown or its settings
                            are invalid
        """
        if antiblock_config is None:
            antiblock_config = {}

        rot_type = antiblock_config["iprotator_type"]
        self.ip_rotation_type = rot_type
        if rot_type == 'tor':
            self.ip_change_after = antiblock_config\
                .get('tor_iprotator_change_after', 1)
            self.ip_reuse_after = antiblock_config\
                .get('tor_iprotator_allow_reuse_ip_after', 10)
            self.__validate_ip_tor_config()

            self.tor_controller = TorIpChanger(
                reuse_threshold=self.ip_reuse_after)
            self.tor_controller.get_new_ip()
        elif rot_type == 'proxy':
            self.proxy_list = antiblock_config.get('iprotator_proxy_list', [])
            self.__validate_ip_proxy_config()
        else:
            # Formatting (rather than concatenating) keeps this a ValueError
            # even when the supplied type is not a string.
            raise ValueError(f'Invalid ip rotation type: {rot_type}')

    def __init__(self, antiblock_config: Optional[Dict[str, Any]] = None):
        """
        Constructor for the generic antiblock driver.

        :param antiblock_config: Dictionary of configuration parameters for the
                                 antiblock measures. When omitted, every
                                 optional mechanism is disabled and the
                                 default download delay is used.

        :raises ValueError: if a configuration value is invalid
        :raises TypeError:  if a user-agent usage bound is not a positive
                            integer
        """
        if antiblock_config is None:
            antiblock_config = {}

        # Requests sent with the current user-agent / through the current IP
        self.ua_items_scraped = 0
        self.ip_items_scraped = 0

        self.ua_rotate = antiblock_config\
            .get('rotate_user_agent_enabled', False)

        if self.ua_rotate:
            self.user_agent_list = antiblock_config.get('user_agents', [])

            self.ua_rotate_min_usage = antiblock_config\
                .get('min_user_agent_usage', 1)
            self.ua_rotate_max_usage = antiblock_config\
                .get('max_user_agent_usage', self.ua_rotate_min_usage)

            # Validate before drawing the first usage limit, so that invalid
            # bounds are reported with the intended message instead of an
            # unrelated error raised by random.randint.
            self.__validate_user_agent_config()

            self.ua_rotate_limit_usage = random\
                .randint(self.ua_rotate_min_usage, self.ua_rotate_max_usage)

            self.user_agents = cycle(self.user_agent_list)
            self.user_agent = next(self.user_agents)

        # No request has been sent yet, so there is nothing to wait for
        self.time_last_request = None
        self.current_delay = 0

        self.download_delay = antiblock_config.get('download_delay', 0.25)
        self.randomize_delay = antiblock_config\
            .get('download_delay_randomize', True)
        self.__validate_delay_config()

        self.at_enabled = antiblock_config.get('autothrottle_enabled', False)
        if self.at_enabled:
            self.at_start_delay = antiblock_config\
                .get('autothrottle_start_delay', 5)
            self.at_max_delay = antiblock_config\
                .get('autothrottle_max_delay', 60)
            self.__validate_autothrottle_config()

        self.ip_rotate = antiblock_config.get('iprotator_enabled', False)
        if self.ip_rotate:
            self.__setup_ip_rotation(antiblock_config)

        self.insert_cookies = antiblock_config.get('insert_cookies', False)
        if self.insert_cookies:
            self.cookies = antiblock_config.get('cookies', [])
            self.__validate_cookie_config()

    def _generate_next_delay(self,
                             response_latency: float = 0,
                             last_status: int = 0):
        """
        Generates the value for the delay to be applied before doing the next
        request. The result is stored in self.current_delay.

        :param response_latency: time taken by the last request in seconds
        :param last_status:      HTTP status received from the last request
        """
        if self.at_enabled:
            # Autothrottle
            if self.current_delay is None or self.time_last_request is None:
                # First request: start from the configured initial delay
                self.current_delay = self.at_start_delay
            else:
                # Move halfway towards the latency of the last response
                next_delay = (response_latency + self.current_delay) / 2

                # Non-200 responses can't decrease the delay
                if last_status == 200 or next_delay > self.current_delay:
                    # Clamp delay between values supplied by the user
                    min_delay = self.download_delay
                    max_delay = self.at_max_delay
                    self.current_delay = max(min_delay,
                                             min(max_delay, next_delay))
        elif self.randomize_delay:
            # Normal delay, randomized between 50% and 150% of the base value
            self.current_delay = self.download_delay * \
                random.uniform(0.5, 1.5)
        else:
            # Normal fixed delay
            self.current_delay = self.download_delay

    def _get_current_user_agent(self) -> Optional[str]:
        """
        Get the current user agent to use, and apply the rotation if necessary

        :returns: A string representing the user-agent to use for the next
                  request, or None if user-agent rotation is disabled
        """
        if not self.ua_rotate:
            return None

        if self.ua_items_scraped >= self.ua_rotate_limit_usage:
            # Usage limit reached: move to the next user-agent and draw a new
            # random limit for it
            self.ua_items_scraped = 0
            self.ua_rotate_limit_usage = random.randint(
                self.ua_rotate_min_usage, self.ua_rotate_max_usage)

            self.user_agent = next(self.user_agents)

        self.ua_items_scraped += 1
        return self.user_agent

    def _apply_delay(self):
        """
        Wait for the configured amount of time, previously calculated by the
        _generate_next_delay method. Time already elapsed since the last
        request counts towards the delay.
        """

        if self.time_last_request is None:
            # Nothing was sent before, so there is nothing to wait for
            return

        elapsed = time.perf_counter() - self.time_last_request
        remaining = self.current_delay - elapsed
        if remaining > 0:
            # Wait for the remaining time
            time.sleep(remaining)

    def _generate_headers(self, headers: Optional[Dict[str, Any]] = None):
        """
        Generate the headers for the next request, with the correct user-agent
        value.

        :param headers: Dictionary of extra values to be included in the
                        header. It is copied, never modified in place.

        :returns: The headers for the next request, or None if there are none
        """

        # Work on a copy so that neither the caller's dictionary nor a shared
        # default is polluted with the rotating User-Agent value
        headers = dict(headers) if headers else {}

        user_agent = self._get_current_user_agent()

        if self.ua_rotate and user_agent is not None:
            headers['User-Agent'] = user_agent

        return headers if headers else None

    def _generate_proxies(self, proxies: Optional[Dict[str, Any]] = None):
        """
        Generate the proxies for the next request, considering the given list
        or the Tor configuration, if supplied.

        :param proxies: Dictionary of possible default values for the proxies,
                        returned unchanged when IP rotation is disabled

        :returns: The proxies to be used by the next request, or None if
                  there are none

        :raises Exception: if Tor rotation is due and no new IP is obtained
        """
        if self.ip_rotate:
            if self.ip_rotation_type == 'tor':
                if self.ip_items_scraped >= self.ip_change_after:
                    logging.info('Changing Tor IP...')
                    self.ip_items_scraped = 0

                    new_ip = self.tor_controller.get_new_ip()
                    if not new_ip:
                        raise Exception('FatalError: Failed to find a new IP')

                    logging.info('New Tor IP: %s', new_ip)

                proxies = {'http': '127.0.0.1:8118', 'https': '127.0.0.1:8118'}

            elif self.ip_rotation_type == 'proxy':
                # Round-robin over the configured proxies
                index = self.ip_items_scraped % len(self.proxy_list)
                proxy = self.proxy_list[index]
                proxies = {'http': proxy, 'https': proxy}

            self.ip_items_scraped += 1

        return proxies if proxies else None

    def _generate_cookies(self, cookies: Optional[Dict[str, Any]] = None):
        """
        Generate the cookies for the next request.

        :param cookies: Dictionary of extra cookies to be included. It is
                        copied, never modified in place; configured cookies
                        override entries with the same name.

        :returns: The cookies to be sent by the next request, or None if
                  there are none
        """
        merged = dict(cookies) if cookies else {}

        if self.insert_cookies:
            for configured in self.cookies:
                merged.update(configured)

        return merged if merged else None

    def _send_request(self, req_function: Callable, *args, **kwargs) -> Any:
        """
        Apply all configured anti-blocking mechanisms and call the request
        function supplied.

        :param req_function: The function to be called to actually send the
                             request. It should take at least three named
                             arguments: headers, proxies and cookies, which
                             represent the respective values to be inserted.
                             Any extra values passed to this method are
                             redirected to the req_function.

        :returns: The response received from the supplied function. It must
                  expose `elapsed.total_seconds()` and `status_code`, which
                  are used to compute the next delay.
        """

        # Take these three out of kwargs: they are replaced by the generated
        # values and must not be passed twice to req_function
        headers = self._generate_headers(kwargs.pop('headers', None))
        proxies = self._generate_proxies(kwargs.pop('proxies', None))
        cookies = self._generate_cookies(kwargs.pop('cookies', None))

        self._apply_delay()

        response = req_function(*args,
                                headers=headers,
                                proxies=proxies,
                                cookies=cookies,
                                **kwargs)

        # Calculate next delay value (must run before time_last_request is
        # updated, since the first autothrottle step keys off it being None)
        self._generate_next_delay(response.elapsed.total_seconds(),
                                  response.status_code)

        self.time_last_request = time.perf_counter()

        return response
Esempio n. 4
0
import time
from toripchanger import TorIpChanger

# Changer configured with an explicit password, port and local HTTP proxy.
# NOTE(review): 9051 and 127.0.0.1:8118 are presumably the Tor control port
# and a local Privoxy instance -- confirm against the local Tor setup.
ip_changer = TorIpChanger(tor_password='******',
                          tor_port=9051,
                          local_http_proxy='127.0.0.1:8118')

# Request a new IP ten times, printing each one and pausing 5 seconds
# between requests.
for i in range(10):
    new_ip = ip_changer.get_new_ip()

    # Plain string concatenation: this raises TypeError if get_new_ip()
    # ever returns something that is not a str.
    print("New IP: " + new_ip)

    time.sleep(5)