Ejemplo n.º 1
0
class TorProxyMiddleware(object):
    '''Downloader middleware that routes every request through a Tor proxy
    and rotates the Tor identity after a configured number of requests.'''

    def __init__(self, crawler: Crawler, max_count: int,
                 allow_reuse_ip_after: int):
        '''Creates a new instance of TorProxyMiddleware.

        Keyword arguments:
            max_count -- maximum usage count before the IP is rotated
            allow_reuse_ip_after -- how many distinct IPs must be used
                before an IP may be reused
        '''
        self.crawler = crawler
        self.max_count = max_count
        self.items_scraped = 0

        # Acquire an initial Tor identity up front.
        self.tor_ip_changer = TorIpChanger(
            reuse_threshold=allow_reuse_ip_after)
        self.tor_ip_changer.get_new_ip()

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        '''Builds the middleware from the crawler settings; disabled unless
        TOR_IPROTATOR_ENABLED is set.'''
        settings = crawler.settings
        if not settings.getbool('TOR_IPROTATOR_ENABLED', False):
            raise NotConfigured()

        return cls(
            crawler=crawler,
            max_count=settings.getint('TOR_IPROTATOR_CHANGE_AFTER', 1000),
            allow_reuse_ip_after=settings.getint(
                'TOR_IPROTATOR_ALLOW_REUSE_IP_AFTER', 10),
        )

    def process_request(self, request: Request, spider: Spider) -> None:
        '''Tags the request with the local proxy, rotating the Tor IP first
        when the usage threshold has been reached.'''
        if self.items_scraped >= self.max_count:
            spider.log('Changing Tor IP...')
            self.items_scraped = 0

            # The engine is paused while the Tor circuit is rebuilt.
            self.crawler.engine.pause()
            new_ip = self.tor_ip_changer.get_new_ip()
            self.crawler.engine.unpause()

            if not new_ip:
                raise Exception('FatalError: Failed to find a new IP')

            spider.log(f'New Tor IP: {new_ip}')

        # http://127.0.0.1:8118 is the default address for Privoxy
        request.meta['proxy'] = 'http://127.0.0.1:8118'
        self.items_scraped += 1
Ejemplo n.º 2
0
    def __init__(self, crawler: Crawler, max_count: int,
                 allow_reuse_ip_after: int):
        '''Creates a new instance of TorProxyMiddleware.

        Keyword arguments:
            max_count -- maximum usage count for a single IP
            allow_reuse_ip_after -- how many distinct IPs must be used
                before an IP may be reused
        '''
        self.crawler = crawler
        self.max_count = max_count
        self.items_scraped = 0

        # Fetch an initial Tor identity right away.
        self.tor_ip_changer = TorIpChanger(
            reuse_threshold=allow_reuse_ip_after)
        self.tor_ip_changer.get_new_ip()
Ejemplo n.º 3
0
    def __setup_ip_rotation(self, antiblock_config: Dict[str, Any] = {}):
        """
        Configure IP rotation ('tor' or 'proxy') from the supplied antiblock
        settings; raises ValueError for any other rotation type.
        """

        rotation = antiblock_config["iprotator_type"]
        self.ip_rotation_type = rotation

        if rotation == 'tor':
            self.ip_change_after = antiblock_config.get(
                'tor_iprotator_change_after', 1)
            self.ip_reuse_after = antiblock_config.get(
                'tor_iprotator_allow_reuse_ip_after', 10)
            self.__validate_ip_tor_config()

            # Grab a first Tor identity immediately.
            self.tor_controller = TorIpChanger(
                reuse_threshold=self.ip_reuse_after)
            self.tor_controller.get_new_ip()
            return

        if rotation == 'proxy':
            self.proxy_list = antiblock_config.get('iprotator_proxy_list', [])
            self.__validate_ip_proxy_config()
            return

        raise ValueError('Invalid ip rotation type: ' + rotation)
Ejemplo n.º 4
0
from flask import jsonify
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import app_factory


# Flask application produced by the project's shared factory helper.
app = app_factory(__name__)


# Global IP store (using only specific `TorIpChanger` functionality).
# Only the internal bookkeeping methods of TorIpChanger are used below;
# no Tor control connection is opened by this module.
IPSTORE = TorIpChanger(reuse_threshold=Config.IPSTORE_REUSE_THRESHOLD)


@app.route("/ip-is-safe/<ip>/")
def ip_is_safe(ip):
    """Report whether `ip` is safe to use and record it when it is."""
    is_safe = IPSTORE._ip_is_safe(ip)
    if is_safe:
        # Remember the IP so it is not considered safe again too soon.
        IPSTORE._manage_used_ips(ip)

    return jsonify({"safe": is_safe})


if __name__ == "__main__":
    # Development entry point; binds on all interfaces at the configured port.
    app.run(host="0.0.0.0", port=Config.IPSTORE_PORT)
Ejemplo n.º 5
0
                                 month=random.randint(1, 12),
                                 day=random.randint(1, 28))
        delta = datetime.timedelta(days=random.randint(8, 260))
        beign_date = end_date - delta
        timeframe = f"{beign_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}"
        return timeframe

    while True:
        loc = random.choice(geo_list)
        search = random.choice(kw)
        try:
            current_ip = tor_ip_changer.get_new_ip()
        except:
            pass
        pytrends = TrendReq()
        print(loc)
        print(search)
        pytrends.build_payload(search,
                               cat=0,
                               timeframe=random_timeframe(),
                               geo=loc)
        df = pytrends.interest_over_time()


if __name__ == "__main__":

    # The Tor control password is masked ('******') in this example; traffic
    # is proxied through a local HTTP proxy on 127.0.0.1:8118.
    tor_ip_changer = TorIpChanger(tor_password='******',
                                  tor_port=9051,
                                  local_http_proxy='127.0.0.1:8118')
    random_query()
Ejemplo n.º 6
0
# author = 'BlackSesion'
import base64
import json
import random
import re
import traceback
import urllib
import urllib2

import sys
from scrapy.exceptions import IgnoreRequest
from scrapy.conf import settings
from toripchanger import TorIpChanger

# A Tor IP will be reused only after 10 different IPs were used.
# Module-level singleton shared by the middlewares defined below.
ip_changer = TorIpChanger(reuse_threshold=10)


class RandomUserAgentMiddleware(object):
    """Downloader middleware that picks a random User-Agent per request."""

    def process_request(self, request, spider):
        # Only set the header if a user agent was actually drawn.
        agent = random.choice(settings.get('USER_AGENT_LIST'))
        if agent:
            request.headers.setdefault('User-Agent', agent)


class ProxyMiddleware(object):
    # overwrite process request
    _requests_count = 0
    _requests_count_x_ip = 10

    def process_request(self, request, spider):
Ejemplo n.º 7
0
from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.scrapers.examplescraper2.custom_pipeline import (
    ExhaustApiLimitPipeLine,
)  # noqa
from scrapemeagain.scrapers.examplescraper2.scraper import ExampleScraper2
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents


# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=0,  # We need to remember all exhausted IPs.
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents()

# Configure logging.
setup_logging(logger_name="example-scraper2")


# Prepare the scraping pipeline.
# The databaser persists scraped rows; the pipeline wires scraper, storage
# and Tor IP rotation together.
scraper = ExampleScraper2()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = ExhaustApiLimitPipeLine(scraper, databaser, tor_ip_changer)
Ejemplo n.º 8
0
 def set_new_ip(self):
     """Request a fresh Tor identity and return the new IP address."""
     changer = TorIpChanger(
         reuse_threshold=0,
         tor_password='******',
         tor_port=9051,
         local_http_proxy=self.settings.get('HTTP_PROXY'))
     return changer.get_new_ip()
Ejemplo n.º 9
0
from stem.util.log import get_logger

# logger = get_logger()
# logger.propagate = False

# Default settings.
REUSE_THRESHOLD = 1
LOCAL_HTTP_PROXY = "127.0.0.1:8118"  # Privoxy's default listen address
NEW_IP_MAX_ATTEMPTS = 10
TOR_PASSWORD = "******"  # masked; replace with the real control password
TOR_ADDRESS = "127.0.0.1"
TOR_PORT = 9051  # Tor control port
POST_NEW_IP_SLEEP = 0.5

# Module-level changer shared by the middleware below.
ip_changer = TorIpChanger(reuse_threshold=REUSE_THRESHOLD,
                          tor_password=TOR_PASSWORD,
                          tor_port=TOR_PORT,
                          local_http_proxy=LOCAL_HTTP_PROXY)


# Send "Change IP" signal to tor control port
class TorMiddleware(object):
    def __init__(self):
        """Open an authenticated connection to the Tor control port."""
        self.settings = get_project_settings()
        self._requests_count = 0

        # Connect and authenticate against the local Tor control socket.
        self.controller = Controller.from_port(address=TOR_ADDRESS,
                                               port=TOR_PORT)
        self.controller.authenticate(password=TOR_PASSWORD)

    def set_new_ip(self):
        return TorIpChanger(
            reuse_threshold=0,
Ejemplo n.º 10
0
from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.pipeline import Pipeline
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

from examplescraper.scraper import ExampleScraper


# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=Config.REUSE_THRESHOLD,
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents(__file__)

# Configure logging.
setup_logging(logger_name="example-scraper")


# Prepare the scraping pipeline.
# The pipeline wires together scraping, persistence and Tor IP rotation.
scraper = ExampleScraper()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = Pipeline(scraper, databaser, tor_ip_changer)
Ejemplo n.º 11
0
#https://stackoverflow.com/questions/43942689/error-while-receiving-a-control-message-socketclosed-empty-socket-content-i
from stem.util.log import get_logger
logger = get_logger()
logger.propagate = False

# https://stackoverflow.com/questions/45009940/scrapy-with-privoxy-and-tor-how-to-renew-ip/45010141
from toripchanger import TorIpChanger
from stem import Signal
from stem.control import Controller
# password handling
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Without this call, load_dotenv is imported but never used and
# os.getenv('TOR_PASS') only sees variables already exported in the shell.
load_dotenv()

# TOR
TOR_PASSWORD = os.getenv('TOR_PASS')
# A Tor IP will be reused only after 10 different IPs were used.
ip_changer = TorIpChanger(tor_password=TOR_PASSWORD, reuse_threshold=10)


class ProxyMiddleware(object):
    """
    Downloader middleware for anonymous scraping through Tor.

    Reference material:
    Scrapy + Tor + Privoxy + user agents example project:
    https://github.com/WiliTest/Anonymous-scrapping-Scrapy-Tor-Privoxy-UserAgent
    Setting up Tor for the first time on Linux:
    https://jarroba.com/anonymous-scraping-by-tor-network/
    Configuring torrc for the first time:
    https://2019.www.torproject.org/docs/faq#torrc
    About TorIpChanger:
    https://gist.github.com/DusanMadar/8d11026b7ce0bce6a67f7dd87b999f6b
    """
    # Requests issued so far; presumably used further down (outside this
    # view) to decide when to rotate the IP -- TODO confirm.
    _requests_count = 0
Ejemplo n.º 12
0
#  Python powered way to get a unique Tor IP

#  Docs:  https://pypi.org/project/toripchanger/

#  pip install toripchanger

# Basic example
from toripchanger import TorIpChanger

# Tor IP reuse is prohibited.
tor_ip_changer_0 = TorIpChanger(reuse_threshold=0)
current_ip = tor_ip_changer_0.get_new_ip()

# Current Tor IP address can be reused after one other IP was used (default setting).
# Here requests go through a non-default local HTTP proxy on port 8888.
tor_ip_changer_1 = TorIpChanger(local_http_proxy='127.0.0.1:8888')
current_ip = tor_ip_changer_1.get_new_ip()

# Current Tor IP address can be reused after 5 other Tor IPs were used.
tor_ip_changer_5 = TorIpChanger(tor_address="localhost", reuse_threshold=5)
current_ip = tor_ip_changer_5.get_new_ip()
Ejemplo n.º 13
0
class AntiblockDriver():
    """
    General implementation for anti-blocking procedures. The _send_request
    method should be used by subclasses to send a request with anti-blocking
    mechanisms in place. The other methods can be used for cases that require
    more flexibility.
    """
    def __validate_user_agent_config(self):
        """
        Validate the user-agent configurations, raising an error if necessary
        """

        # Validate the list of user-agents
        if not isinstance(self.user_agent_list, list) or \
           len(self.user_agent_list) == 0:
            raise ValueError(
                ('If user-agent rotation in enabled, a '
                 'non-empty list of user-agents must be supplied.'))

        # Validate the minimum UA usage
        if not isinstance(self.ua_rotate_min_usage, int) or \
           self.ua_rotate_min_usage <= 0:
            raise TypeError(('The minimum user-agent usage should be a '
                             'positive integer'))

        # Validate the maximum UA usage
        if not isinstance(self.ua_rotate_max_usage, int) or \
           self.ua_rotate_max_usage <= 0:
            raise TypeError(('The maximum user-agent usage should be a '
                             'positive integer'))

        # Validate the overall range of possible UA usage values
        if self.ua_rotate_min_usage > self.ua_rotate_max_usage:
            raise ValueError('The maximum user-agent usage should be '
                             'greater than the minimum usage.')

    def __validate_delay_config(self):
        """
        Validate the delay configurations, raising an error if necessary
        """

        if not isinstance(self.download_delay, (int, float)) or \
           self.download_delay < 0:
            raise ValueError('The download delay should be a positive number.')

    def __validate_autothrottle_config(self):
        """
        Validate the autothrottle configurations, raising an error if
        necessary
        """

        if not isinstance(self.at_start_delay, (int, float)) or \
                self.at_start_delay < 0:
            raise ValueError('The autothrottle start delay should be a '
                             'positive number.')
        if not isinstance(self.at_max_delay, (int, float)) or \
                self.at_max_delay < 0:
            raise ValueError('The autothrottle maximum delay should be a '
                             'positive number.')

    def __validate_ip_tor_config(self):
        """
        Validate the ip rotation configurations when using tor, raising an
        error if necessary
        """

        if not isinstance(self.ip_change_after, int) or \
                self.ip_change_after < 0:
            raise ValueError('The number of times an IP can be used in '
                             'succession should be a positive integer.')
        if not isinstance(self.ip_reuse_after, int) or self.ip_reuse_after < 0:
            raise ValueError('The number of different IPs to be used before '
                             'repeating should be a positive number.')

    def __validate_ip_proxy_config(self):
        """
        Validate the ip rotation configurations when using proxies, raising an
        error if necessary
        """

        # An empty proxy list would cause a ZeroDivisionError later when the
        # round-robin index is computed, so it is rejected here as well.
        if not isinstance(self.proxy_list, list) or len(self.proxy_list) == 0:
            raise ValueError('A valid list of proxies must be supplied.')

    def __validate_cookie_config(self):
        """
        Validate the cookie injection configurations, raising an error if
        necessary
        """

        if not isinstance(self.cookies, list):
            raise ValueError('A valid list of cookies must be supplied.')

    def __setup_ip_rotation(self,
                            antiblock_config: Optional[Dict[str, Any]] = None):
        """
        Setup the configurations for the ip rotation

        :param antiblock_config: Dictionary of configuration parameters
                                 (None is treated as an empty dictionary)
        """
        # None (not {}) is the default to avoid a shared mutable default.
        antiblock_config = antiblock_config if antiblock_config else {}

        rot_type = antiblock_config["iprotator_type"]
        self.ip_rotation_type = rot_type
        if rot_type == 'tor':
            self.ip_change_after = antiblock_config\
                .get('tor_iprotator_change_after', 1)
            self.ip_reuse_after = antiblock_config\
                .get('tor_iprotator_allow_reuse_ip_after', 10)
            self.__validate_ip_tor_config()

            self.tor_controller = TorIpChanger(
                reuse_threshold=self.ip_reuse_after)
            self.tor_controller.get_new_ip()
        elif rot_type == 'proxy':
            self.proxy_list = antiblock_config.get('iprotator_proxy_list', [])
            self.__validate_ip_proxy_config()
        else:
            raise ValueError('Invalid ip rotation type: ' + rot_type)

    def __init__(self, antiblock_config: Optional[Dict[str, Any]] = None):
        """
        Constructor for the generic antiblock driver.

        :param antiblock_config: Dictionary of configuration parameters for the
                                 antiblock measures (None enables nothing)
        """
        # Mutable default arguments are shared between calls; the None
        # default is normalized into a fresh dictionary instead.
        antiblock_config = antiblock_config if antiblock_config else {}

        self.ua_items_scraped = 0
        self.ip_items_scraped = 0

        self.ua_rotate = antiblock_config\
            .get('rotate_user_agent_enabled', False)

        if self.ua_rotate:
            self.user_agent_list = antiblock_config.get('user_agents', [])

            self.ua_rotate_min_usage = antiblock_config\
                .get('min_user_agent_usage', 1)
            self.ua_rotate_max_usage = antiblock_config\
                .get('max_user_agent_usage', self.ua_rotate_min_usage)

            # Number of items to scrape before the next user-agent switch
            self.ua_rotate_limit_usage = random\
                .randint(self.ua_rotate_min_usage, self.ua_rotate_max_usage)

            self.__validate_user_agent_config()

            self.user_agents = cycle(self.user_agent_list)
            self.user_agent = next(self.user_agents)

        self.time_last_request = None
        self.current_delay = None
        self.download_delay = antiblock_config.get('download_delay', 0.25)
        self.randomize_delay = antiblock_config\
            .get('download_delay_randomize', True)
        self.__validate_delay_config()

        self.at_enabled = antiblock_config.get('autothrottle_enabled', False)
        if self.at_enabled:
            self.at_start_delay = antiblock_config\
                .get('autothrottle_start_delay', 5)
            self.at_max_delay = antiblock_config\
                .get('autothrottle_max_delay', 60)
            self.__validate_autothrottle_config()

        # Start without any delay; _generate_next_delay updates this value.
        self.current_delay = 0

        self.ip_rotate = antiblock_config.get('iprotator_enabled', False)
        if self.ip_rotate:
            self.__setup_ip_rotation(antiblock_config)

        self.insert_cookies = antiblock_config.get('insert_cookies', False)
        if self.insert_cookies:
            self.cookies = antiblock_config.get('cookies', [])
            self.__validate_cookie_config()

    def _generate_next_delay(self,
                             response_latency: float = 0,
                             last_status: int = 0):
        """
        Generates the value for the delay to be applied before doing the next
        request.

        :param response_latency: time taken by the last request in seconds
        :param last_status:      HTTP status received from the last request
        """
        if self.at_enabled:
            # Autothrottle: average the last latency with the current delay
            if self.current_delay is None or self.time_last_request is None:
                self.current_delay = self.at_start_delay
            else:
                next_delay = (response_latency + self.current_delay) / 2

                # Non-200 responses can't decrease the delay
                if last_status == 200 or next_delay > self.current_delay:
                    # Clamp delay between values supplied by the user
                    min_delay = self.download_delay
                    max_delay = self.at_max_delay
                    clamped = max(min_delay, min(max_delay, next_delay))

                    self.current_delay = clamped
        else:
            # Normal delay, optionally randomized within +/- 50%
            if self.randomize_delay:
                self.current_delay = self.download_delay * \
                    random.uniform(0.5, 1.5)
            else:
                self.current_delay = self.download_delay

    def _get_current_user_agent(self) -> Optional[str]:
        """
        Get the current user agent to use, and apply the rotation if necessary

        :returns: A string representing the user-agent to use for the next
                  request, or None if user-agent rotation is disabled
        """
        if self.ua_rotate:
            if self.ua_items_scraped >= self.ua_rotate_limit_usage:
                self.ua_items_scraped = 0
                self.ua_rotate_limit_usage = random.randint(
                    self.ua_rotate_min_usage, self.ua_rotate_max_usage)

                self.user_agent = next(self.user_agents)

            self.ua_items_scraped += 1
            return self.user_agent
        else:
            return None

    def _apply_delay(self):
        """
        Wait for the configured amount of time, previously calculated by the
        _generate_next_delay method.
        """

        last_req = self.time_last_request
        elapsed = None
        if last_req is None:
            elapsed = self.current_delay
        else:
            elapsed = time.perf_counter() - self.time_last_request

        if self.time_last_request is None or elapsed < self.current_delay:
            # Wait for the remaining time
            remaining = self.current_delay - elapsed
            time.sleep(remaining)

    def _generate_headers(self, headers: Optional[Dict[str, Any]] = None):
        """
        Generate the headers for the next request, with the correct user-agent
        value.

        :param headers: Dictionary of extra values to be included in the header

        :returns: The headers for the next request, or None if they are empty
        """

        # Copy the input instead of mutating it: the previous version used a
        # shared mutable default ({}) and wrote the User-Agent into it, which
        # leaked headers between calls and mutated caller-owned dictionaries.
        headers = dict(headers) if headers else {}

        user_agent = self._get_current_user_agent()

        if self.ua_rotate and user_agent is not None:
            headers['User-Agent'] = user_agent

        return headers if bool(headers) else None

    def _generate_proxies(self, proxies: Optional[Dict[str, Any]] = None):
        """
        Generate the proxies for the next request, considering the given list
        or the Tor configuration, if supplied.

        :param proxies: Dictionary of possible default values for the proxies

        :returns: The proxies to be used by the next request, or None if empty
        """
        proxies = dict(proxies) if proxies else {}

        if self.ip_rotate:
            if self.ip_rotation_type == 'tor':
                if self.ip_items_scraped >= self.ip_change_after:
                    logging.info('Changing Tor IP...')
                    self.ip_items_scraped = 0

                    new_ip = self.tor_controller.get_new_ip()
                    if not new_ip:
                        raise Exception('FatalError: Failed to find a new IP')

                    logging.info(f'New Tor IP: {new_ip}')

                # 127.0.0.1:8118 is the default address for Privoxy
                proxies = {'http': '127.0.0.1:8118', 'https': '127.0.0.1:8118'}

            elif self.ip_rotation_type == 'proxy':
                # Round-robin over the configured (validated non-empty) list
                proxy_len = len(self.proxy_list)
                proxies = {
                    'http': self.proxy_list[self.ip_items_scraped % proxy_len],
                    'https': self.proxy_list[self.ip_items_scraped % proxy_len]
                }
            self.ip_items_scraped += 1

        return proxies if bool(proxies) else None

    def _generate_cookies(self, cookies: Optional[Dict[str, Any]] = None):
        """
        Generate the cookies for the next request.

        :param cookies: Dictionary of extra cookies to be included

        :returns: The cookies to be sent by the next request, or None if empty
        """
        cookies = dict(cookies) if cookies else {}

        if self.insert_cookies:
            for extra in self.cookies:
                cookies = {**cookies, **extra}

        return cookies if bool(cookies) else None

    def _send_request(self, req_function: Callable, *args, **kwargs) -> Any:
        """
        Apply all configured anti-blocking mechanisms and call the request
        function supplied.

        :param req_function: The function to be called to actually send the
                             request. It should take at least three named
                             arguments: headers, proxies and cookies, which
                             represent the respective values to be inserted.
                             Any extra values passed to this method are
                             redirected to the req_function.

        :returns: The response received from the supplied function
        """

        # pop() both reads and removes each key so the values are not
        # forwarded twice to req_function.
        headers = self._generate_headers(kwargs.pop('headers', None))
        proxies = self._generate_proxies(kwargs.pop('proxies', None))
        cookies = self._generate_cookies(kwargs.pop('cookies', None))

        self._apply_delay()

        response = req_function(headers=headers,
                                proxies=proxies,
                                cookies=cookies,
                                *args,
                                **kwargs)

        # Calculate next delay value
        self._generate_next_delay(response.elapsed.total_seconds(),
                                  response.status_code)

        self.time_last_request = time.perf_counter()

        return response
Ejemplo n.º 14
0
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        """No-op exception hook: returning None lets the remaining
        downloader middlewares keep processing the exception."""
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        """Log the spider's name when it starts."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


# A Tor IP will be reused only after 10 different IPs were used.
# NOTE(review): reuse_threshold is actually set to 300 here, not 10;
# the password is masked ('******') in this example.
ip_changer = TorIpChanger(tor_password='******',
                          tor_port=9051,
                          reuse_threshold=300)


class ProxyMiddleware(object):
    """Forces a new Tor identity, then routes the request through the local
    Privoxy proxy."""

    def process_request(self, request, spider):
        # Rotate the Tor exit node before every request.
        ip_changer.get_new_ip()

        proxy_url = 'http://127.0.0.1:8118'
        request.meta['proxy'] = proxy_url
        spider.log('Proxy : %s' % proxy_url)
Ejemplo n.º 15
0
import time
from toripchanger import TorIpChanger

# Control password is masked ('******'); requests go through Privoxy.
ip_changer = TorIpChanger(tor_password='******',
                          tor_port=9051,
                          local_http_proxy='127.0.0.1:8118')

# Request ten fresh Tor identities, pausing between rotations.
for i in range(10):
    new_ip = ip_changer.get_new_ip()

    print("New IP: " + new_ip)

    time.sleep(5)