Esempio n. 1
0
 def __init__(self, crawler):
     """Initialise bookkeeping and split the configured proxy URL.

     Args:
         crawler: Object exposing ``settings.get``; the ``PROXY_URL``
             setting is read from it and parsed into its components.
     """
     self.crawler = crawler
     # Missing keys read as 0 for bans and as None for saved delays.
     self._bans = defaultdict(int)
     self._saved_delays = defaultdict(lambda: None)
     parsed = _parse_proxy(self.crawler.settings.get('PROXY_URL'))
     self.proxy_type, self.user, self.password, self.hostport = parsed
Esempio n. 2
0
 def setUp(self):
     """Store a proxy configuration fixture on ``self.config``.

     The 'https' entry is a ``ProxyConf`` named tuple built from the
     parsed proxy URL; 'no_proxy' is left as a comma-separated string.
     """
     ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
     https_conf = ProxyConf(*_parse_proxy('https://*****:*****@host:3128'))
     self.config = {
         'https': https_conf,
         'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
     }
Esempio n. 3
0
def parse_proxy(proxy_ip: str) -> str:
    """Return the scheme of a proxy address.

    Args:
        proxy_ip: Proxy address to inspect.

    Returns:
        The first element of the tuple produced by ``_parse_proxy``,
        i.e. the proxy scheme.
    """
    return _parse_proxy(proxy_ip)[0]
Esempio n. 4
0
def get_proxy(auth_encoding, url: str, orig_type: str) -> Tuple[bytes, str]:
    """Split a proxy URL into credentials and a credential-free proxy URL.

    Args:
        auth_encoding: Passed through to ``basic_auth_header``.
        url: Proxy URL, optionally carrying ``user:password@``.
        orig_type: Scheme to use when ``url`` itself has none.

    Returns:
        ``(credentials, proxy_url)``. ``credentials`` is the result of
        ``basic_auth_header`` when the URL has a user, otherwise ``None``.
    """
    scheme, username, secret, netloc = _parse_proxy(url)
    # Rebuild the URL from scheme and host:port only.
    components = (scheme or orig_type, netloc, '', '', '', '')
    proxy_url: str = urlunparse(components)
    if username:
        credentials: bytes = basic_auth_header(username, secret, auth_encoding)
    else:
        credentials = None
    return credentials, proxy_url
Esempio n. 5
0
def check_proxy(proxy_ip: str, proxy_user_name: str, proxy_pwd: str,
                timeout: float = 10.0) -> bool:
    """Check whether a single proxy works, using a synchronous request.

    The proxy is used to fetch ``http://httpbin.org/ip``; it is considered
    working when the reported origin equals the proxy's own host.

    Args:
        proxy_ip: Proxy URL, optionally with embedded ``user:password@``.
        proxy_user_name: User name to apply when ``proxy_ip`` carries no
            credentials of its own.
        proxy_pwd: Password that goes with ``proxy_user_name``.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        ``True`` if the origin reported by the endpoint matches the proxy
        host, ``False`` otherwise or when the proxy fails or times out.
    """
    url = "http://httpbin.org/ip"
    # The setting is stored as a Python-style dict string; swap the quotes
    # so it can be read as JSON.
    headers = json.loads(
        get_scrapy_settings("DEFAULT_REQUEST_HEADERS").replace(r"'", '"'))
    proxy_type, username, _password, hostport = _parse_proxy(proxy_ip)

    if username or not proxy_user_name:
        # Credentials are already part of proxy_ip, or none were supplied:
        # use the address exactly as given.
        proxy_url = proxy_ip
    else:
        # Rebuild the proxy URL with the separately supplied credentials.
        proxy_url = f"{proxy_type}://{proxy_user_name}:{proxy_pwd}@{hostport}"

    try:
        body = requests.get(url, headers=headers,
                            proxies={proxy_type: proxy_url},
                            timeout=timeout).text
    except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
        return False

    # Host is everything before the last ':'; a hostport without a port is
    # used whole instead of crashing.
    proxy_host = hostport.rsplit(":", 1)[0]
    return json.loads(body).get("origin") == proxy_host
Esempio n. 6
0
    def _get_proxy(self, url, orig_type=''):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type
                                or orig_type, hostport, '', '', '', ''))

        creds = self._basic_auth_header(user, password) if user else None

        return creds, proxy_url
Esempio n. 7
0
    def test_set_tunnel_is_not_called_when_socks(self, mock_set_tunnel):
        """Creating an HTTPS connection with a socks5 proxy must not call set_tunnel."""
        ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
        socks_proxy = ProxyConf(
            *_parse_proxy('socks5://username:password@host:3128'))
        self.config = {'https': socks_proxy}

        ProxyAwareHTTPSConnection(self.config, 'example.com')

        self.assertEqual(mock_set_tunnel.call_count, 0)
Esempio n. 8
0
 def _get_proxy(self, url, orig_type):
     proxy_type, user, password, hostport = _parse_proxy(url)
     proxy_url = urlunparse((proxy_type
                             or orig_type, hostport, '', '', '', ''))
     if user:
         print('_get_proxy', user)
     else:
         creds = None
     return creds, proxy_url
Esempio n. 9
0
    def test_raises_exception_when_invalid_socks_scheme(self, mock_socks):
        """Connecting through a proxy with the 'socks6' scheme must raise TypeError."""
        ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
        bad_proxy = ProxyConf(
            *_parse_proxy('socks6://socks_user:socks_pass@socks_host:3128'))
        self.config = {
            'https': bad_proxy,
            'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
        }

        connection = ProxyAwareHTTPSConnection(
            self.config, 'example.com', context=Mock())

        self.assertRaises(TypeError, connection.connect)
Esempio n. 10
0
    def _sanitise_proxy_config(self, proxy_config):
        """Parse the proxy configuration into something more usable."""
        conf = namedtuple('ProxyConf', 'scheme username password hostport')

        for proxy_type in ('http', 'https'):
            # Parse the upstream proxy URL into (scheme, username, password, hostport)
            # for ease of access.
            if proxy_config.get(proxy_type) is not None:
                proxy_config[proxy_type] = conf(
                    *_parse_proxy(proxy_config[proxy_type]))

        return proxy_config
Esempio n. 11
0
    def _sanitise_proxy_config(self, proxy_config):
        """Parse the proxy configuration into something more usable."""
        for proxy_type in ('http', 'https'):
            # Parse the upstream proxy URL into (scheme, user, password, hostport)
            # for ease of access.
            if proxy_config.get(proxy_type) is not None:
                proxy_config[proxy_type] = _parse_proxy(
                    proxy_config[proxy_type])

        if proxy_config:
            proxy_config['no_proxy'] = [host.strip() for host in proxy_config.get('no_proxy', '').split(',')
                                        if host]

        return proxy_config
Esempio n. 12
0
 def process_response(self, request, response, spider):
     """Count failures per proxy and drop a proxy that failed too often.

     A response with status >= 400 increments the failure counter of the
     proxy found in ``request.meta['proxy']``. Once that counter reaches
     ``self.max_failed``, the first entry of ``self.proxies`` whose
     host:port the current proxy ends with is removed.

     Returns:
         The response, unchanged.
     """
     cur_proxy = request.meta['proxy']
     if response.status >= 400:
         self.stats[cur_proxy] += 1
         logger.info('%s failed %s' % (cur_proxy, self.stats[cur_proxy]))
     if self.stats[cur_proxy] >= self.max_failed:
         # Find the first configured proxy sharing host:port with cur_proxy.
         exhausted = next(
             (candidate for candidate in self.proxies
              if cur_proxy.endswith(_parse_proxy(candidate)[-1])),
             None,
         )
         if exhausted is not None:
             self.proxies.remove(exhausted)
             logger.warning('proxy %s removed from proxies.' %
                            cur_proxy)
     return response
Esempio n. 13
0
    def process_response(self, request, response, spider):
        """Track failing proxies and remove one that reached the failure limit.

        The proxy from ``request.meta['proxy']`` is printed, its failure
        counter is incremented for status >= 400, and once the counter
        reaches ``self.max_failed`` the first entry of ``self.proxies``
        with a matching host:port is removed.

        Returns:
            The response, unchanged.
        """
        cur_proxy = request.meta['proxy']
        print(cur_proxy)
        failed = response.status >= 400
        if failed:
            self.stats[cur_proxy] += 1

        limit_reached = self.stats[cur_proxy] >= self.max_failed
        if limit_reached:
            for candidate in self.proxies:
                hostport = _parse_proxy(candidate)[-1]
                if not cur_proxy.endswith(hostport):
                    continue
                self.proxies.remove(candidate)
                logger.info('proxy {ip} remove from proxy list.'.format(ip=cur_proxy))
                break

        return response
Esempio n. 14
0
def get_upstream_proxy(options):
    """Build the upstream proxy configuration from options and environment.

    Values are first read from the environment variables HTTP_PROXY,
    HTTPS_PROXY and NO_PROXY. The entries of ``options['proxy']`` are then
    applied on top, so an explicit option takes precedence over the
    corresponding environment variable.

    The configuration is returned as a dictionary with keys 'http',
    'https' and 'no_proxy'. The value of the 'http' and 'https' keys will
    be a named tuple with the attributes:
        scheme, username, password, hostport
    The value of 'no_proxy' will be a list of host names with surrounding
    whitespace removed and blank entries dropped.

    Note that the keys will only be present in the dictionary when relevant
    proxy configuration exists, and that the 'proxy' key is removed
    (popped) from ``options``.

    Args:
        options: The selenium wire options (a dictionary or ``None``).
    Returns: A dictionary.
    """
    # ``or {}`` also covers an explicit ``'proxy': None`` entry.
    proxy_options = (options or {}).pop('proxy', None) or {}

    merged = {}
    env_sources = (
        ('http', 'HTTP_PROXY'),
        ('https', 'HTTPS_PROXY'),
        ('no_proxy', 'NO_PROXY'),
    )
    for key, env_name in env_sources:
        value = os.environ.get(env_name)
        if value:
            merged[key] = value

    # Explicit options win over the environment.
    merged.update(proxy_options)

    no_proxy = merged.get('no_proxy')
    if isinstance(no_proxy, str):
        stripped = (h.strip() for h in no_proxy.split(','))
        merged['no_proxy'] = [h for h in stripped if h]

    conf = namedtuple('ProxyConf', 'scheme username password hostport')

    for proxy_type in ('http', 'https'):
        # Parse the upstream proxy URL into (scheme, username, password, hostport)
        # for ease of access.
        if merged.get(proxy_type) is not None:
            merged[proxy_type] = conf(*_parse_proxy(merged[proxy_type]))

    return merged
Esempio n. 15
0
 def process_response(self, request, response, spider):
     """Count failures per proxy and drop a proxy that failed too often.

     A response with status >= 400 increments the failure counter of the
     proxy found in ``request.meta['proxy']``. Once that counter reaches
     ``self.max_failed_times``, the first entry of ``self.proxies`` whose
     host:port the current proxy ends with is removed.

     Returns:
         The response, unchanged.
     """
     # Proxy used for this request.
     cur_proxy = request.meta['proxy']
     logger.info(cur_proxy)
     if response.status >= 400:
         self.stats[cur_proxy] += 1
         logger.info('%s proxy request has failed' % cur_proxy)
     if self.stats[cur_proxy] >= self.max_failed_times:
         for proxy in self.proxies:
             *_, hostport = _parse_proxy(proxy)
             if cur_proxy.endswith(hostport):
                 # Remove the matched list entry, not cur_proxy: the two
                 # can differ (e.g. the entry carries credentials), in
                 # which case removing cur_proxy raises ValueError.
                 self.proxies.remove(proxy)
                 logger.warning(
                     '%s proxy has beyond max_failed_times,removed' %
                     cur_proxy)
                 break
     return response
Esempio n. 16
0
    def test_connect_uses_remote_dns(self, mock_socks):
        """Check the arguments given to ``create_connection`` for a socks5h proxy."""
        ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
        upstream = ProxyConf(*_parse_proxy(
            'socks5h://socks_user:socks_pass@socks_host:3128'))
        self.config = {
            'http': upstream,
            'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
        }
        mock_socks.PROXY_TYPE_SOCKS5 = socks.PROXY_TYPE_SOCKS5

        connection = ProxyAwareHTTPConnection(self.config, 'example.com')
        connection.connect()

        expected_sockopts = ((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1), )
        # NOTE(review): the positional True is presumably the remote-DNS
        # flag the test name refers to - confirm against create_connection.
        mock_socks.create_connection.assert_called_once_with(
            ('example.com', 80),
            socket._GLOBAL_DEFAULT_TIMEOUT,
            None,
            socks.PROXY_TYPE_SOCKS5,
            'socks_host',
            3128,
            True,
            'socks_user',
            'socks_pass',
            expected_sockopts,
        )
Esempio n. 17
0
def extract_proxy_hostport(proxy):
    """
    Extract the ``host[:port]`` part of a proxy address, discarding any
    scheme and credentials:

    >>> extract_proxy_hostport('example.com')
    'example.com'
    >>> extract_proxy_hostport('http://www.example.com')
    'www.example.com'
    >>> extract_proxy_hostport('127.0.0.1:8000')
    '127.0.0.1:8000'
    >>> extract_proxy_hostport('127.0.0.1')
    '127.0.0.1'
    >>> extract_proxy_hostport('localhost')
    'localhost'
    >>> extract_proxy_hostport('zot:4321')
    'zot:4321'
    >>> extract_proxy_hostport('http://*****:*****@baz:1234')
    'baz:1234'
    """
    _scheme, _user, _password, hostport = _parse_proxy(proxy)
    return hostport
Esempio n. 18
0
    def __init__(self, proxy_url=None):
        """Create a Chrome driver, optionally routed through an authenticated proxy.

        Args:
            proxy_url: Optional proxy URL. When given, it is split into
                scheme, credentials and host:port, and a proxy-auth
                extension built from those parts is added to Chrome.
        """
        options = webdriver.ChromeOptions()
        # NOTE(review): 'images': 2 presumably disables image loading - confirm.
        options.add_experimental_option(
            'prefs',
            {'profile.default_content_setting_values': {'images': 2}},
        )

        if proxy_url:
            scheme, username, secret, hostport = _parse_proxy(proxy_url)
            host_parts = hostport.split(':')
            plugin_file = create_proxyauth_extension(
                proxy_host=host_parts[0],
                proxy_port=host_parts[1],
                proxy_username=username,
                proxy_password=secret,
                scheme=scheme,
                plugin_path="vimm_chrome_proxyauth_plugin.zip")
            options.add_extension(plugin_file)

        self.display = self.get_display()

        self.driver = webdriver.Chrome(chrome_options=options)
        self.driver.set_page_load_timeout(30)
Esempio n. 19
0
 def setUp(self):
     """Store a proxy configuration fixture on ``self.config``.

     'http' and 'https' hold the tuples returned by ``_parse_proxy``;
     'no_proxy' is left as a comma-separated string.
     """
     http_proxy = _parse_proxy('http://*****:*****@host:3128')
     https_proxy = _parse_proxy('https://*****:*****@host:3128')
     self.config = dict(
         http=http_proxy,
         https=https_proxy,
         no_proxy='localhost,127.0.0.1,dev_server:8080',
     )
Esempio n. 20
0
def parse_proxy(proxy):
    """Return the parsed form of *proxy*, memoised in ``parse_proxy_cache``.

    The first call for a given value parses it with
    ``request._parse_proxy`` and stores the result; later calls return
    the stored tuple.
    """
    if proxy not in parse_proxy_cache:
        parse_proxy_cache[proxy] = request._parse_proxy(proxy)
    return parse_proxy_cache[proxy]
Esempio n. 21
0
def _parse(proxy_url):
    proxy_type, user, password, hostport = _parse_proxy(proxy_url)
    return '%s://%s' % (proxy_type, hostport)
Esempio n. 22
0
def reform_url(url):
    """Rebuild a proxy URL as ``scheme://hostport`` without user name or password."""
    # Split the URL into its parts; only scheme and host:port are kept.
    parsed = _parse_proxy(url)
    return '{}://{}'.format(parsed[0], parsed[-1])
Esempio n. 23
0
def reform_url(url):
    """Return *url* reduced to ``scheme://hostport`` (credentials dropped)."""
    scheme, *_credentials, hostport = _parse_proxy(url)
    return f'{scheme}://{hostport}'