class Getter(): def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """ 判断是否达到了代理池限制 """ if self.redis.count() >= POOL_UPPER_THRESHOLD: print('已达到了代理池限制,暂停爬取') return True else: return False def run(self): print('获取器开始执行') if not self.is_over_threshold(): for callback_index in range(self.crawler.__CrawlFuncCount__): callback = self.crawler.__CrawlFunc__[callback_index] # 获取代理 proxies = self.crawler.get_proxies(callback) sys.stdout.flush() for proxy in proxies: self.redis.add(proxy)
def crawl_xici_proxy(pagenumber): url = "https://www.xicidaili.com/nn/{}" client = RedisClient() #设置数据库为redis for i in range(1, pagenumber + 1): #循环获取要访问的网址 new_url = url.format(i) htmlcont = get_html(new_url, 'UTF-8') selector = html.fromstring(htmlcont) #用selector获取网页源码 iplist = selector.xpath("//tr/td[2]/text()") #正则表达式获取其中的ip portlist = selector.xpath("//tr/td[3]/text()") for item in zip(iplist[1:-1], portlist[1:-1]): #合并ip和port client.add(":".join(item))
class ValidityTester(object): test_api = TEST_API def __init__(self): self._raw_proxies = None self._usable_proxies = [] def set_raw_proxies(self, proxies): self._raw_proxies = proxies self._conn = RedisClient() async def test_single_proxy(self, proxy): """ text one proxy, if valid, put them to usable_proxies. """ try: async with aiohttp.ClientSession() as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print('Testing', proxy) async with session.get( self.test_api, proxy=real_proxy, timeout=get_proxy_timeout) as response: if response.status == 200: self._conn.put(proxy) print('Valid proxy', proxy) except (ProxyConnectionError, TimeoutError, ValueError): print('Invalid proxy', proxy) except (ServerDisconnectedError, ClientResponseError, ClientConnectorError) as s: print(s) pass def test(self): """ aio test all proxies. """ print('ValidityTester is working') try: loop = asyncio.get_event_loop() tasks = [ self.test_single_proxy(proxy) for proxy in self._raw_proxies ] loop.run_until_complete(asyncio.wait(tasks)) except ValueError: print('Async Error')
def valid_proxy(cycle=VALID_CHECK_CYCLE): """ Get half of proxies which in redis """ conn = RedisClient() tester = ValidityTester() while True: print('Refreshing ip') count = int(0.5 * conn.queue_len) if count == 0: print('Waiting for adding') time.sleep(cycle) continue raw_proxies = conn.get(count) tester.set_raw_proxies(raw_proxies) tester.test() time.sleep(cycle)
def check_pool(lower_threshold=POOL_LOWER_THRESHOLD, upper_threshold=POOL_UPPER_THRESHOLD, cycle=POOL_LEN_CHECK_CYCLE): """ If the number of proxies less than lower_threshold, add proxy """ conn = RedisClient() adder = PoolAdder(upper_threshold) while True: if conn.queue_len < lower_threshold: adder.add_to_queue() time.sleep(cycle)
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 测试单个代理 :param proxy: :return: """ conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print('正在测试', proxy) async with session.get(TEST_URL, proxy=real_proxy, timeout=5, allow_redirects=False) as response: if response.status in VALID_STATUS_CODES: self.redis.max(proxy) print('代理可用', proxy) else: self.redis.decrease(proxy) print('请求响应码不合法 ', response.status, 'IP', proxy) except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError): self.redis.decrease(proxy) print('代理请求失败', proxy) def run(self): """ 测试主函数 :return: """ print('测试器开始运行') try: count = self.redis.count() print('当前剩余', count, '个代理') for i in range(0, count, BATCH_TEST_SIZE): start = i stop = min(i + BATCH_TEST_SIZE, count) print('正在测试第', start + 1, '-', stop, '个代理') test_proxies = self.redis.batch(start, stop) loop = asyncio.get_event_loop() tasks = [self.test_single_proxy(proxy) for proxy in test_proxies] loop.run_until_complete(asyncio.wait(tasks)) sys.stdout.flush() time.sleep(5) except Exception as e: print('测试器发生错误', e.args)
def __init__(self): self.redis = RedisClient()
from proxy_pool.db import RedisClient conn = RedisClient() def set(proxy): result = conn.add(proxy) print(proxy) print('录入成功' if result else '录入失败') def scan(): print('请输入代理, 输入exit退出读入') while True: proxy = input() if proxy == 'exit': break set(proxy) if __name__ == '__main__': scan()
def __init__(self): self.redis = RedisClient() self.crawler = Crawler()
def __init__(self, threshold): self._threshold = threshold self._conn = RedisClient() self._tester = ValidityTester() self._crawler = FreeProxyGetter()
def set_raw_proxies(self, proxies): self._raw_proxies = proxies self._conn = RedisClient()
def get_conn(): #具柄:存在直接返回,不存在创建一个 if not hasattr(g, 'redis'): g.redis = RedisClient() return g.redis