コード例 #1
0
class Getter():
    """Feeds the Redis-backed proxy pool by running every crawl function."""

    def __init__(self):
        # Shared Redis store and the crawler that produces candidate proxies.
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool already holds POOL_UPPER_THRESHOLD proxies."""
        over = self.redis.count() >= POOL_UPPER_THRESHOLD
        if over:
            print('已达到了代理池限制,暂停爬取')
        return over

    def run(self):
        print('获取器开始执行')
        if self.is_over_threshold():
            return
        # Invoke every registered crawl function and push its proxies to Redis.
        for index in range(self.crawler.__CrawlFuncCount__):
            fetched = self.crawler.get_proxies(self.crawler.__CrawlFunc__[index])
            sys.stdout.flush()
            for proxy in fetched:
                self.redis.add(proxy)
コード例 #2
0
def crawl_xici_proxy(pagenumber):
    """Scrape ip:port pairs from xicidaili listing pages into Redis.

    :param pagenumber: number of listing pages to visit, starting at 1.
    """
    page_template = "https://www.xicidaili.com/nn/{}"
    client = RedisClient()  # proxies are persisted in the Redis pool
    for page in range(1, pagenumber + 1):
        page_source = get_html(page_template.format(page), 'UTF-8')
        tree = html.fromstring(page_source)
        # XPath pulls the ip (2nd cell) and port (3rd cell) columns.
        ips = tree.xpath("//tr/td[2]/text()")
        ports = tree.xpath("//tr/td[3]/text()")
        # Drop the first and last rows (table header/footer), store "ip:port".
        for ip, port in zip(ips[1:-1], ports[1:-1]):
            client.add(":".join((ip, port)))
コード例 #3
0
class ValidityTester(object):
    """Concurrently tests raw proxies and keeps the working ones in Redis."""

    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        """Load a batch of proxies to test and open a fresh Redis connection."""
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        """Test one proxy; a proxy that answers 200 is pushed back to Redis."""
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    # Redis returns bytes; normalize to str before use.
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    proxy_url = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(
                            self.test_api,
                            proxy=proxy_url,
                            timeout=get_proxy_timeout) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy', proxy)
                except (ProxyConnectionError, TimeoutError, ValueError):
                    # Expected failures for a dead proxy — just report it.
                    print('Invalid proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError,
                ClientConnectorError) as err:
            # Transport-level aiohttp errors: log and move on.
            print(err)

    def test(self):
        """Run the async check over every proxy loaded via set_raw_proxies."""
        print('ValidityTester is working')
        try:
            jobs = [
                self.test_single_proxy(p) for p in self._raw_proxies
            ]
            asyncio.get_event_loop().run_until_complete(asyncio.wait(jobs))
        except ValueError:
            # asyncio.wait raises ValueError on an empty task list.
            print('Async Error')
コード例 #4
0
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """Forever re-test half of the proxies currently stored in Redis.

     :param cycle: seconds to sleep between test rounds.
     """
     conn = RedisClient()
     tester = ValidityTester()
     while True:
         print('Refreshing ip')
         # Each round checks half of the queue.
         count = int(0.5 * conn.queue_len)
         if not count:
             print('Waiting for adding')
             time.sleep(cycle)
             continue
         tester.set_raw_proxies(conn.get(count))
         tester.test()
         time.sleep(cycle)
コード例 #5
0
 def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
                upper_threshold=POOL_UPPER_THRESHOLD,
                cycle=POOL_LEN_CHECK_CYCLE):
     """Top up the proxy pool whenever it drops below lower_threshold.

     :param lower_threshold: pool size that triggers refilling.
     :param upper_threshold: pool size the adder fills up to.
     :param cycle: seconds between size checks.
     """
     conn = RedisClient()
     adder = PoolAdder(upper_threshold)
     while True:
         pool_is_low = conn.queue_len < lower_threshold
         if pool_is_low:
             adder.add_to_queue()
         time.sleep(cycle)
コード例 #6
0
ファイル: tester.py プロジェクト: renwenduan/TBspider
class Tester(object):
    """Scores pooled proxies by probing TEST_URL through each of them."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Probe one proxy; raise its score on a valid status, lower it otherwise.
        :param proxy: proxy address, str or bytes.
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Redis returns bytes; normalize to str before building the URL.
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                proxy_url = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=proxy_url, timeout=5, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                # Any request failure counts against the proxy's score.
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Batch-test every proxy currently in the pool.
        :return:
        """
        print('测试器开始运行')
        try:
            total = self.redis.count()
            print('当前剩余', total, '个代理')
            for offset in range(0, total, BATCH_TEST_SIZE):
                upper = min(offset + BATCH_TEST_SIZE, total)
                print('正在测试第', offset + 1, '-', upper, '个代理')
                batch = self.redis.batch(offset, upper)
                loop = asyncio.get_event_loop()
                loop.run_until_complete(
                    asyncio.wait([self.test_single_proxy(p) for p in batch]))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
コード例 #7
0
ファイル: tester.py プロジェクト: renwenduan/TBspider
 def __init__(self):
     # Tester owns its own connection to the Redis-backed proxy store.
     self.redis = RedisClient()
コード例 #8
0
ファイル: importer.py プロジェクト: renwenduan/TBspider
from proxy_pool.db import RedisClient

conn = RedisClient()  # module-wide Redis connection shared by set() and scan()


def set(proxy):
    """Insert one proxy into Redis and report whether it was stored.

    NOTE(review): the name shadows the builtin ``set`` within this module.
    """
    stored = conn.add(proxy)
    print(proxy)
    print('录入成功' if stored else '录入失败')


def scan():
    """Read proxies from stdin, one per line, until the user types ``exit``."""
    print('请输入代理, 输入exit退出读入')
    while True:
        line = input()
        if line == 'exit':
            break
        set(line)


if __name__ == '__main__':
    # Interactive entry point: read proxies from stdin into Redis.
    scan()
コード例 #9
0
 def __init__(self):
     # Redis-backed proxy store and the crawler that fills it.
     self.redis = RedisClient()
     self.crawler = Crawler()
コード例 #10
0
 def __init__(self, threshold):
     # threshold: pool size cap this adder fills up to — presumably the
     # upper bound checked by the caller; confirm against PoolAdder usage.
     self._threshold = threshold
     self._conn = RedisClient()
     self._tester = ValidityTester()
     self._crawler = FreeProxyGetter()
コード例 #11
0
 def set_raw_proxies(self, proxies):
     """Store the batch of proxies to test and open a fresh Redis connection."""
     self._raw_proxies = proxies
     self._conn = RedisClient()
コード例 #12
0
ファイル: web_proxy.py プロジェクト: JamesW99/web_crawler
def get_conn():
    """Return the RedisClient cached on Flask's ``g``, creating it on first use."""
    if hasattr(g, 'redis'):
        return g.redis
    g.redis = RedisClient()
    return g.redis