Ejemplo n.º 1
0
class ValidTester(object):
    """Check proxies by issuing a GET request to ``test_url`` through each one.

    Proxies that answer with HTTP 200 are collected in ``usable_proxies``;
    every other proxy is removed through ``self.db.delete``.
    """

    test_url = TEST_URL

    def __init__(self):
        """Create a tester with no batch loaded and its own MongoOperator."""
        self.__raw_proxies = None
        self.__usable_proxies = None
        self.db = MongoOperator()

    def set_raw_proxies(self, raw_proxies):
        """Load the batch to be checked and reset the list of usable proxies."""
        self.__raw_proxies = raw_proxies
        self.__usable_proxies = []

    async def test_one_proxy(self, proxy):
        """Request ``test_url`` through ``proxy`` and record or delete it.

        ``proxy`` is indexed with ``'ip'`` and ``'port'``, so a mapping is
        expected. Any exception raised while building the proxy URL or
        performing the request marks the proxy as invalid and deletes it.
        """
        async with aiohttp.ClientSession() as session:
            try:
                # NOTE(review): a decoded str cannot be indexed with 'ip'
                # below, so bytes input always ends up in the except branch.
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + ("%s:%s" %
                                          (proxy['ip'], proxy['port']))
                print('Testing ', real_proxy)

                async with session.get(self.test_url,
                                       proxy=real_proxy,
                                       timeout=15) as response:
                    if response.status == 200:
                        # The request succeeded through the proxy.
                        print('Valid proxy', proxy)
                        self.__usable_proxies.append(proxy)
                    else:
                        print('Invalid proxy', proxy)
                        self.db.delete(proxy, filterone=True)

            # Exception already covers TimeoutError and ValueError.
            except Exception:
                print('Exception ,Invalid proxy %s' % (proxy))
                self.db.delete(proxy, filterone=True)

    async def _test_all(self, proxies):
        """Run ``test_one_proxy`` concurrently for every item of ``proxies``."""
        # return_exceptions=True keeps one failing check (for instance a
        # database error raised while deleting) from aborting the batch.
        await asyncio.gather(
            *(self.test_one_proxy(proxy) for proxy in proxies),
            return_exceptions=True)

    def test(self):
        """Check every proxy of the current batch and block until done.

        An empty or unset batch is skipped. Coroutines are scheduled with
        ``asyncio.gather`` because ``asyncio.wait`` no longer accepts bare
        coroutine objects on Python 3.11+.
        """
        print('ValidTester is working')
        proxies = list(self.__raw_proxies or [])
        if not proxies:
            return
        try:
            loop = asyncio.get_event_loop()
            loop.run_until_complete(self._test_all(proxies))
        except ValueError:
            print('Async Error')

    @property
    def usable_proxies(self):
        """Proxies that passed the most recent check (``None`` before a batch is set)."""
        return self.__usable_proxies
Ejemplo n.º 2
0
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """Re-check half of the stored proxies forever, sleeping ``cycle`` between rounds.

     Each round reads ``count`` from a MongoOperator, selects
     ``int(0.5 * count)`` entries and runs them through a ValidTester.
     When that number is zero the round only prints a notice.
     """
     store = MongoOperator()
     checker = ValidTester()
     while True:
         print('正在校验代理IP')
         sample_size = int(0.5 * store.count)
         if sample_size:
             # Hand the selected entries to the tester and run the check.
             checker.set_raw_proxies(store.select(sample_size))
             checker.test()
         else:
             # Nothing to check in this round.
             print('当前可用代理IP为空, 等待添加补充')
         time.sleep(cycle)
Ejemplo n.º 3
0
 def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
                upper_threshold=POOL_UPPER_THRESHOLD,
                cycle=POOL_LEN_CHECK_CYCLE):
     """Poll the stored proxy count forever and refill when it runs low.

     Every ``cycle`` seconds the MongoOperator ``count`` is compared with
     ``lower_threshold``; when it is smaller, ``PoolAdder.add_to_queue`` is
     called on an adder constructed with ``upper_threshold``.
     """
     store = MongoOperator()
     pool_adder = PoolAdder(upper_threshold)
     while True:
         running_low = store.count < lower_threshold
         if running_low:
             pool_adder.add_to_queue()
         time.sleep(cycle)
Ejemplo n.º 4
0
 def gets(self, max_page):
     """Fetch ``max_page`` listing pages and store every parsed item.

     Page URLs are built by formatting ``XICINN_URL`` with the page numbers
     ``self._counter`` .. ``self._counter + max_page - 1``. Each parsed item
     is printed and inserted into ``MONGO_TABLE``.

     The database handle is closed even when fetching, parsing or inserting
     raises, so a failing page no longer leaks the connection.
     """
     self.start_url = XICINN_URL
     urls = [self.start_url.format(i)
             for i in range(self._counter, self._counter + max_page)]
     # presumably advances self._counter past the pages queued above
     self._increment(max_page)
     db = MongoOperator()
     try:
         for url in urls:
             html = self.get_one_page(url)
             for item in self.parse_one_page(html):
                 print(item)
                 db.insert(item, MONGO_TABLE)
     finally:
         db.close()
Ejemplo n.º 5
0
 def __init__(self):
     """Start with no raw or usable proxies and open a MongoOperator."""
     # Both lists stay None until a batch is supplied.
     self.__raw_proxies = self.__usable_proxies = None
     self.db = MongoOperator()
Ejemplo n.º 6
0
def get_count():
    """Return the MongoOperator ``count`` value as a string.

    The handle is closed even if reading ``count`` raises.
    """
    db = MongoOperator()
    try:
        return str(db.count)
    finally:
        db.close()
Ejemplo n.º 7
0
def get_proxy():
    """Return the result of ``select(count=1)`` serialized as JSON.

    The handle is closed even if the query raises; serialization happens
    after the query has completed.
    """
    db = MongoOperator()
    try:
        item = db.select(count=1)
    finally:
        db.close()
    return json.dumps(item)
Ejemplo n.º 8
0
 def __init__(self, threshold):
     """Store ``threshold`` and create the MongoOperator and ValidTester helpers."""
     self._threshold = threshold
     self.db = MongoOperator()
     self.valid_tester = ValidTester()