def get_conn():
    """Return the MongoDB client cached on Flask's application context.

    The client is created lazily on first use and stored on ``g`` so every
    caller within the same context shares a single connection.
    """
    if not hasattr(g, 'mongodb_conn'):
        g.mongodb_conn = MongodbClient()
    return g.mongodb_conn
def valid_proxy(self, cycle=VALID_CHECK_CYCLE):
    """Re-test a large slice of the stored proxies forever.

    Each round pulls 5/6 of the pool (the original comment says 2/3, but
    the code computes ``5 * n / 6``) and hands it to :class:`ProxyTester`,
    then sleeps *cycle* seconds. When the pool is empty it just waits for
    the adder to refill it.
    """
    conn = MongodbClient()
    tester = ProxyTester()
    while True:
        print('Refreshing ip...')
        # Integer truncation of 5/6 of the current pool size.
        count = int(5 * conn.get_nums / 6)
        if count == 0:
            print('Waiting for adding...')
            time.sleep(cycle)
            continue
        tester.set_raw_proxies(conn.get(count))
        tester.test()
        time.sleep(cycle)
class ProxyTester(object):
    """Asynchronously checks raw proxies and keeps only the working ones.

    Call :meth:`set_raw_proxies` with a batch, then :meth:`test` to run the
    whole batch through the event loop. Valid proxies are put back into the
    pool via the DB connection; invalid ones are deleted.
    """

    def __init__(self):
        self._raw_proxies = None

    def set_raw_proxies(self, proxies):
        """Store the batch to test and (re)open the DB connection.

        NOTE: ``self._conn`` only exists after this is called — `test`
        must not run before `set_raw_proxies`.
        """
        self._raw_proxies = proxies
        self._conn = MongodbClient()

    async def test_single_proxy(self, proxy):
        """Test one proxy; if it works, put it into the usable pool.

        Tries the HTTPS test URL first, then falls back to the HTTP one;
        the proxy is deleted only when both attempts fail.
        """
        scheme = 'http://'
        test_url = HTTPS_TEST_URL
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        real_proxy = scheme + proxy

        async def test_proxy(https=True):
            # `name` only affects log output.
            name = 'https' if https else 'http'
            async with session.get(test_url, proxy=real_proxy,
                                   timeout=10) as response:
                # 429 (rate limited) still proves the proxy reached the
                # target, so it counts as valid.
                if response.status == 200 or response.status == 429:
                    self._conn.put(proxy, https)
                    print('Valid {} proxy'.format(name), proxy)
                else:
                    print('Invalid {} status'.format(name),
                          response.status, proxy)
                    self._conn.delete(proxy)

        try:
            async with aiohttp.ClientSession() as session:
                try:
                    await test_proxy()
                # Was a bare `except:`, which on Python 3.8+ would also
                # swallow asyncio.CancelledError and block cancellation.
                except Exception:
                    try:
                        test_url = HTTP_TEST_URL
                        await test_proxy(False)
                    except Exception as e:
                        self._conn.delete(proxy)
                        print('Invalid proxy', proxy, e)
        except Exception as e:
            print(e)

    def test(self):
        """Run every proxy set via `set_raw_proxies` through the event loop."""
        print('Tester is working...')
        try:
            loop = asyncio.get_event_loop()
            # Wrap each coroutine in a Task explicitly: passing bare
            # coroutines to asyncio.wait() is deprecated since 3.8 and
            # removed in 3.11.
            tasks = [
                loop.create_task(self.test_single_proxy(proxy))
                for proxy in self._raw_proxies
            ]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError as e:
            # asyncio.wait raises ValueError on an empty task set.
            time_format = '%Y-%m-%d %H:%M:%S'
            format_t = time.strftime(time_format)
            print(format_t, 'Async Error:', e)
def check_pool(self, lower_threshold=POOL_LOWER_THRESHOLD,
               upper_threshold=POOL_UPPER_THRESHOLD,
               cycle=POOL_LEN_CHECK_CYCLE):
    """Top up the pool whenever it falls to the lower threshold.

    Loops forever, re-checking every *cycle* seconds; the adder stops on
    its own once *upper_threshold* is reached.
    """
    conn = MongodbClient()
    adder = PoolAdder(upper_threshold)
    while True:
        # NOTE(review): `max(conn.get_count)` implies get_count is an
        # iterable of counts — confirm against MongodbClient; the sibling
        # valid_proxy() uses `conn.get_nums` as a plain number.
        pool_is_low = max(conn.get_count) <= lower_threshold
        if pool_is_low:
            adder.add_to_pool()
        time.sleep(cycle)
def __init__(self, threshold):
    """Create a pool adder capped at *threshold* proxies.

    Wires up the collaborators the adder drives: the DB connection, the
    proxy tester, and the crawler that fetches fresh raw proxies.
    """
    self._threshold = threshold
    self._conn = MongodbClient()
    self._tester = ProxyTester()
    self._crawler = ProxyGetter()
#!/usr/bin/python
# -*- coding:utf-8 -*-
# __author__ = 'yukun'
import json
from datetime import datetime

from .utils import parse_url
from db import MongodbClient

# Module-level connection shared by the helpers in this module.
conn = MongodbClient()


def get_img_url():
    """Fetch today's Bing homepage image URL and its copyright line.

    Returns:
        dict: ``{'url': ..., 'copyright': ...}`` on success, or ``None``
        when the response is empty, not JSON, or missing the image data.
    """
    params = {
        'format': 'js',  # "js" asks the endpoint for JSON instead of XML
        'idx': 0,        # 0 = today's image
        'n': 1,          # only one image
        'pid': 'hp',
    }
    base_url = 'https://www.bing.com/HPImageArchive.aspx'
    resp = parse_url(base_url, params).text
    try:
        result = json.loads(resp)
    except ValueError:
        # Non-JSON body (error page, truncated response, ...).
        return None
    # Guard against a missing or empty 'images' list instead of raising
    # KeyError/IndexError on an unexpected payload.
    images = result.get('images') if isinstance(result, dict) else None
    if not images:
        return None
    first = images[0]
    img_path = first.get('url')
    if not img_path:
        return None
    bing_url = 'https://www.bing.com'
    return {'url': bing_url + img_path, 'copyright': first.get('copyright')}
def set_raw_proxies(self, proxies):
    """Record the batch of proxies to be tested and open the DB connection."""
    self._raw_proxies = proxies
    self._conn = MongodbClient()