class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its size limit."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run_all(self):
        print('Getter is running')
        if not self.is_over_threshold():
            # Walk every registered crawl function and store what it yields.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

    def run_specific(self, callback):
        print('Getter is running')
        if not self.is_over_threshold():
            proxies = self.crawler.get_proxies(callback)
            for proxy in proxies:
                self.redis.add(proxy)
class Getter():
    def __init__(self):
        self.db = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool limit has been exceeded."""
        return self.db.count() > MAX_POOL_COUNT

    def run(self):
        print('start to get proxy')
        if not self.is_over_threshold():
            for item in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[item]
                proxies = self.crawler.get_proxy(callback)
                for proxy in proxies:
                    self.db.add(proxy)
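# Both Getter variants above assume the Crawler exposes __CrawlFunc__ and
# __CrawlFuncCount__. A common way to provide these attributes is a metaclass
# that collects every crawl_* method at class-creation time. This is a sketch
# under that assumption, not necessarily the exact code used by these projects:
class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for key in list(attrs.keys()):
            # Register every method whose name marks it as a proxy source.
            if key.startswith('crawl_'):
                attrs['__CrawlFunc__'].append(key)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)

class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # Look up the crawl method by name and collect whatever it yields.
        return list(getattr(self, callback)())

    def crawl_example(self):
        # Hypothetical source; a real crawl_* method would fetch and parse a page.
        yield '127.0.0.1:8080'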
def schedule_getter(self, cycle=GETTER_CYCLE):
    """Fetch proxies on a fixed schedule."""
    getter = Getter()
    db = RedisClient()
    while True:
        print('Start crawling proxies')
        getter.run_specific('crawl_xdaili')
        db.clear()
        time.sleep(cycle)
def __init__(self, test_url=url):
    self.test_url = test_url
    self.db = RedisClient()
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63',
        'Origin': 'https://www.bilibili.com',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
    }
def start():
    '''Start the crawler without coroutines.'''
    rds = RedisClient('url', '127.0.0.1', None)
    my = MysqlClient()
    ip_pv = GetIpPv(rds, my)
    while ip_pv.get_num():
        domain = ip_pv.get_domain()
        print(ip_pv.get_result(domain))
def start_coro():
    '''Start the crawler with coroutines.'''
    rds = RedisClient('url', '127.0.0.1', None)
    my = MysqlClient()
    ip_pv = GetIpPv(rds, my)
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(ip_pv.download())
    finally:
        event_loop.close()
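# asyncio.get_event_loop() outside a running loop is deprecated since Python
# 3.10. A sketch of the same startup using asyncio.run, which creates and
# closes the loop itself (same RedisClient/MysqlClient/GetIpPv assumptions):
def start_coro_modern():
    '''Start the crawler with coroutines on Python 3.7+.'''
    rds = RedisClient('url', '127.0.0.1', None)
    my = MysqlClient()
    ip_pv = GetIpPv(rds, my)
    asyncio.run(ip_pv.download())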
def run(self):
    accounts = RedisClient('accounts', self.website)
    cookies = RedisClient('cookies', self.website)
    accounts_usernames = accounts.all_keys()
    cookies_usernames = cookies.all_keys()
    for username in accounts_usernames:
        if username not in cookies_usernames:
            password = accounts.get(username)
            print('Generating cookies for account: {} password: {}'.format(username, password))
            new = BilibiliCookies(username, password)
            new_cookies = new.cookie()
            if new_cookies:
                # Only store cookies when generation actually succeeded.
                cookies.set(username, new_cookies)
                print('Cookies generated successfully')
            else:
                print('Failed to generate cookies')
def init_module():
    global detector
    global booter
    global redis_client
    redis_client = RedisClient(host=REDIS_HOST, port=REDIS_PORT,
                               password=REDIS_PASSWORD, s_key=REDIS_KEY,
                               num=GET_NUM)
    xc = XiCiProxyHelper(quantity=QUANTITY, threshold=THRESHOLD)
    detector = Detector(redis_client, test_url=TEST_URL)
    booter = Booter(redis_client, xc, capacity=CAPACITY)
class ValidTester(object):
    def __init__(self, website='default'):
        self.website = website
        self.cookies_db = RedisClient('cookies', self.website)
        self.accounts_db = RedisClient('accounts', self.website)

    def test(self, username, cookies):
        raise NotImplementedError

    def run(self):
        cookies_groups = self.cookies_db.all()
        for username, cookies in cookies_groups.items():
            self.test(username, cookies)
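# ValidTester is a template: subclasses implement test() for one site. A
# minimal hypothetical subclass (the test URL, the JSON cookie format, and the
# cookies_db.delete() method are assumptions, not from the original source):
import json
import requests

class SimpleValidTester(ValidTester):
    def test(self, username, cookies):
        try:
            cookies = json.loads(cookies)
        except (TypeError, ValueError):
            # Malformed cookies cannot be used; drop them so they get rebuilt.
            self.cookies_db.delete(username)
            return
        response = requests.get('https://www.example.com/',
                                cookies=cookies, timeout=10)
        if response.status_code != 200:
            self.cookies_db.delete(username)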
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 测试单个代理 :param proxy: 单个代理 :return: None """ conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print(' 正在测试 ', proxy) async with session.get(TEST_URL, proxy=real_proxy, headers=HEADERS, timeout=15) as response: code = response.json().get('code') print(response.json()) if response.status in VALID_STATUS_CODES and code in VALID_STATUS_CODES: self.redis.max(proxy) print(' 代理可用 ', proxy) else: self.redis.decrease(proxy) print(' 请求响应码不合法, 不可用 ', proxy) except (ClientError, ClientConnectorError, TimeoutError, AttributeError): self.redis.decrease(proxy) print(' 代理请求失败 ', proxy) def run(self): """ 测试主函数 :return: None """ print(' 测试器开始运行 ') try: proxies = self.redis.all() loop = asyncio.get_event_loop() # 批量测试 for i in range(0, len(proxies), BATCH_TEST_SIZE): test_proxies = proxies[i:i + BATCH_TEST_SIZE] tasks = [ self.test_single_proxy(proxy) for proxy in test_proxies ] loop.run_until_complete(asyncio.wait(tasks)) time.sleep(5) except Exception as e: print(' 测试器发生错误 ', e.args)
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 测试单个代理 :param proxy:代理 :return: None """ conn = aiohttp.TCPConnector(ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print('正在测试', proxy) async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as resonse: if resonse.status in VALID_STATUS_CODES: self.redis.max(proxy) print('代理可用', proxy) else: self.redis.decrease(proxy) print('请求响应码不合法', proxy) except (ClientError, aiohttp.ClientProxyConnectionError, asyncio.TimeoutError, AttributeError): self.redis.decrease(proxy) print('代理请求失败', proxy) def run(self): """ 测试主函数 :return: None """ print('检测器开始运行') try: proxies = self.redis.all() asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) loop = asyncio.get_event_loop() #批量测试 for i in range(0, len(proxies), BATCH_TEST_SIZE): test_proxies = proxies[i:i + BATCH_TEST_SIZE] tasks = [ asyncio.ensure_future(self.test_single_proxy(proxy)) for proxy in test_proxies ] loop.run_until_complete(asyncio.wait(tasks)) time.sleep(5) except Exception as e: print("检测器发生错误", e)
class Tester():
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', real_proxy)
                async with session.get(TEST_API, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODE:
                        self.redis.enable(proxy)
                        print('Proxy', proxy, 'is valid')
                    else:
                        self.redis.decrease(proxy)
                        print('Proxy', proxy, 'request failed')
            except Exception:
                self.redis.decrease(proxy)
                print('Proxy', proxy, 'is unusable')

    def run(self):
        print('Start testing')
        try:
            for i in range(0, self.redis.count(), BATCH_TEST_COUNT):
                start = i
                end = min(i + BATCH_TEST_COUNT, self.redis.count())
                proxies = self.redis.batch(start, end)
                print('Testing proxies', start, 'to', end)
                tasks = [self.test_single_proxy(proxy) for proxy in proxies]
                loop = asyncio.get_event_loop()
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Test error', e.args)
class Tester():
    def __init__(self, test_url=url):
        self.test_url = test_url
        self.db = RedisClient()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63',
            'Origin': 'https://www.bilibili.com',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
        }

    def test_single_proxy(self, proxy):
        proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
        try:
            response = requests.get(self.test_url, headers=self.headers,
                                    proxies=proxies, timeout=5)
            result = response.json()
            code = result.get('data')
            if code is not None:
                self.db.max(proxy)
                print('Proxy is valid', proxy)
            else:
                print('Invalid response, proxy unusable', proxy)
                self.db.decrease(proxy)
        except Exception:
            print('Proxy request failed', proxy)
            self.db.decrease(proxy)

    def run(self):
        print('Tester is running')
        proxies = self.db.all()
        for proxy in proxies:
            print('Testing', proxy)
            self.test_single_proxy(proxy)
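# A hypothetical entry point tying one of the Tester variants above to a
# fixed schedule (the 20-second cycle is an assumption; the original projects
# may wire their testers into a scheduler process instead):
import time

def schedule_tester(cycle=20):
    tester = Tester()
    while True:
        tester.run()
        time.sleep(cycle)

if __name__ == '__main__':
    schedule_tester()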
while True:
    path = input(
        'Please input config file path(if you use default file type \'d\'.): ')
    if path == 'd':
        path = 'proxy.conf'
    sure = input(
        'Are you sure the config file in \'{}\'. [y/n]: '.format(path))
    if sure == 'y':
        break

cfg = ConfigParser()
cfg.read(path)
REDIS_HOST = try_to_get_options(cfg.get, 'redis', 'host')
REDIS_PORT = try_to_get_options(cfg.getint, 'redis', 'port')
REDIS_PASSWORD = try_to_get_options(cfg.get, 'redis', 'password')
REDIS_KEY = try_to_get_options(cfg.get, 'redis', 'key')
redis_client = RedisClient(host=REDIS_HOST, port=REDIS_PORT,
                           password=REDIS_PASSWORD, s_key=REDIS_KEY)
count = redis_client.count()
if count == 0:
    print('Already clean!')
else:
    redis_client.show()
    sure = input(
        'Are you sure remove that data? amount {} items! [y/n]: '.format(count))
    if sure == 'y':
        redis_client.remove_by_range(0, 100)
    else:
        print('Good luck! Bye Bye')
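# For reference, a proxy.conf matching the options read above might look like
# the following (values are placeholders, not from the original source):
#
#   [redis]
#   host = 127.0.0.1
#   port = 6379
#   password =
#   key = proxies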
from crawl import XiCiProxyHelper
from storage import Booter, RedisClient
from detector import Detector

if __name__ == "__main__":
    xch = XiCiProxyHelper(quantity=40, threshold=1.000)
    rc = RedisClient()
    b = Booter(rc, xch)
    b.run()
    de = Detector(rc)
    de.run()
    print('Total: {}'.format(rc.count()))
    rc.show()
    rc.remove_by_range(0, 100)
def get_domain(self):
    '''Pop a domain from the queue.'''
    return self.redis_db.pop()

def get_rest_domain_num(self):
    '''Number of domains left.'''
    return self.redis_db.get_num()

def save(self, text):
    '''Append a result line to the output file.'''
    with open('title.txt', 'a+') as f:
        f.write(text)

def download(self):
    while self.get_rest_domain_num():
        url = self.get_domain()
        logging.info('req ' + url)
        try:
            response = self.get_page(url)
            response.encoding = response.apparent_encoding
            logging.info(response.status_code)
            doc = self.parse(response)
            self.save(url + ';' + doc + '\n')
        except Exception as e:
            # Record the URL even when fetching or parsing fails.
            self.save(url + ';\n')

if __name__ == '__main__':
    tc = ThemeCrawler(RedisClient('url', '127.0.0.1', None), MysqlClient())
    tc.download()
def __init__(self, website='default'):
    self.website = website
    self.cookies_db = RedisClient('cookies', self.website)
    self.accounts_db = RedisClient('accounts', self.website)
def conn():
    if not hasattr(g, 'conn'):
        g.conn = RedisClient()
    return g.conn
def get_coon():
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis
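# Typical use of the g-based helpers above inside a Flask view (a sketch; the
# Flask app object and RedisClient.random() are assumptions):
from flask import Flask, g

app = Flask(__name__)

@app.route('/random')
def random_proxy():
    # Reuse one RedisClient per application context via g.
    conn = get_coon()
    return conn.random()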
from storage import RedisClient

rc = RedisClient('url', '127.0.0.1', None)
with open('ul.txt', 'r') as f:
    n = [line.rstrip() for line in f]
    # print(n)
    for x in n:
        rc.add(x)
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self):
    self.redis = RedisClient()