def SaveProxy(self, proxies):
    """Persist a batch of proxies in one pass.

    Write many rows per call through a single client instead of one
    row (and one freshly constructed client) at a time, which is the
    efficiency the original (Chinese) docstring promised but the old
    body undermined by instantiating ``MysqlClient()`` inside the loop.

    :param proxies: iterable of proxy records to store
    :return: the full proxy list currently stored in MySQL
    """
    client = MysqlClient()  # one client for the whole batch, not one per row
    for proxy in proxies:
        try:
            client.add(proxy)
        except Exception:
            # Best-effort insert: one bad proxy must not abort the batch.
            continue
    print("------------------")
    self.proxies = []  # batch flushed; reset the in-memory buffer
    return client.all()
def start():
    """Run the crawler synchronously (non-coroutine mode)."""
    crawler = GetIpPv(RedisClient('url', '127.0.0.1', None), MysqlClient())
    # Drain the queued domains one at a time until none remain.
    while crawler.get_num():
        print(crawler.get_result(crawler.get_domain()))
def start_coro():
    """Run the crawler with coroutines on an asyncio event loop.

    NOTE(review): the original docstring said "start the crawler
    without coroutines" — a copy-paste from ``start()``; this IS the
    coroutine entry point.
    """
    rds = RedisClient('url', '127.0.0.1', None)
    my = MysqlClient()
    ip_pv = GetIpPv(rds, my)
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(ip_pv.download())
    finally:
        # Always close the loop, even if download() raises.
        event_loop.close()
class Tester():
    """Validates stored proxies by issuing test requests through them."""

    def __init__(self):
        # HTTP status codes that count as a successful proxied request.
        self.VALID_STATUS_CODES = [200]
        self.TEST_URL = "https://www.baidu.com"
        # Number of proxies tested per event-loop batch.
        self.BATCH_TEST_SIZE = 100
        self.mysql = MysqlClient()

    async def single_proxy_handler(self, proxy):
        """Test one proxy and adjust its score in MySQL.

        A usable proxy has its score raised to the maximum; any failure
        (bad status, timeout, connection error) decreases it.
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Redis/MySQL may hand back bytes; normalise to str first.
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = 'http://' + proxy
                print("正在测试", proxy)
                async with session.get(self.TEST_URL, proxy=real_proxy, timeout=15) as response:
                    if response.status in self.VALID_STATUS_CODES:
                        self.mysql.max(proxy)
                        print("代理可用", proxy)
                    else:
                        self.mysql.decrease(proxy)
                        print("请求响应码不合法", proxy)
            except Exception:
                # Network errors / timeouts: penalise the proxy, keep going.
                self.mysql.decrease(proxy)
                print("代理请求失败", proxy)

    def run(self):
        """Test every stored proxy in batches of BATCH_TEST_SIZE."""
        try:
            proxy_list = self.mysql.all()
            loop = asyncio.get_event_loop()  # build the event loop
            # BUGFIX: the step was hard-coded to 100 while the slice used
            # BATCH_TEST_SIZE; the two could silently diverge. Both now
            # come from the same attribute.
            for i in range(0, len(proxy_list), self.BATCH_TEST_SIZE):
                test_proxies = proxy_list[i:i + self.BATCH_TEST_SIZE]
                # Each DB row appears to be a tuple whose column 0 is the
                # proxy string — confirm against MysqlClient.all().
                tasks = [self.single_proxy_handler(proxy[0]) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))  # run the batch
                time.sleep(5)  # throttle between batches
        except Exception:
            print("测试错误")
def get_domain(self): '''获取域名''' return self.redis_db.pop() def get_rest_domain_num(self): '''剩余域名数量''' return self.redis_db.get_num() def save(self, text): '''保存结果''' with open('title.txt', 'a+') as f: f.write(text) def download(self): while self.get_rest_domain_num(): url = self.get_domain() logging.info('req ' + url) try: response = self.get_page(url) response.encoding = response.apparent_encoding logging.info(response.status_code) doc = self.parse(response) self.save(url + ';' + doc + '\n') except Exception as e: self.save(url + ';\n') if __name__ == '__main__': tc = ThemeCrawler(RedisClient('url', '127.0.0.1', None), MysqlClient()) tc.download()
def get_proxy(self):
    """Return a random proxy from MySQL.

    BUGFIX: ``random()`` was invoked on the MysqlClient *class*, not an
    instance; every sibling call site in this file instantiates the
    client first (e.g. ``MysqlClient().count()``), so the unbound call
    would raise at runtime. Assumes ``random()`` is an instance method
    — confirm against MysqlClient's definition.
    """
    return MysqlClient().random()
def Count(self):
    """Return the number of proxies currently stored in MySQL."""
    db = MysqlClient()
    return db.count()
def get_conn():
    """Return the request-scoped MysqlClient, creating it on first use.

    BUGFIX: the guard tested ``g.proxiespool`` while the client was
    cached under ``g.mysql``, so the cache never hit and a fresh
    MysqlClient was constructed on every call. The guard now checks
    the attribute that is actually stored.
    """
    if not hasattr(g, 'mysql'):
        g.mysql = MysqlClient()
    return g.mysql
def __init__(self):
    """Initialise tester settings and database access."""
    # Handle used to raise/lower proxy scores in MySQL.
    self.mysql = MysqlClient()
    # Target used to probe each proxy.
    self.TEST_URL = "https://www.baidu.com"
    # Response codes accepted as proof the proxy works.
    self.VALID_STATUS_CODES = [200]
    # How many proxies are tested per asyncio batch.
    self.BATCH_TEST_SIZE = 100