def run_getter(self, cycle=GETTER_CYCLE):
    """Run the cookie getter forever, pausing between rounds.

    Args:
        cycle: Seconds to sleep after each ``Getter.run()`` call.
            Defaults to ``GETTER_CYCLE``.

    This method never returns; it is used as a process target.
    """
    cookie_getter = Getter()
    while True:
        logger.info('开始抓取cookies')
        cookie_getter.run()
        # Wait out the configured interval before the next round.
        time.sleep(cycle)
def run_tester(self, cycle=TESTER_CYCLE):
    """Run the cookie tester forever, pausing between rounds.

    Args:
        cycle: Seconds to sleep after each ``Tester.run()`` call.
            Defaults to ``TESTER_CYCLE``.

    This method never returns; it is used as a process target.
    """
    cookie_tester = Tester()
    while True:
        logger.info('开始检查')
        cookie_tester.run()
        # Wait out the configured interval before the next round.
        time.sleep(cycle)
async def test_one_proxy(self, key, proxy):
    """Check whether one stored cookie is still accepted by the target site.

    Sends a GET request to ``TEST_URL`` with ``proxy`` (minus its last
    character) as the ``Cookie`` header, then looks for the text of the
    ``span.ni-sp-name`` element in the returned page.

    Args:
        key: Account identifier; used in log messages and passed to
            ``self.redis``.
        proxy: The cookie string under test. Despite the parameter name it
            is sent as the ``Cookie`` header, not used as a network proxy.

    Returns:
        None. The outcome is reported through ``self.redis.max(key, proxy)``
        when the status code is in ``TRUE_STATUS_CODE`` and the element text
        is non-empty, and through ``self.redis.decrease(key, proxy)``
        otherwise. Request or parsing errors are only logged.
    """
    import asyncio
    import functools

    try:
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # The last character of the stored value is dropped before sending
            # (presumably a trailing separator -- TODO confirm).
            "Cookie": proxy[:-1],
            "Host": "www.tianyancha.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
        }
    except Exception as e:
        # e.g. ``proxy`` is not sliceable; the cookie's record is left untouched.
        logger.error(f'{key} 账号, 测试错误 - {e}')
        return

    try:
        # ``requests`` is blocking; run it in the default executor so other
        # coroutines on the same event loop are not stalled for up to 30 s.
        loop = asyncio.get_event_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(requests.get, TEST_URL, headers=headers, timeout=30),
        )
        html = etree.HTML(response.text)
        user = "".join(html.xpath('//span[@class="ni-sp-name"]//text()'))
        if response.status_code in TRUE_STATUS_CODE and user:
            # Cookie is usable.
            self.redis.max(key, proxy)
            logger.info(f'用户可用 - {user}')
        else:
            # Cookie is not usable.
            self.redis.decrease(key, proxy)
            logger.warning(f'{key} 账号, 状态吗错误')
    except asyncio.CancelledError:
        # Never swallow task cancellation (it is an ``Exception`` subclass
        # on Python < 3.8).
        raise
    except Exception as e:
        # Request/parsing failures are logged without changing the record.
        logger.error(f'{key} 账号, 请求错误 - {e}')
def run(self):
    """Launch each enabled component in its own process.

    ``TESTER_ENABLE``, ``GETTER_ENABLE`` and ``API_ENABLE`` are checked in
    that order; for every truthy flag a ``Process`` targeting the matching
    ``run_*`` method is started. The processes are not joined here.
    """
    logger.info('cookies池开始运行')

    if TESTER_ENABLE:
        Process(target=self.run_tester).start()

    if GETTER_ENABLE:
        Process(target=self.run_getter).start()

    if API_ENABLE:
        Process(target=self.run_api).start()
def run(self):
    """Fetch ``self.url`` and, if a captcha page is served, try to pass it.

    Returns:
        ``200`` when the ``span.ni-sp-name`` element is present in the first
        response; the second response's status code when the captcha was
        solved and the element is then present; ``503`` in every other case.
    """
    resp = self.download(self.url)
    title = self.verify(resp.text)
    logger.info(f'判断网页名称 - {title}')

    user = etree.HTML(resp.text).xpath('//span[@class="ni-sp-name"]')
    if user:
        # Log "page normal" only once the login element has actually been
        # found, so the message no longer appears for captcha/failed pages.
        logger.info('页面正常')
        return 200

    # Anything other than the captcha page is treated as a failure.
    if title != '天眼查校验':
        return 503

    # Click-captcha page: verify_image() returns the check endpoint's state.
    if self.verify_image() != 'ok':
        self.email.run('Login verification failed, such as continuous occurrence of the representative Super Eagle is not enough!')
        return 503

    # Verification reported success; fetch the page again and re-check.
    response = self.download(self.url)
    result = etree.HTML(response.text).xpath('//span[@class="ni-sp-name"]')
    if result:
        logger.info(f'验证成功 - {response.status_code}')
        return response.status_code
    return 503
def slice(self, targetImage, bgImage):
    """Stitch the two captcha images together and get click coordinates.

    Args:
        targetImage: URL-safe base64 string of the image listing the
            characters to click, in order.
        bgImage: URL-safe base64 string of the image containing the
            characters to be clicked.

    Returns:
        A list of ``{'x': int, 'y': int}`` dicts, one per ``|``-separated
        ``"x,y"`` entry in the solver's ``pic_str``. ``y`` is reduced by 30
        because ``bgImage`` is pasted 30 px below the top of the combined
        picture. The list is empty when no coordinates are returned.
    """
    prompt_img = Image.open(BytesIO(base64.urlsafe_b64decode(targetImage)))
    board_img = Image.open(BytesIO(base64.urlsafe_b64decode(bgImage)))

    # Combined 320x130 canvas: prompt at the top, clickable image below it.
    canvas = Image.new('RGB', (320, 130), 'red')
    canvas.paste(prompt_img, (0, 0))
    canvas.paste(board_img, (0, 30))

    png_buffer = BytesIO()
    canvas.save(png_buffer, format='PNG')

    # NOTE(security): solver credentials are hard-coded here; they should be
    # moved to configuration and rotated.
    chaojiying = Chaojiying_Client("L54555", "Li891004", '90004')
    # 9004 is the captcha type code passed to the solving service.
    dict_data = chaojiying.PostPic(png_buffer.getvalue(), 9004)

    # A missing or None ``pic_str`` is treated as "no coordinates" instead
    # of raising AttributeError on ``None.split``.
    pic_str = (dict_data.get('pic_str') or '').split('|')
    logger.info(f'坐标信息 - {pic_str}')

    lis = []
    if pic_str[0]:
        for point in pic_str:
            parts = point.split(',')
            lis.append({'x': int(parts[0]), 'y': int(parts[1]) - 30})
    return lis
def verify_image(self):
    """Request a click captcha, solve it via ``self.slice`` and submit it.

    Returns:
        The ``state`` field of the check endpoint's JSON response, or
        ``None`` when the captcha endpoint returned no ``data`` payload.
    """
    # Millisecond timestamps used as query parameters; ``_`` trails ``t`` by 100.
    now_ms = int(datetime.now().timestamp() * 1000)
    url = "http://antirobot.tianyancha.com/captcha/getCaptcha.json?t={}&_={}".format(now_ms, now_ms - 100)
    result = self.download(url)

    payload = result.json()
    data = payload.get('data')
    if not data:
        # Without a payload there is nothing to solve; report "not verified"
        # instead of failing with AttributeError on ``None.get``.
        logger.warning(f'获取验证码失败 - {payload}')
        return None

    targetImage = data.get('targetImage')  # characters to click, in order
    bgImage = data.get('bgImage')          # image containing the characters
    captchaId = data.get('id')             # captcha id

    # Stitch the images and obtain the click coordinates.
    lis = self.slice(targetImage, bgImage)

    params = {
        'captchaId': captchaId,
        'clickLocs': json.dumps(lis),
        # Fresh timestamp taken after solving.
        't': str(int(datetime.now().timestamp() * 1000)),
    }
    resp = self.download("http://antirobot.tianyancha.com/captcha/checkCaptcha.json", params=params)

    # Parse the response body once and reuse it for logging and the result.
    verdict = resp.json()
    logger.info(f'验证结果 - {verdict}')
    return verdict.get('state')
def run(self):
    """Log in every account that has no stored cookie and save the result.

    For each username from ``self.accounts_db.usernames()`` that is not in
    ``self.redis.get()``, and while ``self.is_over_threshold()`` is false,
    waits 5 seconds, calls ``self.crawler.crawl_main(username, password)``
    and stores a truthy result with ``self.redis.add``. Errors raised during
    a login attempt are logged and the loop moves on to the next account.
    """
    accounts_usernames = self.accounts_db.usernames()
    keys = self.redis.get()

    for username in accounts_usernames:
        if username in keys:
            # This account already has a cookie in the pool.
            continue

        # The password is deliberately kept out of the log output.
        logger.info(f'正在生成Cookies - 账号 {username}')
        if self.is_over_threshold():
            continue

        password = self.accounts_db.get_value(username)
        try:
            # Pause between login attempts.
            time.sleep(5)
            cookie = self.crawler.crawl_main(username, password)
            if cookie:
                self.redis.add(username, cookie)
                logger.info(f"账号 {username} cookie有效")
            else:
                logger.info("监控到cookie为空, 登录失败")
        except Exception as e:
            logger.warning(f'请求出错 - {e}')
from run import Run
from util.configtion import logger

if __name__ == '__main__':
    # Script entry point: announce start-up, then hand over to the scheduler.
    logger.info('开始运行')
    Run().run()