class Getter:
    """Log in the accounts that have no stored cookie yet and save the result."""

    def __init__(self, website='tianyancha'):
        """Create the Redis clients and the login crawler.

        :param website: site name, forwarded to ``RedisClient``
        """
        self.website = website
        # NOTE(review): both clients are built with the same arguments
        # ('accounts', website) -- confirm one was not meant to be a cookie store.
        self.redis = RedisClient('accounts', self.website)
        self.crawler = Crawler()
        self.accounts_db = RedisClient('accounts', self.website)

    def is_over_threshold(self):
        """Return True when ``self.redis.count()`` has reached POOL_UPPER_THRESHLD."""
        return self.redis.count() >= POOL_UPPER_THRESHLD

    def run(self):
        """Generate a cookie for every account that is not yet in ``self.redis``.

        Accounts already present are only reported. For the others the crawler
        is called (after a 5 second pause) unless the pool is over the
        threshold. A failure for one account is reported and does not stop
        the remaining accounts.
        """
        accounts_usernames = self.accounts_db.usernames()
        keys = self.redis.get()
        for username in accounts_usernames:
            if username in keys:
                print('账号', username, "存在于cookie池里")
                continue
            password = self.accounts_db.get_value(username)
            print('正在生成Cookies', '账号', username, '密码', password)
            if self.is_over_threshold():
                continue
            try:
                # Pause between logins before calling the crawler.
                time.sleep(5)
                cookie = self.crawler.crawl_main(username, password)
                if cookie:
                    self.redis.add(username, cookie)
                    print("cookie有效")
                else:
                    print("监控到cookie为空")
            except Exception as e:
                # Best effort per account, but the failure is no longer hidden.
                print('生成Cookies失败', '账号', username, e)
def verify_cookie(cls):
    """Return ``cls(cookie=...)`` built around a cookie checked against weibo.cn.

    The cookie is read from ``RedisClient().get()`` exactly once, so the value
    sent with the request, the value handed to ``add_score`` and the value
    passed to ``cls`` are always the same one.

    :param cls: callable accepting a ``cookie`` keyword argument
    :return: the object produced by ``cls``, or None when the check raised
    """
    baseurl = 'https://weibo.cn/'
    conn = RedisClient()
    cookie = conn.get()
    if not cookie:
        # Nothing stored yet: log in, save cookies, then verify again.
        login = Login()
        login.save_cookies()
        return cls(cookie=Spider.verify_cookie())
    try:
        response = requests.get(baseurl, cookies=cookie)
        if response.status_code == 200:
            return cls(cookie=cookie)
        # The cookie that just failed is the one handed to add_score.
        conn.add_score(cookie)
        return cls(cookie=Spider.verify_cookie())
    except Exception:
        print('verify error')
        return None
def verify_cookie(cls):
    """Check the stored cookie against weibo.cn and wrap it in ``cls``.

    ``RedisClient().get()`` is called a single time; the same cookie is then
    used for the HTTP request, for ``add_score`` on a non-200 answer, and for
    the ``cls(cookie=...)`` call on success.

    :param cls: callable accepting a ``cookie`` keyword argument
    :return: the object produced by ``cls``, or None when the check raised
    """
    baseurl = 'https://weibo.cn/'
    conn = RedisClient()
    cookie = conn.get()
    if cookie:
        try:
            response = requests.get(baseurl, cookies=cookie)
            if response.status_code == 200:
                return cls(cookie=cookie)
            # Non-200 answer: hand the cookie that failed to add_score, retry.
            conn.add_score(cookie)
            return cls(cookie=Spider.verify_cookie())
        except Exception:
            print('verify error')
            return None
    # No cookie stored: log in, save cookies, then verify again.
    login = Login()
    login.save_cookies()
    return cls(cookie=Spider.verify_cookie())
class AipClient(object):
    """Baidu OCR client whose results are cached in Redis (one shared instance)."""

    def __new__(cls, *args, **kw):
        """Return the shared instance, creating it on the first call."""
        if hasattr(cls, '_instance'):
            return cls._instance
        cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, appid, api_key, secrrt_key, redis_url):
        """Store the credentials and build the OCR and Redis clients."""
        self.appid, self.api_key, self.secrrt_key = appid, api_key, secrrt_key
        self.client = AipOcr(appid, api_key, secrrt_key)
        self.redis = RedisClient(redis_url)

    @property
    def options(self):
        """Option dict sent with every OCR request."""
        request_options = {
            "language_type": "CHN_ENG",
            "detect_direction": "false",
            "detect_language": "true",
            "probability": "true",
        }
        return request_options

    def General(self, image, **kwargs):
        """Call ``basicGeneral`` on the image; extra keyword arguments are ignored."""
        print('调取General_api 识别')
        response = self.client.basicGeneral(image, self.options)
        return response

    def Accurate(self, image):
        """Call ``basicAccurate`` on the image."""
        print('调取Accurate_api 识别')
        response = self.client.basicAccurate(image, self.options)
        return response

    def orc(self, image, **kwargs):
        """Recognise the image, trying General first and Accurate second.

        The first non-empty ``words_result`` wins: its first entry's ``words``
        is cached under the image hash and returned. Returns '*' when both
        calls come back empty.
        """
        digest = MD5.md5(image)
        attempts = (
            lambda: self.General(image, **kwargs),
            lambda: self.Accurate(image),
        )
        for attempt in attempts:
            words_result = attempt().get('words_result')
            if words_result:
                text = words_result[0]['words']
                self.redis.add(digest, text)
                return text
        return '*'

    def run(self, image, **kwargs):
        """Return the cached text for the image, or run OCR when nothing is cached."""
        digest = MD5.md5(image)
        if not self.redis.exists(digest):
            return self.orc(image, **kwargs)
        return self.redis.get(digest)
class ImportDatabase(object):
    """Copy the items yielded by the Kuaidai source into the Redis client."""

    def __init__(self):
        """Create the Redis client and the Kuaidai source."""
        self.client = RedisClient()
        self.kuaidai = KuaidaiProcuration()

    def set_ip(self, value):
        """Store a single value through the client."""
        self.client.set(value)

    def main(self):
        """Store every parsed item the client does not hold yet, then print the count."""
        for candidate in self.kuaidai.parse_url():
            # The stored items are re-read for each candidate.
            if candidate in self.client.get():
                continue
            self.client.set(candidate)
        total = self.client.count()
        print(f'获取IP个数:{total}')
class VaildTester(object):
    """Base tester: feeds every item returned by the Redis client to ``test``."""

    # Request headers shared by all testers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    }

    def __init__(self):
        """Create the Redis client the items are read from."""
        self.client = RedisClient()

    def test(self, value):
        """Check one item; subclasses must override this."""
        raise NotImplementedError

    def run(self):
        """Call ``test`` once for each item returned by ``self.client.get()``."""
        for entry in self.client.get():
            self.test(entry)
def run():
    """Check the stored cookies of every site in ACCOUNTS and refresh bad ones.

    For each site the login helper is the module-level class named
    ``<Website>Login`` (site key capitalised). For each account the stored
    cookie is passed to ``good_cookies``; a rejected cookie is deleted, the
    account is logged in again and the new cookies are stored.

    :raises NameError: when no ``<Website>Login`` class exists in this module
    """
    for website in ACCOUNTS.keys():
        print('*' * 40)
        print('%s正在检测cookies...' % website)
        acc = RedisClient('accounts', website)
        coo = RedisClient('cookies', website)
        # Look the class up by name instead of eval()-ing a string built from
        # a config key; a missing class still raises NameError as before.
        login_name = website.capitalize() + 'Login'
        try:
            login_cls = globals()[login_name]
        except KeyError:
            raise NameError("name '%s' is not defined" % login_name) from None
        log = login_cls()
        for username, password in acc.get_all().items():
            # On a first run the data may not be loaded yet, so errors for one
            # account are reported and the loop goes on.
            try:
                cookies = coo.get(username)
                if log.good_cookies(cookies):
                    print('\t%s\t\tCookies已通过检测...' % username)
                else:
                    print('\t%s\t\tCookies未通过检测!!!' % username)
                    coo.delete(username)
                    print('\t%s\t\tCookies已删除!!!' % username)
                    log.login(username, password)
                    cookies = log.get_cookies()
                    coo.set(username, cookies)
            except Exception as e:
                print('数据库为空,请等数据录入之后再进行测试:', e.args)
class AipClient(object):
    """Baidu OCR client with a Redis result cache and a per-font word table."""

    def __new__(cls, *args, **kw):
        """Return the shared instance, creating it on the first call."""
        if hasattr(cls, '_instance'):
            return cls._instance
        cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, appid, api_key, secrrt_key, redis_url):
        """Store the credentials and build the OCR and Redis clients."""
        self.appid, self.api_key, self.secrrt_key = appid, api_key, secrrt_key
        self.client = AipOcr(appid, api_key, secrrt_key)
        self.redis = RedisClient(redis_url)

    @property
    def options(self):
        """Option dict sent with every OCR request."""
        request_options = {
            "language_type": "CHN_ENG",
            "detect_direction": "false",
            "detect_language": "false",
            "probability": "true",
        }
        return request_options

    def General(self, image, **kwargs):
        """Call ``basicGeneral`` on the image; extra keyword arguments are ignored."""
        print('调取General_api 识别')
        response = self.client.basicGeneral(image, self.options)
        return response

    def Accurate(self, image):
        """Call ``basicAccurate`` on the image."""
        print('调取Accurate_api 识别')
        response = self.client.basicAccurate(image, self.options)
        return response

    def orc(self, image, font_key, word, **kwargs):
        """Recognise the image, trying General first and Accurate second.

        On the first usable ``words_result`` the text is cached under the image
        hash, recorded with ``hadd(font_key, word, text)`` and returned. When
        both calls fail and FIXED is truthy, the image is written once to
        ``BASE_DIR/<hash>.jpg`` for manual correction. Returns '*' on failure.
        """
        digest = MD5.md5(image)
        engines = ((self.General, kwargs), (self.Accurate, {}))
        for engine, extra in engines:
            words_result = engine(image, **extra).get('words_result')
            if words_result and words_result != '*':
                text = words_result[0]['words']
                self.redis.add(digest, text)
                self.redis.hadd(font_key, word, text)
                return text
        if FIXED:
            # Manual correction: keep the unrecognised image on disk.
            target = os.path.join(BASE_DIR, digest + '.jpg')
            if not os.path.exists(target):
                with open(target, 'wb') as handle:
                    handle.write(image)
        return '*'

    def run(self, image, font_key, word, **kwargs):
        """Return the text for the image from the cache, or via OCR on a miss.

        A cache hit is also recorded with ``hadd(font_key, word, text)``.
        """
        digest = MD5.md5(image)
        if not self.redis.exists(digest):
            return self.orc(image, font_key, word, **kwargs)
        cached = self.redis.get(digest)
        self.redis.hadd(font_key, word, cached)
        return cached
class CookiesGenerator(object):
    """Base class that logs accounts in through a browser and stores their cookies."""

    def __init__(self, website='default'):
        """Create the Redis clients and open the browser.

        :param website: site name, forwarded to ``RedisClient``
        """
        self.website = website
        self.cookies_db = RedisClient('cookies', self.website)
        self.accounts_db = RedisClient('accounts', self.website)
        self.init_browser()

    def __del__(self):
        self.close()

    def init_browser(self):
        """Open the browser selected by BROWSER_TYPE ('PhantomJS' or 'Chrome').

        For any other value no browser is created.
        """
        if BROWSER_TYPE == 'PhantomJS':
            # Work on a copy so the shared capabilities dict is not modified.
            caps = DesiredCapabilities.PHANTOMJS.copy()
            caps[
                "phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
            self.browser = webdriver.PhantomJS(desired_capabilities=caps)
            self.browser.set_window_size(1400, 500)
        elif BROWSER_TYPE == 'Chrome':
            self.browser = webdriver.Chrome()

    def new_cookies(self, username, password):
        """Log in and return a result dict; subclasses must override this.

        ``run`` reads the keys 'status' and 'content' from the result.

        :param username: account name
        :param password: account password
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def process_cookies(self, cookies):
        """Turn a list of cookie dicts into a ``{name: value}`` dict.

        :param cookies: iterable of dicts with 'name' and 'value' keys
        :return: dict mapping each cookie name to its value
        """
        return {cookie['name']: cookie['value'] for cookie in cookies}

    def run(self):
        """Generate and store cookies for every account that has none yet.

        Status 1 stores the cookies, status 2 deletes the account, and any
        other status only prints the returned content.
        """
        accounts_usernames = self.accounts_db.usernames()
        cookies_usernames = self.cookies_db.usernames()

        for username in accounts_usernames:
            if username in cookies_usernames:
                continue
            password = self.accounts_db.get(username)
            print('正在生成Cookies', '账号', username, '密码', password)
            result = self.new_cookies(username, password)
            if result.get('status') == 1:
                cookies = self.process_cookies(result.get('content'))
                print('成功获取到Cookies', cookies)
                if self.cookies_db.set(username, json.dumps(cookies)):
                    print('成功保存Cookies')
            elif result.get('status') == 2:
                # Status 2 is treated as a bad account: remove it.
                print(result.get('content'))
                if self.accounts_db.delete(username):
                    print('成功删除账号')
            else:
                print(result.get('content'))
        # The loop never breaks, so this is always printed once it finishes.
        print('所有账号都已经成功获取Cookies')

    def close(self):
        """Close the browser if there is one.

        A browser that was never created, or was already closed, is reported
        instead of raising, so ``__del__`` and repeated calls are safe.
        """
        try:
            print('Closing Browser')
            self.browser.close()
            del self.browser
        except (TypeError, AttributeError):
            print('Browser not opened')
class Tester:
    """Check stored cookies against the target site and update their score."""

    def __init__(self, website='tianyancha'):
        """Create the Redis client used to read and score cookies."""
        self.website = website
        self.redis = RedisClient('accounts', self.website)

    @staticmethod
    def _batches(items, size):
        """Split ``items`` into consecutive, non-overlapping slices of ``size``."""
        return [items[i:i + size] for i in range(0, len(items), size)]

    async def test_one_proxy(self, key, proxy):
        """Request TEST_URL with one cookie and record whether it still works.

        The cookie counts as usable when the status code is in
        TRUE_STATUS_CODE and the page contains a user name; then
        ``redis.max`` is called, otherwise ``redis.decrease``. Errors are
        printed, never raised.

        :param key: Redis key the cookie belongs to
        :param proxy: cookie string; its last character is dropped before use
        """
        # NOTE(review): requests.get blocks the event loop, so the cookies of
        # one batch are still checked one after another.
        try:
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Cookie": proxy[:-1],
                "Host": "www.tianyancha.com",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
            }
            try:
                response = requests.get(TEST_URL, headers=headers, timeout=30)
                html = etree.HTML(response.text)
                # A non-empty user name is taken as proof of being logged in.
                user = "".join(
                    html.xpath('//span[@class="ni-sp-name"]//text()'))
                print(user, '*' * 20)
                if response.status_code in TRUE_STATUS_CODE and user:
                    self.redis.max(key, proxy)
                    print(key, 100, '可用')
                else:
                    self.redis.decrease(key, proxy)
                    print(key, -20, "状态码错误")
            except Exception as e:
                print(key, '请求错误', -20, e)
        except Exception as e:
            print(key, '测试错误', -20, e)

    async def start(self):
        """Test every stored cookie in batches of BATCH_TEST_SIZE.

        Keys containing "tianyancha" are skipped. Each batch is awaited to
        completion, followed by a 5 second pause that does not block the
        event loop.
        """
        try:
            keys = self.redis.get()
            for key in keys:
                if "tianyancha" in key:
                    continue
                proxies = self.redis.all(key)
                print(key)
                for test_proxies in self._batches(proxies, BATCH_TEST_SIZE):
                    tasks = [
                        self.test_one_proxy(key, proxy)
                        for proxy in test_proxies
                    ]
                    # Without the await the checks were scheduled but never
                    # given a chance to run.
                    await asyncio.gather(*tasks)
                    await asyncio.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)

    def run(self):
        """Run ``start`` to completion in a new event loop."""
        asyncio.run(self.start())
class CookiesGenerator(object):
    """Base class that logs accounts in with headless Chrome and stores their cookies."""

    def __init__(self, website = 'default'):
        """Create the Redis clients and open the browser.

        :param website: site name, forwarded to ``RedisClient``
        """
        self.website = website
        self.cookies_db = RedisClient('cookies', self.website)
        self.accounts_db = RedisClient('accounts', self.website)
        self.init_browser()

    def __del__(self):
        self.close()

    def init_browser(self):
        """Start a Chrome driver with the ``--headless`` argument."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        # NOTE(review): newer Selenium releases expect ``options=`` here --
        # confirm against the pinned Selenium version before changing.
        self.browser = webdriver.Chrome(chrome_options = chrome_options)

    def new_cookies(self, username, password):
        """Log in and return a result dict; subclasses must override this.

        ``run`` reads the keys 'status' and 'content' from the result.

        :param username: account name
        :param password: account password
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def process_cookies(self, cookies):
        """Turn a list of cookie dicts into a ``{name: value}`` dict.

        :param cookies: iterable of dicts with 'name' and 'value' keys
        :return: dict mapping each cookie name to its value
        """
        return {cookie['name']: cookie['value'] for cookie in cookies}

    def run(self):
        """Generate and store cookies for every account that has none yet.

        Status 1 stores the cookies, status 2 deletes the account, and any
        other status only prints the returned content.
        """
        account_usernames = self.accounts_db.usernames()
        cookies_usernames = self.cookies_db.usernames()

        for username in account_usernames:
            if username in cookies_usernames:
                continue
            password = self.accounts_db.get(username)
            print('Generating new cookies...[username: {} password: {}]'.format(username, password))
            result = self.new_cookies(username, password)
            if result.get('status') == 1:
                cookies = self.process_cookies(result.get('content'))
                print('Generated successfully!')
                if self.cookies_db.set(username, json.dumps(cookies)):
                    print('Saved new cookies successfully!')
            elif result.get('status') == 2:
                # Status 2 is treated as a bad account: remove it.
                print(result.get('content'))
                if self.accounts_db.delete(username):
                    print('Deleted invalid account successfully! [username: {}]'.format(username))
            else:
                print(result.get('content'))
        # The loop never breaks, so this is always printed once it finishes.
        print('All accounts has got cookies successfully!')

    def close(self):
        """Close the browser if there is one.

        A browser that was never created (for example when the driver failed
        to start) or was already closed is reported instead of raising, so
        ``__del__`` and repeated calls are safe.
        """
        try:
            print('Closing browser...')
            self.browser.close()
            del self.browser
            print('Browser has closed!')
        except (TypeError, AttributeError):
            print('Browser not opened!')
class CookiesGenerator:
    """
    Base class for cookie generators.

    Subclasses implement ``new_cookies``; ``run`` walks the account store and
    saves a cookie for every account that does not have one yet.
    """

    def __init__(self, site, single_cycle_limit):
        """
        :param site: site name, forwarded to ``RedisClient`` and the helper functions
        :param single_cycle_limit: maximum number of logins performed by one ``run`` call
        """
        self.site = site
        # Upper bound on logins per run.
        self.single_cycle_limit = single_cycle_limit
        self.cookies_db = RedisClient('cookies', self.site)
        self.accounts_db = RedisClient('accounts', self.site)

    def new_cookies(self, username, password):
        """
        Generate new cookies; must be overridden by subclasses.

        ``run`` reads the keys 'status' and 'result' from the returned mapping.

        :param username: account name
        :param password: account password
        :return: result mapping (see above)
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    @staticmethod
    def process_cookies(cookies):
        """
        Turn a list of cookie dicts into a ``{name: value}`` dict.

        :param cookies: iterable of dicts with 'name' and 'value' keys
        :return: dict mapping each cookie name to its value
        """
        return {cookie['name']: cookie['value'] for cookie in cookies}

    def run(self):
        """
        Generate cookies for the accounts in the pool that have none yet.

        Stops early once ``single_cycle_limit`` logins were attempted and
        sleeps 90-180 seconds after each attempt.

        :return: None
        """
        accounts_usernames = self.accounts_db.usernames()
        cookies_usernames = self.cookies_db.usernames()
        # Number of login attempts made so far in this run.
        num = 0
        for username in accounts_usernames:
            if num >= self.single_cycle_limit:
                print('已达单轮登录上限, 停止登录! ')
                return
            if username not in cookies_usernames:
                # The stored values are decoded from bytes before use; the
                # membership test above runs on the undecoded username.
                password = self.accounts_db.get(username).decode('utf-8')
                username = username.decode('utf-8')
                print('正在生成 Cookies -> 账号: {}, 密码: {}'.format(
                    username, password))
                result = self.new_cookies(username, password)
                if result.get('status') == '1':
                    # A list result is converted to a name/value dict; any
                    # other result is stored unchanged.
                    if isinstance(result['result'], list):
                        cookies = self.process_cookies(result['result'])
                    else:
                        cookies = result['result']
                    print('成功生成 Cookies : {}'.format(cookies))
                    if self.cookies_db.set(username, json.dumps(cookies)):
                        print('成功保存至 Cookie Pool!')
                    else:
                        # Falsy result from set(): fall back to set_account.
                        print('疑似 redis 连接断开, 未成功保存, 尝试调用录入器保存...')
                        set_account(
                            self.site, 'cookies',
                            '{} {}'.format(username, json.dumps(cookies)))
                # Status '3' (wrong password per the original note): remove the account.
                elif result.get('status') == '3':
                    print(result['result'])
                    if self.accounts_db.delete(username):
                        print('删除账号: ', username)
                    else:
                        # Falsy result from delete(): fall back to delete_account.
                        print('疑似 redis 断开连接, 删除失败, 尝试调用删除器删除...')
                        delete_account(self.site, 'accounts', username)
                else:
                    print(result.get('result'))
                num += 1
            else:
                # Account already has cookies: skip it without sleeping.
                continue
            # Random pause between two login attempts.
            sleep_time = random.randint(90, 180)
            time.sleep(sleep_time)
        print('所有账号生成完毕! ')
class AipClient(object):
    """Baidu OCR client with a Redis result cache and a per-font word table."""

    def __new__(cls, *args, **kw):
        """Return the shared instance, creating it on the first call."""
        if hasattr(cls, '_instance'):
            return cls._instance
        cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, appid, api_key, secrrt_key, redis_url):
        """Store the credentials and build the OCR and Redis clients."""
        self.appid, self.api_key, self.secrrt_key = appid, api_key, secrrt_key
        self.client = AipOcr(appid, api_key, secrrt_key)
        self.redis = RedisClient(redis_url)

    @property
    def options(self):
        """Option dict sent with every OCR request."""
        request_options = {
            "language_type": "CHN_ENG",
            "detect_direction": "false",
            "detect_language": "false",
            "probability": "false",
        }
        return request_options

    def General(self, image, **kwargs):
        """Call ``basicGeneral`` on the image; extra keyword arguments are ignored."""
        print('调取General_api 识别')
        response = self.client.basicGeneral(image, self.options)
        return response

    def Accurate(self, image):
        """Call ``basicAccurate`` on the image."""
        print('调取Accurate_api 识别')
        response = self.client.basicAccurate(image, self.options)
        return response

    def orc(self, image, font_key, word, **kwargs):
        """Recognise the image, trying General first and Accurate second.

        On the first usable ``words_result`` the text is cached under the image
        hash, recorded with ``hadd(font_key, word, text)`` and returned.
        Returns '*' when neither call yields a result.
        """
        digest = MD5.md5(image)
        engines = ((self.General, kwargs), (self.Accurate, {}))
        for engine, extra in engines:
            words_result = engine(image, **extra).get('words_result')
            if words_result and words_result != '*':
                text = words_result[0]['words']
                self.redis.add(digest, text)
                self.redis.hadd(font_key, word, text)
                return text
        return '*'

    def run(self, image, font_key, word, **kwargs):
        """Return the text for the image from the cache, or via OCR on a miss.

        A cache hit is also recorded with ``hadd(font_key, word, text)``.
        """
        digest = MD5.md5(image)
        if not self.redis.exists(digest):
            return self.orc(image, font_key, word, **kwargs)
        cached = self.redis.get(digest)
        self.redis.hadd(font_key, word, cached)
        return cached
class CookiesGenerator():
    """Base class that logs accounts in through a browser and stores their cookies."""

    def __init__(self, website="default"):
        """Create the Redis clients and open the browser.

        :param website: site name, forwarded to ``RedisClient``
        """
        self.website = website
        self.cookie_db = RedisClient('cookies', self.website)
        self.account_db = RedisClient('accounts', self.website)
        self.browser = self.init_browser()

    def init_browser(self):
        """Create the browser selected by BROWSER_TYPE.

        :return: the driver for 'PhantomJS' or 'Chrome', None for any other value
        """
        if BROWSER_TYPE == "PhantomJS":
            # Work on a copy so the shared capabilities dict is not modified.
            caps = DesiredCapabilities.PHANTOMJS.copy()
            caps[
                "phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
            browser = webdriver.PhantomJS(desired_capabilities=caps)
            browser.set_window_size(1400, 500)
            return browser
        if BROWSER_TYPE == "Chrome":
            return webdriver.Chrome()
        return None

    def new_cookies(self, username, password):
        """Log in and return a result dict; subclasses must override this.

        ``run`` reads the keys 'status' and 'content' from the result.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def parse_cookies_dict(self, cookies):
        """Turn a list of cookie dicts into a ``{name: value}`` dict.

        :param cookies: iterable of dicts with 'name' and 'value' keys
        :return: dict mapping each cookie name to its value
        """
        return {cookie['name']: cookie['value'] for cookie in cookies}

    def run(self):
        """Generate and store cookies for every account that has none yet.

        Status 1 stores the cookies, status 2 deletes the account, and any
        other status only prints the returned content.
        """
        account_usernames = self.account_db.usernames()
        cookies_usernames = self.cookie_db.usernames()

        for username in account_usernames:
            if username in cookies_usernames:
                continue
            password = self.account_db.get(username)
            print("正在生成Cookies", username, password)
            result = self.new_cookies(username, password)
            if result.get("status") == 1:
                cookies = self.parse_cookies_dict(result.get('content'))
                print("成功获取Cookies", cookies)
                if self.cookie_db.set(username, json.dumps(cookies)):
                    print("成功保存Cookies")
            elif result.get('status') == 2:
                # Status 2 is reported as a wrong password: remove the account.
                print("密码错误")
                if self.account_db.delete(username):
                    print("删除成功")
            else:
                print(result.get("content"))
        print("所有账号已经成功获取Cookies")

    def close(self):
        """Close the browser if there is one.

        ``init_browser`` may have returned None, and ``close`` may be called
        again from ``__del__`` after the attribute was deleted; both cases are
        reported instead of raising.
        """
        try:
            print("Closing Browser")
            self.browser.close()
            del self.browser
        except (TypeError, AttributeError):
            print("Browser not opened")

    def __del__(self):
        self.close()