def __init__(self):
    """Open the shared Redis connection used by the resource pools."""
    # NOTE(review): orphan duplicate of Resource.__init__ below — presumably
    # left over from an edit; confirm whether it can be deleted.
    self.redis_conn = RedisConnection(dbinfo=REDIS).get_conn()
class Resource(object):
    """Cookie harvester.

    Rotates proxy IPs out of the Redis HTTPS proxy pool and drives a
    headless Chrome through them to collect Kuaishou cookies into the
    'ks_cookies' Redis list.
    """

    def __init__(self):
        # Shared Redis connection for the proxy and cookie pools.
        self.redis_conn = RedisConnection(dbinfo=REDIS).get_conn()
        # self.data_conn = MySQLConnection(dbinfo=MYSQL_DB).get_conn()

    def get_ip(self):
        """Rotate up to 3 proxies off HTTPS_PROXY_IP_POOL.

        RPOPLPUSH back onto the same list keeps the pool intact while
        cycling its entries. Returns a list of "ip:port" strings, or
        None when the pool is empty or Redis errors out (after a 30s
        back-off).
        """
        try:
            if self.redis_conn.llen(HTTPS_PROXY_IP_POOL) > 0:
                ip_li = []
                # Grab several IPs so cookies can be requested in parallel.
                for _ in range(3):
                    res = self.redis_conn.rpoplpush(HTTPS_PROXY_IP_POOL,
                                                    HTTPS_PROXY_IP_POOL)
                    if res is None:
                        # Pool drained between llen() and the pop; the old
                        # code would have crashed on None.decode().
                        break
                    ip_li.append(res.decode())
                return ip_li
            logger.info('获取Cookie时请求的IP池为空...')
            time.sleep(30)
        except Exception as e:  # was a bare except: keep best-effort, log cause
            logger.info('获取Cookie时请求IP池发生错误...')
            logger.info('错误:%s' % e)
            time.sleep(30)
        return None

    def verify_ip(self):
        """Probe one pooled proxy against baidu.com.

        Returns the "ip:port" string when the probe answers HTTP 200,
        otherwise False.
        """
        proxies = self.get_ip()
        if not proxies:  # pool empty or Redis error
            return False
        # BUG FIX: get_ip() returns a list; requests needs a single
        # "ip:port" string per scheme, not the whole list.
        proxy = proxies[0]
        real_proxy = {'http': proxy, 'https': proxy}
        res = requests.get('http://www.baidu.com/', proxies=real_proxy)
        return proxy if res.status_code == 200 else False

    def get_webpage(self):
        """For every rotated proxy: launch headless Chrome through it,
        load the Kuaishou front page, and push the harvested cookies
        (serialized as a "{'cookie': {...}}" string) onto 'ks_cookies'.
        """
        pros = self.get_ip()
        if not pros:
            return
        for proxy in pros:
            options = webdriver.ChromeOptions()
            # Linux browser binary (Windows CentBrowser path was dropped).
            options.binary_location = '/usr/bin/google-chrome-stable'
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-extensions')
            options.add_argument('--disable-gpu')
            logger.info('使用IP %s ...' % (proxy))
            # Chrome requires no spaces around '=' in --proxy-server.
            options.add_argument("--proxy-server=http://%s" % (proxy))
            driver = webdriver.Chrome(
                executable_path=
                '/home/seeta/zhangyanchao/chromedriver_install/chromedriver',
                chrome_options=options)
            try:
                driver.get('https://live.kuaishou.com/')
                cookies = driver.get_cookies()
                logger.info(cookies)
                if '未连接到互联网' in driver.page_source or not cookies:
                    # Dead proxy / offline page: move on to the next proxy.
                    # (A recursive retry here could loop forever.)
                    logger.info('访问快手首页出现问题, 重新请求新的IP并访问...')
                    continue
                cookie = {res['name']: res['value'] for res in cookies}
                # Single-quoted repr on purpose: the consumer re-parses it
                # by swapping quotes before json.loads.
                cookie_d = "{'cookie':" + str(cookie) + '}'
                logger.info(cookie_d)
                # Upload to the server-side cookie pool.
                self.redis_conn.lpush('ks_cookies', cookie_d)
                people_li = [
                    'dagaoge666', 'meishi123458888', 'wangzulan',
                    '3xgghwn46skhkxa', 'sanda927', 'hs1590ai', 'xiaoyiyi',
                    '3xjb64qxiwbv2dm', 'huangbo666', 'Sanmei1997'
                ]
                # Warm the session on a random profile page.
                driver.get('https://live.kuaishou.com/profile/%s' %
                           (random.choice(people_li)))
                logger.info('获取首页成功, 刷新页面...')
                for _ in range(2):
                    driver.refresh()
                time.sleep(5)
            finally:
                # BUG FIX: always quit, even when a selenium call raises,
                # so headless Chrome processes are not leaked.
                driver.quit()
                logger.info('退出浏览器 ...')

    def scan_verify_cookie_pool(self, cookie_pool):
        """Run forever: top up `cookie_pool` via get_webpage() whenever
        it drops below FILL_RESOURCE_THRESHOLD; poll every 7 seconds."""
        logger.info("%s代理池资源总数:%s" %
                    (cookie_pool, self.redis_conn.llen(cookie_pool)))
        while True:
            try:
                if self.redis_conn.llen(cookie_pool) < FILL_RESOURCE_THRESHOLD:
                    self.get_webpage()
                time.sleep(7)
                logger.info('检测代理池时间...')
            except Exception as e:  # was a bare except; log the cause
                logger.info('获取页面时程序发生错误 ...')
                logger.info('错误:%s' % e)
                time.sleep(10)
class Resource(object):
    """Proxy-IP pool maintainer.

    Requests fresh proxies from a commercial vendor API and keeps the
    Redis HTTPS proxy pool above FILL_RESOURCE_THRESHOLD.
    """

    def __init__(self):
        # Shared Redis connection for the proxy pools.
        self.redis_conn = RedisConnection(dbinfo=REDIS).get_conn()
        # self.data_conn = MySQLConnection(dbinfo=MYSQL_DB).get_conn()

    def request_proxy_ip(self, url, number):
        """Ask the proxy vendor at `url` for `number` proxies.

        The count query-parameter name differs per vendor (see the
        commented alternatives). Returns the vendor's 'data' list
        (dicts with 'ip' and 'port'), or None on any failure.
        """
        # request_url = url + '&count=' + str(number)   # 蘑菇代理 (Mogu)
        request_url = url + '&num=' + str(number)  # 芝麻代理 (Zhima)
        # request_url = url + '&number=' + str(number)  # 快代理 (Kuaidaili)
        try:
            response = requests.get(request_url, headers=base_headers)
            if response.status_code == 200:
                result = json.loads(response.text)
                logger.info(result)
                # Some vendors also set a 'success' flag; the presence of
                # 'data' is what actually matters here.
                if result.get('data'):
                    logger.info('%s 成功获取%s个代理' %
                                (time.asctime(), len(result['data'])))
                    print(result['data'])
                    return result['data']  # proxies are returned here
                logger.error('%s 获取%s个代理失败,URL: %s' %
                             (time.asctime(), number, request_url))
        except Exception as e:
            logger.error('%s 获取%s个代理失败,URL: %s' %
                         (time.asctime(), number, request_url))
            logger.error('错误:%s' % e)
        return None

    def load_proxy_pool(self, protocol, proxies):
        """Push vendor proxies into the Redis pool as "ip:port" strings.

        Only protocol == 'https' entries are loaded; anything else is
        skipped, matching the single pool that exists.
        """
        for proxy in proxies:
            if protocol == 'https':
                proxy_item = '%s:%s' % (proxy['ip'], proxy['port'])
                self.redis_conn.lpush(HTTPS_PROXY_IP_POOL, proxy_item)
                logger.info("Added new proxy %s:%s into proxy pool: %s" %
                            (proxy['ip'], proxy['port'], HTTPS_PROXY_IP_POOL))

    def get_new_proxy(self, number=PROXY_REQUEST_CHUNK):
        """Request `number` fresh proxies and load them into the pool."""
        logger.info("request new proxy IP")
        proxies = self.request_proxy_ip(PROXY_REQUEST_URL, number)
        if proxies:
            logger.info("load new proxy IP")
            self.load_proxy_pool('https', proxies)
            # self.load_proxy_into_db(proxies)

    def scan_verify_proxy_pool(self, proxy_pool):
        """Run forever: refill `proxy_pool` via get_new_proxy() whenever
        it drops below FILL_RESOURCE_THRESHOLD; poll every 10 seconds."""
        logger.info("%s代理池资源总数:%s" %
                    (proxy_pool, self.redis_conn.llen(proxy_pool)))
        while True:
            try:
                if self.redis_conn.llen(proxy_pool) < FILL_RESOURCE_THRESHOLD:
                    self.get_new_proxy()
                time.sleep(10)
                print('检测代理池时间...')
            except Exception as e:  # was a bare except swallowing silently
                logger.error('错误:%s' % e)
                time.sleep(30)

    def get_resource_pool_size(self):
        """Return the current length of BXS_RESOURCE_POOL."""
        return self.redis_conn.llen(BXS_RESOURCE_POOL)
class Resource(object):
    """Cookie re-validator.

    Pops harvested Kuaishou cookies off a Redis pool, replays each one
    through a proxied headless Chrome, and sorts it into the valid
    (HTTPS_PROXY_COOKIE_POOL) or invalid (INVALID_KS_COOKIE_POOL) pool.
    """

    def __init__(self):
        # Shared Redis connection for the proxy and cookie pools.
        self.redis_conn = RedisConnection(dbinfo=REDIS).get_conn()
        # self.data_conn = MySQLConnection(dbinfo=MYSQL_DB).get_conn()

    def get_ip(self):
        """Rotate one proxy off HTTPS_PROXY_IP_POOL.

        Returns a one-element list of "ip:port" strings, or None when
        the pool is empty or Redis errors out (after a 30s back-off).
        """
        try:
            if self.redis_conn.llen(HTTPS_PROXY_IP_POOL) > 0:
                ip_li = []
                for _ in range(1):
                    res = self.redis_conn.rpoplpush(HTTPS_PROXY_IP_POOL,
                                                    HTTPS_PROXY_IP_POOL)
                    if res is None:
                        # Pool drained between llen() and the pop; the old
                        # code would have crashed on None.decode().
                        break
                    ip_li.append(res.decode())
                return ip_li
            logging.warning('获取Cookie时请求的IP池为空...')
            time.sleep(30)
        except Exception as e:  # was a bare except; log the cause
            logging.warning('获取Cookie时请求IP池发生错误...')
            logging.warning('错误:%s' % e)
            time.sleep(30)
        return None

    def verify_ip(self):
        """Probe one pooled proxy against baidu.com.

        Returns the "ip:port" string when the probe answers HTTP 200,
        otherwise False.
        """
        proxies = self.get_ip()
        if not proxies:  # pool empty or Redis error
            return False
        # BUG FIX: get_ip() returns a list; requests needs a single
        # "ip:port" string per scheme, not the whole list.
        proxy = proxies[0]
        real_proxy = {'http': proxy, 'https': proxy}
        res = requests.get('http://www.baidu.com/', proxies=real_proxy)
        return proxy if res.status_code == 200 else False

    def get_webpage(self, cookiess):
        """Replay `cookiess` ({'cookie': {name: value}}) through a proxied
        headless Chrome against a Kuaishou profile page and push it onto
        the valid or invalid cookie pool accordingly."""
        pros = self.get_ip()
        if not pros:
            return
        for proxy in pros:
            options = webdriver.ChromeOptions()
            options.binary_location = '/usr/bin/google-chrome-stable'  # Linux
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-extensions')
            options.add_argument('--disable-gpu')
            options.add_argument("--proxy-server=http://%s" % (proxy))
            driver = webdriver.Chrome(
                executable_path=
                '/home/seeta/zhangyanchao/chromedriver_install/chromedriver',
                chrome_options=options)
            # A page must be loaded first so Selenium knows which site the
            # cookies belong to before add_cookie is called.
            driver.get("http://httpbin.org/ip")
            if '未连接到互联网' in driver.page_source:
                logging.warning('访问快手首页出现问题, 关闭浏览器...')
                driver.quit()
                logging.warning('把 Cookie 重新放入 invalid_cookies 池中...')
                # BUG FIX: lpush needs a string; cookiess is a dict here.
                # The sibling branches already push str(cookiess).
                self.redis_conn.lpush(INVALID_KS_COOKIE_POOL, str(cookiess))
            else:
                driver.get("http://www.baidu.com/")
                try:
                    expires = str(self.get_time())
                    # NOTE(review): Selenium documents the expiry key as
                    # 'expiry' (int); 'expires' may be ignored — confirm.
                    for k, v in cookiess['cookie'].items():
                        driver.add_cookie({
                            'domain': '.kuaishou.com',
                            'name': k,
                            'value': v,
                            'path': '/',
                            'expires': expires
                        })
                    people_li = [
                        'dagaoge666', 'TS-J0315J', 'meishi123458888',
                        'lol1314666', 'travelers', 'dingdang660', 'xue66666',
                        'wangzulan', '3xgghwn46skhkxa', 'sanda927',
                        'hs1590ai', 'xiaoyiyi', '3xjb64qxiwbv2dm',
                        'huangbo666', 'Sanmei1997'
                    ]
                    driver.get('https://live.kuaishou.com/profile/%s' %
                               (random.choice(people_li)))
                    if '未连接到互联网' in driver.page_source:
                        logging.warning('访问快手首页出现问题, 关闭浏览器...')
                        driver.quit()
                        logging.warning('把 Cookie 重新放入 invalid_cookies 池中...')
                        self.redis_conn.lpush(INVALID_KS_COOKIE_POOL,
                                              str(cookiess))
                    else:
                        for _ in range(2):
                            time.sleep(0.3)
                            driver.refresh()
                        # Cookie still works: put it back into the good pool.
                        logging.info('使用 %s 刷新成功, 放入 正常Cookies池中...' %
                                     (cookiess))
                        print('使用 %s 刷新成功, 放入 正常Cookies池中...' % (cookiess))
                        self.redis_conn.lpush(HTTPS_PROXY_COOKIE_POOL,
                                              str(cookiess))
                        # Quit to clear the browser cache.
                        time.sleep(4)
                        driver.quit()
                except Exception as e:
                    logging.warning('赋值 Cookie 时发生错误 %s' % (e))
                    logging.warning('把 Cookie 重新放入 invalid_cookies 池中...')
                    self.redis_conn.lpush(INVALID_KS_COOKIE_POOL, str(cookiess))
                    driver.quit()

    def scan_verify_cookie_pool(self, cookie_pool):
        """Run forever: pop cookies off `cookie_pool` and re-validate each
        via get_webpage(); idle 3 seconds when the pool is empty."""
        logger.info("%s代理池资源总数:%s" %
                    (cookie_pool, self.redis_conn.llen(cookie_pool)))
        while True:
            try:
                # 后期如果重复严重改成 set (switch to a set if duplicates pile up)
                if self.redis_conn.llen(cookie_pool) > 0:
                    # Entries were stored via str(dict); swap the quotes so
                    # json.loads can parse them.
                    cookiess = json.loads(
                        self.redis_conn.rpop(cookie_pool).decode().replace(
                            "'", '"'))
                    # self.redis_conn.lpush(cookie_pool, str(cookiess))  # for testing
                    self.get_webpage(cookiess)
                else:
                    time.sleep(3)  # raise this if cookies pile up
                    logging.warning('检测代理池时间...')
            except Exception as e:
                print(e)
                time.sleep(3)

    def get_time(self):
        """Return a POSIX timestamp (float, microsecond precision) one
        year from now; used as the replayed cookies' expiry."""
        expire_at = datetime.datetime.now() + datetime.timedelta(days=365)
        # Equivalent to the old str/strptime/mktime round-trip, without
        # the string-formatting detour.
        timeStamp = expire_at.timestamp()
        print('timeStamp', timeStamp)
        return timeStamp