async def proxy_site_66ip():
    try:
        # Iterate over the nationwide area index pages of 66ip.cn
        for i in range(0, 35):
            if i == 0:
                url = 'http://www.66ip.cn/'
            else:
                url = 'http://www.66ip.cn/areaindex_{area}/'.format(area=i)
            # Fetch the first 5 pages of IPs for each area
            for j in range(1, 6):
                url_page = url + '{page}.html'.format(page=j)
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url_page, headers=HEADERS) as response:
                        r = await response.text()
                        if r is not None and '' != r:
                            doc = PyQuery(r)
                            for each in doc('tr').items():
                                ip_address = each('td').eq(0).text()
                                pattern = re.compile(
                                    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                                match = pattern.match(ip_address)
                                if match:
                                    ip_port = each('td').eq(1).text()
                                    # Not fully async yet; to be resolved
                                    await SpiderProxy.verify_and_save(
                                        ip_address + ':' + ip_port, '66ip.cn')
    except Exception as e:
        Util.log_error(e)
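# One way to resolve the "not fully async yet" TODO above is to push the
# blocking verification into the event loop's thread-pool executor so the
# crawl coroutines are not stalled. A minimal sketch, assuming the check is a
# blocking, requests-based call; `check_proxy_blocking` is a hypothetical
# stand-in and not part of this project.
import asyncio
import functools
import requests


def check_proxy_blocking(proxy):
    # Blocking check: fetch httpbin through the proxy and report success.
    try:
        r = requests.get('http://httpbin.org/ip',
                         proxies={'http': 'http://' + proxy}, timeout=3)
        return r.status_code == 200
    except requests.RequestException:
        return False


async def verify_in_executor(proxy):
    # Run the blocking check in the default executor, keeping the loop responsive.
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(
        None, functools.partial(check_proxy_blocking, proxy))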
async def proxy_site_kuaidaili():
    try:
        header = dict(HEADERS)  # copy so the shared HEADERS dict is not mutated
        header['Cookie'] = '_ydclearance=b673c31fc021172e1d425859-915a-4546-bd2d-d88b08d45818-1491886397'
        for i in range(1, 6):
            url = 'http://www.kuaidaili.com/proxylist/{page}/'.format(page=i)
            async with aiohttp.ClientSession() as session:
                async with session.get(url=url, headers=header) as response:
                    r = await response.text()
                    if r is not None and '' != r:
                        doc = PyQuery(r)
                        for each in doc('#index_free_list > table > tbody')('tr').items():
                            ip_address = each('td[data-title=IP]').eq(0).text()
                            ip_port = each('td[data-title=PORT]').eq(0).text()
                            # Not fully async yet; to be resolved
                            await SpiderProxy.verify_and_save(
                                ip_address + ':' + ip_port, 'kuaidaili.com')
    except Exception as e:
        Util.log_error(e)
async def proxy_site_xici():
    try:
        # nn = high anonymity, nt = anonymous, wn = HTTPS, wt = HTTP
        forms = ['nn', 'nt', 'wn', 'wt']
        for form in forms:
            # First 10 pages of each list type
            for page in range(1, 11):
                url = 'http://www.xicidaili.com/' + form + '/' + str(page)
                header = HEADERS
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url, headers=header) as response:
                        r = await response.text()
                        if r is not None and '' != r:
                            doc = PyQuery(r)
                            for each in doc('tr').items():
                                ip_address = each('td').eq(1).text()
                                pattern = re.compile(
                                    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                                match = pattern.match(ip_address)
                                if match:
                                    ip_port = each('td').eq(2).text()
                                    # Not fully async yet; to be resolved
                                    await SpiderProxy.verify_and_save(
                                        ip_address + ':' + ip_port, 'xicidaili.com')
    except Exception as e:
        Util.log_error(e)
async def proxy_site_goubanjia():
    try:
        for index in range(1, 6):
            url = 'http://www.goubanjia.com/free/index{0}.shtml'.format(index)
            header = HEADERS
            async with aiohttp.ClientSession() as session:
                async with session.get(url=url, headers=header) as response:
                    r = await response.text()
                    if r is not None and '' != r:
                        doc = PyQuery(r)
                        for each in doc('td.ip').items():
                            # The IP and port are split across child tags;
                            # hidden (display: none) decoy tags must be skipped.
                            address = ''
                            for i, tag in enumerate(each('*').items()):
                                if i == 0:
                                    continue
                                if (tag.attr.style and 'none;' in tag.attr.style) \
                                        or '' == tag.text():
                                    continue
                                if 'port' in str(tag):
                                    address = address + ':' + tag.text()
                                else:
                                    address += tag.text()
                            pattern = re.compile(
                                r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}')
                            match = pattern.match(address)
                            if match:
                                # Not fully async yet; to be resolved
                                await VerifySave.verify_and_save(
                                    address, 'goubanjia.com')
    except Exception as e:
        Util.log_error('proxy_site_goubanjia: ' + str(e))
def execute(self, sql):
    try:
        with self.connection.cursor() as cursor:
            cursor.execute(sql)
            self.connection.commit()
    except Exception as e:
        Util.log_error(e)
    finally:
        self.connection.close()
def query_one(self, sql):
    try:
        with self.connection.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchone()
    except Exception as e:
        Util.log_error(e)
        return None
    finally:
        self.connection.close()
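# Because execute() and query_one() close the connection in their `finally`
# blocks, each MySql instance is good for exactly one statement. A minimal
# usage sketch, assuming the constructor signature used elsewhere in this
# file (DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET come from the
# project config); the SQL statements are illustrative only:
row = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
            DB_CHARSET).query_one('SELECT COUNT(*) FROM httpbin')
MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
      DB_CHARSET).execute('UPDATE httpbin SET alive=0 WHERE speed IS NULL')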
def checking(self, sql):
    try:
        fetchall = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
                         DB_CHARSET).query(sql)
        if fetchall is None:
            sql = 'SELECT * FROM httpbin WHERE alive=0 ORDER BY update_time'
            fetchall = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
                             DB_CHARSET).query(sql)
            if fetchall is None:
                time.sleep(10)
                return
        for each in fetchall:
            each_id = each[0]
            proxy = each[1] + ':' + each[2]
            verify_count = int(each[12]) + 1
            r = VerifyProxy().validate_proxy(proxy, protocol='http', timeout=3)
            threadLock.acquire()
            try:
                if isinstance(r, str):
                    # The proxy failed verification
                    sql = 'SELECT leave_count FROM httpbin WHERE id={0}'.format(each_id)
                    fetchone = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
                                     DB_CHARSET).query_one(sql)
                    alive = '0'
                    leave_count = int(fetchone[0]) - 1
                    if leave_count <= 0:
                        # After 20 failed attempts, delete the proxy record
                        sql = 'DELETE FROM httpbin WHERE id={0}'.format(each_id)
                        Util.log_error(sql)
                    else:
                        # Otherwise update the record
                        sql = "UPDATE httpbin SET verify_count={0}, leave_count={1}, alive={2} WHERE id={3}" \
                            .format(verify_count, leave_count, alive, each_id)
                        Util.log_info(sql)
                    MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
                          DB_CHARSET).execute(sql)
                elif isinstance(r, dict):
                    if 'exception' not in r.keys():
                        alive = '1'
                        leave_count = 20
                        speed = r['timedelta']
                        # Skip re-checking HTTPS support for now
                        # result_https = VerifyProxy().validate_proxy(proxy, 'https', timeout=3)
                        sql = "UPDATE httpbin SET verify_count={0}, speed={1}, alive={2}, leave_count={3} WHERE id={4}" \
                            .format(verify_count, speed, alive, leave_count, each_id)
                        Util.log_info(sql)
                        MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
                              DB_CHARSET).execute(sql)
            finally:
                # Release even if a DB call raises, so other workers are not deadlocked
                threadLock.release()
    except Exception as e:
        Util.log_error(e)
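# checking() takes a module-level threadLock and is meant to be run from
# several worker threads at once. A minimal driver sketch, assuming the class
# that owns checking() is called ProxyCheck (a hypothetical name) and that
# threadLock is a plain threading.Lock; the worker count and SQL are
# illustrative only:
import threading

threadLock = threading.Lock()


def start_checkers(worker_count=4):
    sql = 'SELECT * FROM httpbin WHERE alive=1 ORDER BY update_time'
    threads = [
        threading.Thread(target=ProxyCheck().checking, args=(sql,), daemon=True)
        for _ in range(worker_count)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()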
async def proxy_site_66ip_api():
    try:
        url = 'http://m.66ip.cn/mo.php?tqsl=3000'
        async with aiohttp.ClientSession() as session:
            async with session.get(url=url, headers=HEADERS) as response:
                r = await response.text()
                if r is not None and '' != r:
                    pattern = re.compile(
                        r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})<br />')
                    match = pattern.findall(r)
                    if len(match) > 0:
                        for each in match:
                            # Not fully async yet; to be resolved
                            await SpiderProxy.verify_and_save(each, 'm.66ip.cn')
    except Exception as e:
        Util.log_error(e)
async def proxy_site_ip181():
    try:
        url = 'http://www.ip181.com'
        header = dict(HEADERS)  # copy so the shared HEADERS dict is not mutated
        header['Host'] = 'www.ip181.com'
        async with aiohttp.ClientSession() as session:
            async with session.get(url=url, headers=header) as response:
                r = await response.text(encoding='gb2312')
                if r is not None and '' != r:
                    doc = PyQuery(r)
                    for each in doc('tr').items():
                        ip_address = each('td').eq(0).text()
                        pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                        match = pattern.match(ip_address)
                        if match:
                            ip_port = each('td').eq(1).text()
                            # Not fully async yet; to be resolved
                            await SpiderProxy.verify_and_save(
                                ip_address + ':' + ip_port, 'ip181.com')
    except Exception as e:
        Util.log_error(e)
async def proxy_site_mimvp():
    # Domestic transparent/elite and foreign transparent/elite lists
    start_urls = ['in_tp', 'in_hp', 'out_tp', 'out_hp']
    try:
        for url in start_urls:
            url = 'http://proxy.mimvp.com/free.php?proxy=' + url
            header = HEADERS
            async with aiohttp.ClientSession() as session:
                async with session.get(url=url, headers=header) as response:
                    r = await response.text()
                    if r is not None and '' != r:
                        # The markup lacks the opening <tr> tags, so patch it
                        # before handing it to PyQuery
                        r = r.replace('<tbody>', '<tbody><tr>').replace('</tr>', '</tr><tr>')
                        doc = PyQuery(r)
                        for each in doc('tbody')('tr').items():
                            address = each('td').eq(1).text()
                            pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                            match = pattern.match(address)
                            if match:
                                # The port is rendered as an image and has to be OCR'd
                                port_img_url = each('td').eq(2)('img').attr.src
                                if 'common' in port_img_url:
                                    port_img_url = 'http://proxy.mimvp.com/' + port_img_url
                                    r_img = requests.get(port_img_url, headers=header)
                                    image = Image.open(BytesIO(r_img.content))
                                    port = Util.image_to_str(image)
                                    try:
                                        port = int(port)
                                        # Not fully async yet; to be resolved
                                        await VerifySave.verify_and_save(
                                            address + ':' + str(port), 'mimvp.com')
                                    except Exception:
                                        continue
    except Exception as e:
        Util.log_error('proxy_site_mimvp: ' + str(e))
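# Util.image_to_str is not shown in this section; a minimal sketch of what it
# might look like, assuming pytesseract is used to read the single-line port
# images (the --psm 7 option and the digit whitelist are assumptions, not
# confirmed project settings):
import pytesseract


def image_to_str(image):
    # Treat the image as one text line containing only digits.
    return pytesseract.image_to_string(
        image, config='--psm 7 -c tessedit_char_whitelist=0123456789').strip()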
async def proxy_site_kuaidaili():
    try:
        header = dict(HEADERS)  # copy so the shared HEADERS dict is not mutated
        header['Cookie'] = 'channelid=0; sid=1492138570160333; _gat=1; _ga=GA1.2.486444152.1491549216'
        for i in range(1, 6):
            url = 'http://www.kuaidaili.com/proxylist/{page}/'.format(page=i)
            async with aiohttp.ClientSession() as session:
                async with session.get(url=url, headers=header) as response:
                    r = await response.text()
                    if r is not None and '' != r:
                        doc = PyQuery(r)
                        for each in doc('#freelist > table > tbody')('tr').items():
                            ip_address = each('td[data-title=IP]').eq(0).text()
                            ip_port = each('td[data-title=PORT]').eq(0).text()
                            # Not fully async yet; to be resolved
                            await VerifySave.verify_and_save(
                                ip_address + ':' + ip_port, 'kuaidaili.com')
    except Exception as e:
        Util.log_error('proxy_site_kuaidaili: ' + str(e))
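# The proxy_site_* coroutines above are independent of one another, so they
# can be scheduled together on one event loop. A minimal driver sketch using
# only functions defined in this section (the selection of crawlers is
# illustrative; note that the second proxy_site_kuaidaili definition is the
# one that ends up being called):
import asyncio


def crawl_all():
    loop = asyncio.get_event_loop()
    tasks = [
        proxy_site_66ip(),
        proxy_site_66ip_api(),
        proxy_site_xici(),
        proxy_site_goubanjia(),
        proxy_site_ip181(),
        proxy_site_mimvp(),
        proxy_site_kuaidaili(),
    ]
    loop.run_until_complete(asyncio.gather(*tasks))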