def freeProxyWallThird():
    """Scrape ip:port proxies from proxylistplus.com (fresh HTTP list)."""
    page_urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
    request = WebRequest()
    for page_url in page_urls:
        resp = request.get(page_url)
        # ip and port sit in adjacent <td> cells; non-greedy gap between them
        for ip, port in re.findall(
                r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
                resp.text):
            yield '{}:{}'.format(ip, port)
def freeProxyWallSecond():
    """Scrape base64-encoded proxies from proxy-list.org, pages 1-9."""
    import base64
    page_urls = ['https://proxy-list.org/english/index.php?p=%s' % n
                 for n in range(1, 10)]
    request = WebRequest()
    for page_url in page_urls:
        resp = request.get(page_url)
        # each entry is embedded in the page as Proxy('<base64 ip:port>')
        for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
            yield base64.b64decode(encoded).decode()
def freeProxyWallThird():
    """Yield ip:port proxies scraped from proxylistplus.com."""
    request = WebRequest()
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>'
    for target in ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']:
        page = request.get(target)
        pairs = re.findall(pattern, page.text)
        for pair in pairs:
            yield ':'.join(pair)
def run(self):
    """Yield ip:port proxies scraped from the www.ip3366.net free list."""
    request = WebRequest()
    for target in ['http://www.ip3366.net/free/']:
        page = request.get(target)
        pairs = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
            page.text)
        for ip, port in pairs:
            yield "%s:%s" % (ip, port)
def run(self):
    """Yield ip:port proxies scraped from cn-proxy.com pages."""
    # NOTE(review): [\w\W] matches exactly ONE character between the two
    # cells — presumably matches this site's markup; confirm before changing.
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>'
    request = WebRequest()
    for target in ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']:
        page = request.get(target)
        for pair in re.findall(pattern, page.text):
            yield ':'.join(pair)
def freeProxySixth():
    """
    Xdaili free proxies (讯代理) http://www.xdaili.cn/
    :return: generator of 'ip:port' strings
    """
    url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
    request = WebRequest()
    # FIX: the original had a commented-out `# try:` and no handler, so any
    # network error or non-JSON response escaped the generator; the sibling
    # variants of this fetcher in this file treat failures as best-effort.
    try:
        res = request.get(url).json()
        for row in res['RESULT']['rows']:
            yield '{}:{}'.format(row['ip'], row['port'])
    except Exception:
        # best-effort fetcher: an unreachable/changed endpoint yields nothing
        pass
def freeProxy18():
    """
    Xiaoshu proxy (小舒代理) http://www.xsdaili.cn
    :return: generator of 'ip:port' strings from the newest daily post
    """
    base_url = 'http://www.xsdaili.cn'
    request = WebRequest()
    r = request.get(base_url, timeout=10)
    urls = re.findall(r'<a href="([^"]+)"[^>]+', r.text)
    for url in urls:
        if 'dayProxy' in url:
            # BUG FIX: base_url contains no '{}' placeholder, so the original
            # base_url.format(url) returned base_url unchanged and the detail
            # page was never fetched; join host + relative path instead.
            # (Also dropped the leftover debug print of the URL.)
            url_detail = base_url + url
            r_detail = request.get(url_detail, timeout=10)
            proxys = re.findall(
                r'(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})',
                r_detail.text)
            for proxy in proxys:
                yield ':'.join(proxy)
            # only the first (newest) daily post is scraped
            break
def freeProxy15():
    """Scrape ip:port proxies from the four xiladaili.com listing pages."""
    targets = [
        'http://www.xiladaili.com/putong/',
        "http://www.xiladaili.com/gaoni/",
        "http://www.xiladaili.com/http/",
        "http://www.xiladaili.com/https/",
    ]
    request = WebRequest()
    addr_re = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}"
    for target in targets:
        page = request.get(target, timeout=10)
        for addr in re.findall(addr_re, page.text):
            yield addr.strip()
def freeProxyWallFourth():
    """Scrape deduplicated ip:port proxies from free-proxy-list.net."""
    request = WebRequest()
    for target in ['https://free-proxy-list.net']:
        page = request.get(target)
        tree = etree.HTML(page.text)
        body = tree.xpath('//tbody')[0]
        # first two cells of each row are ip and port; dedupe via a set
        unique = {'{}:{}'.format(row[0].text, row[1].text) for row in body}
        for proxy in unique:
            yield proxy
def getCheckerproxy():
    """
    checkerproxy.net daily archive API.
    :return: generator of proxy address strings (the API's 'addr' field)
    """
    # FIX: renamed local from `str`, which shadowed the builtin.
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    url = 'https://checkerproxy.net/api/archive/{}'.format(today)
    request = WebRequest()
    try:
        res = request.get(url, timeout=10).json()
        for row in res:
            # NOTE(review): type==2 / kind==2 filter taken from the original;
            # the codes' meaning isn't documented here — confirm against the API
            if row['type'] == 2 and row['kind'] == 2:
                yield row['addr']
    except Exception:
        # best-effort: network/JSON failures simply yield nothing
        pass
def run(self):
    """Yield ip:port proxies scraped from proxy.coderbusy.com (CN, page 1)."""
    request = WebRequest()
    pattern = 'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>'
    for target in ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']:
        page = request.get(target)
        for ip, port in re.findall(pattern, page.text):
            yield '{}:{}'.format(ip, port)
def freeProxyTen():
    """
    Yun proxy (云代理) http://www.ip3366.net/free/
    :return: generator of 'ip:port' strings
    """
    request = WebRequest()
    for target in ['http://www.ip3366.net/free/']:
        page = request.get(target)
        pairs = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
            page.text)
        for ip, port in pairs:
            yield "%s:%s" % (ip, port)
def freeProxySecond(proxy_number=100):
    """
    Scrape 66ip (代理66) http://www.66ip.cn/
    :param proxy_number: how many proxies to request from the endpoint
    :return: generator of 'ip:port' strings
    """
    url = ("http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip="
           "&sxa=&submit=%CC%E1++%C8%A1&textarea=").format(proxy_number)
    request = WebRequest()
    body = request.get(url).text
    # the endpoint returns a plain page with ip:port tokens embedded
    for addr in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', body):
        yield addr
def freeProxyWallFirst():
    """
    Outside-the-wall site cn-proxy.
    :return: generator of 'ip:port' strings
    """
    request = WebRequest()
    targets = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
    for target in targets:
        page = request.get(target)
        # NOTE(review): [\w\W] matches exactly one char between cells;
        # kept as-is — presumably matches this site's markup
        pairs = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>',
            page.text)
        for pair in pairs:
            yield ':'.join(pair)
def freeProxyNinth():
    """
    Coder proxy (码农代理) https://proxy.coderbusy.com/ — marked disabled upstream.
    :return: generator of 'ip:port' strings
    """
    request = WebRequest()
    pattern = 'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>'
    for target in ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']:
        page = request.get(target, timeout=10)
        for pair in re.findall(pattern, page.text):
            yield ':'.join(pair)
def freeProxyNinth():
    """
    Coder proxy (码农代理) https://proxy.coderbusy.com/
    :return: generator of 'ip:port' strings
    """
    request = WebRequest()
    for target in ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']:
        page = request.get(target)
        for ip, port in re.findall(
                'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>',
                page.text):
            yield '{}:{}'.format(ip, port)
def freeProxyTen():
    """
    Yun proxy (云代理) http://www.ip3366.net/free/ — pages 1-3.
    :return: generator of 'ip:port' strings
    """
    request = WebRequest()
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>'
    for page_no in range(1, 4):
        target = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page_no)
        page = request.get(target)
        for pair in re.findall(pattern, page.text):
            yield ":".join(pair)
def freeProxy17():
    """
    ihuan HTTP proxy (小幻HTTP代理) https://ip.ihuan.me
    :return: generator of 'ip:port' strings from today's post
    """
    base_url = 'https://ip.ihuan.me{}'
    request = WebRequest()
    url = base_url.format('/today.html')
    r = request.get(url, timeout=10)
    urls = re.findall(r'<a href="([^"]+)"[^>]+', r.text)
    for url in urls:
        if 'today' in url:
            url_detail = base_url.format(url)
            # FIX: removed leftover debug print(url_detail)
            r_detail = request.get(url_detail, timeout=10)
            proxys = re.findall(
                r'(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})',
                r_detail.text)
            for proxy in proxys:
                yield ':'.join(proxy)
            # only the first matching detail page is scraped
            break
def freeProxyEight():
    """Scrape mimiip.com lists: high-anon, ordinary-anon and transparent pages."""
    pages = range(1, 10)
    targets = (['http://www.mimiip.com/gngao/%s' % n for n in pages]     # domestic high-anon
               + ['http://www.mimiip.com/gnpu/%s' % n for n in pages]    # domestic ordinary-anon
               + ['http://www.mimiip.com/gntou/%s' % n for n in pages])  # domestic transparent
    request = WebRequest()
    # NOTE(review): greedy `.*` before the port cell — kept byte-identical;
    # verify against the live markup before tightening
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>'
    for target in targets:
        page = request.get(target)
        for pair in re.findall(pattern, page.text):
            yield ':'.join(pair)
def run(self):
    """Yield base64-decoded proxies from proxy-list.org; stop on first non-200."""
    import base64
    request = WebRequest()
    targets = ['https://proxy-list.org/english/index.php?p=%s' % n
               for n in range(1, 10)]
    for target in targets:
        resp = request.get(target)
        # abort pagination as soon as the site stops answering normally
        if resp.status_code != 200:
            break
        for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
            yield base64.b64decode(encoded).decode()
def getHtmlTree(url, proxy_ip):
    """
    Fetch *url* and parse the body into a BeautifulSoup tree.
    :param url: page to fetch
    :param proxy_ip: 'ip:port[:prot]' string — currently unused (see TODO)
    :return: BeautifulSoup parsed with the lxml parser
    """
    # TODO: route the request through proxy_ip (disabled code kept below)
    wr = WebRequest()
    # throttle: 2s delay per request
    time.sleep(2)
    # ip, port, prot = proxy_ip.split(':')
    # proxies = {prot: '{}://{}:{}'.format(prot, ip, port)}
    # BUG FIX: the fetch line was commented out, leaving `html` undefined and
    # raising NameError on every call; restore the direct (proxy-less) fetch.
    html = wr.get(url, proxies=None).text
    return BeautifulSoup(html, features='lxml')
def freeProxyTen():
    """Scrape ip:port proxies from all four ip3366.net list types."""
    request = WebRequest()
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>'
    for stype in (1, 2, 3, 4):
        target = "http://www.ip3366.net/free/?stype=%d" % stype
        page = request.get(target)
        for ip, port in re.findall(pattern, page.text):
            yield "{}:{}".format(ip, port)
def freeProxyWallSecond():
    """
    https://proxy-list.org/english/index.php
    :return: generator of decoded 'ip:port' strings
    """
    import base64
    request = WebRequest()
    for page_no in range(1, 10):
        target = 'https://proxy-list.org/english/index.php?p=%s' % page_no
        page = request.get(target)
        # entries appear as Proxy('<base64 ip:port>') in inline scripts
        for token in re.findall(r"Proxy\('(.*?)'\)", page.text):
            yield base64.b64decode(token).decode()
def freeNordVPN():
    """
    nordvpn.com proxy search endpoint (best-effort).
    :return: generator of 'ip:port' strings
    """
    url = 'https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters%5B0%5D%5Bname%5D=proxy-country&searchParameters%5B0%5D%5Bvalue%5D=&searchParameters%5B1%5D%5Bname%5D=proxy-ports&searchParameters%5B1%5D%5Bvalue%5D=&searchParameters%5B3%5D%5Bname%5D=https&searchParameters%5B3%5D%5Bvalue%5D=on&limit=50000&action=getProxies&offset=0'
    request = WebRequest()
    try:
        rows = request.get(url, timeout=10).json()
    except Exception:
        return  # network/JSON failure: yield nothing
    for row in rows:
        yield '{}:{}'.format(row['ip'], row['port'])
def freeProxy18():
    """
    zdaye (站大爷) https://www.zdaye.com/dayProxy.html
    :return: generator of 'ip:port' strings from the newest daily post
    """
    base_url = 'https://www.zdaye.com{}'
    request = WebRequest()
    url = base_url.format('/dayProxy.html')
    r = request.get(url, timeout=10)
    urls = re.findall(r'<a href="([^"]+)"+', r.text)
    for url in urls:
        if 'dayProxy' in url:
            url_detail = base_url.format(url)
            r_detail = request.get(url_detail, timeout=10)
            proxys = re.findall(
                r'(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})',
                r_detail.text)
            for proxy in proxys:
                # BUG FIX: the original only print()ed each match and had no
                # yield at all, so the function returned None and produced no
                # proxies; emit ip:port like the sibling fetchers do.
                # (Leftover debug prints removed.)
                yield ':'.join(proxy)
            # only the first (newest) daily post is scraped
            break
def freeProxySixth():
    """
    Xdaili free proxies http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10
    :return: generator of 'ip:port' strings; yields nothing on failure
    """
    endpoint = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
    request = WebRequest()
    try:
        payload = request.get(endpoint).json()
        rows = payload['RESULT']['rows']
    except Exception:
        return  # best-effort: swallow network/JSON errors
    for row in rows:
        yield '{}:{}'.format(row['ip'], row['port'])
def freeProxyCustumOne():
    """
    31f proxy (三一代理) http://31f.cn/
    :return: generator of 'ip:port' strings
    """
    request = WebRequest()
    page = request.get('http://31f.cn/', timeout=10)
    pairs = re.findall(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
        page.text)
    for ip, port in pairs:
        yield "{}:{}".format(ip, port)
def myFreeProxy3():
    """
    Plain-text proxy list APIs (proxyscrape / proxy-list.download).
    :return: generator of non-empty 'ip:port' lines
    """
    urls = [
        "https://api.proxyscrape.com?request=displayproxies&proxytype=http&timeout=7000&country=DE&anonymity=elite",
        "https://www.proxy-list.download/api/v1/get?type=https&anon=elite",
    ]
    request = WebRequest()
    for url in urls:
        sourcecode = request.get(url, timeout=10)
        for line in sourcecode.text.splitlines():
            line = line.strip()
            # FIX: skip blank/whitespace lines the original yielded verbatim,
            # so consumers never receive an empty proxy string
            if line:
                yield line
def freeProxy_shenji():
    """
    Shenji proxy (神鸡代理) http://www.shenjidaili.com/open/
    :return: generator of 'ip:port' strings; prints and swallows errors
    """
    try:
        url = 'http://www.shenjidaili.com/open/'
        request = WebRequest()
        r = request.get(url, timeout=10)
        # BUG FIX: the dots in the original pattern were unescaped, so '.'
        # matched ANY character and could accept malformed addresses.
        proxies = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})</td>', r.text)
        for proxy in proxies:
            yield proxy
    except Exception as e:
        print(e)
def freeProxySixth():
    """
    Xdaili (讯代理) http://www.xdaili.cn/ — site no longer offers free proxies.
    :return: generator of 'ip:port' strings; yields nothing on failure
    """
    endpoint = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
    request = WebRequest()
    try:
        payload = request.get(endpoint, timeout=10).json()
        rows = payload['RESULT']['rows']
    except Exception:
        return  # best-effort: swallow network/JSON errors
    for row in rows:
        yield '{}:{}'.format(row['ip'], row['port'])
def freeProxySecond(proxy_number=100):
    """
    Scrape 66ip (代理66) http://www.66ip.cn/
    :param proxy_number: how many proxies to request
    :return: generator of 'ip:port' strings
    """
    url = ("http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip="
           "&sxa=&submit=%CC%E1++%C8%A1&textarea=").format(proxy_number)
    request = WebRequest()
    # .text is the decoded response body (.content would be raw bytes)
    body = request.get(url).text
    for addr in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', body):
        yield addr
def freeProxyCustumTwo(page_count=2):
    """
    Free proxy IP library (免费代理IP库) http://www.89ip.cn/
    :param page_count: number of index pages to scrape
    :return: generator of 'ip:port' strings
    """
    # FIX: WebRequest() was constructed inside the loop; hoist the invariant
    # so one client serves every page.
    request = WebRequest()
    for i in range(1, page_count + 1):
        url = 'http://www.89ip.cn/index_{}.html'.format(i)
        r = request.get(url, timeout=10)
        # strip tabs/newlines first so ip and port cells become adjacent
        proxies = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td><td>(\d+)</td>',
            re.sub(r'[\t\n]', '', r.text))
        for proxy in proxies:
            yield ":".join(proxy)
def freeProxySixth():
    """
    Xdaili free proxies http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10
    :return: generator of proxies composed via GetFreeProxy.compose
    """
    url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=100'
    request = WebRequest()
    try:
        res = request.get(url).json()
        for row in res['RESULT']['rows']:
            yield GetFreeProxy.compose(row['ip'], row['port'], row['anony'], row['type'])
    except Exception as e:
        # BUG FIX: this is a module-level function, so the original
        # `self.log.warning(...)` raised NameError whenever fetching failed;
        # report the failure without referencing a nonexistent `self`.
        print("fetch proxy failed: " + str(e))
def freeProxyEight():
    """
    89ip http://www.89ip.cn/index.html — bulk text endpoint.
    :return: generator of 'ip:port' strings
    """
    request = WebRequest()
    targets = ["http://www.89ip.cn/tqdl.html?api=1&num=30&port=&address=&isp="]
    for target in targets:
        page = request.get(target, timeout=10)
        # entries are separated by <br> tags in the response body
        for addr in re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+)<br>',
                               page.text):
            yield addr
def freeProxyEleven():
    """Scrape ip:port proxies from all four iphai.com free lists."""
    request = WebRequest()
    # cells may carry surrounding whitespace, hence the lazy \s*? padding
    pattern = (r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>'
               r'[\s\S]*?<td>\s*?(\d+)\s*?</td>')
    for suffix in ('ng', 'np', 'wg', 'wp'):
        page = request.get('http://www.iphai.com/free/' + suffix)
        for ip, port in re.findall(pattern, page.text):
            yield "{}:{}".format(ip, port)
def freeProxyEight():
    """
    Mimi proxy (秘密代理) http://www.mimiip.com — fetched THROUGH a proxy.
    """
    pages = range(1, 10)
    targets = (['http://www.mimiip.com/gngao/%s' % n for n in pages]     # domestic high-anon
               + ['http://www.mimiip.com/gnpu/%s' % n for n in pages]    # domestic ordinary-anon
               + ['http://www.mimiip.com/gntou/%s' % n for n in pages])  # domestic transparent
    request = WebRequest()
    # NOTE(review): greedy `.*` before the port cell — kept byte-identical;
    # verify against the live markup before tightening
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>'
    for target in targets:
        page = request.get(target, use_proxy=True)
        for pair in re.findall(pattern, page.text):
            yield ':'.join(pair)
def freeProxyEleven():
    """
    IP sea (IP海) http://www.iphai.com/free/ng
    :return: generator of 'ip:port' strings
    """
    targets = [
        'http://www.iphai.com/free/ng',
        'http://www.iphai.com/free/np',
        'http://www.iphai.com/free/wg',
        'http://www.iphai.com/free/wp',
    ]
    request = WebRequest()
    pattern = (r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>'
               r'[\s\S]*?<td>\s*?(\d+)\s*?</td>')
    for target in targets:
        page = request.get(target)
        for pair in re.findall(pattern, page.text):
            yield ":".join(pair)
def getHtmlTree(url, **kwargs):
    """
    Fetch *url* and parse the raw body into an lxml HTML element tree.
    :param url: page to fetch
    :param kwargs: accepted for interface compatibility (unused here)
    :return: root element from etree.HTML
    """
    header = {'Connection': 'keep-alive',
              'Cache-Control': 'max-age=0',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate, sdch',
              'Accept-Language': 'zh-CN,zh;q=0.8',
              }
    # TODO: fetch through a proxy server instead of a direct request
    wr = WebRequest()
    # throttle: 2s delay per request
    time.sleep(2)
    raw = wr.get(url=url, header=header).content
    return etree.HTML(raw)