def freeProxyWallThird():
    """proxylistplus.com fresh HTTP list (outside-the-wall source); yields 'ip:port'."""
    page_urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
    web = WebRequest()
    pair_re = re.compile(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
    for page_url in page_urls:
        resp = web.get(page_url, timeout=10)
        for ip, port in pair_re.findall(resp.text):
            yield ip + ':' + port
def testWebRequest():
    """Smoke-test class WebRequest in Util/WebRequest.py.

    A plain GET against baidu must come back with HTTP 200.
    :return:
    """
    response = WebRequest().get('https://www.baidu.com/')
    assert response.status_code == 200
def freeProxy15_us():
    """us-proxy.org free proxy table; yields 'ip:port'."""
    web = WebRequest()
    resp = web.get('https://www.us-proxy.org', timeout=10)
    pair_re = re.compile(
        r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>')
    for ip, port in pair_re.findall(resp.text):
        yield ip + ':' + port
def freeProxyCustumThree():
    """Crossin lab demo proxy page; yields 'ip:port'."""
    resp = WebRequest().get('http://lab.crossincode.com/proxy/', timeout=10)
    pair_re = re.compile(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
    for ip, port in pair_re.findall(resp.text):
        yield "{}:{}".format(ip, port)
def freeProxySixth():
    """xdaili.cn free-IP JSON API; yields 'ip:port' strings.

    Best-effort source: any failure (network error, unexpected JSON
    shape) is swallowed so one dead endpoint cannot break the crawl.
    """
    url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
    request = WebRequest()
    try:
        res = request.get(url).json()
        for row in res['RESULT']['rows']:
            yield '{}:{}'.format(row['ip'], row['port'])
    except Exception:
        # deliberately silent (unused `e` binding removed)
        pass
def get_html_tree(url):
    """Fetch *url* and parse the response body into an HTML element tree.

    :param url: page to download
    :return: root element produced by etree.HTML
    """
    response = WebRequest().get(url)
    return etree.HTML(response.content)
def freeProxy_31f():
    """31f.cn HTTP proxy table; yields 'ip:port'.

    If the expected table markup is missing from the page (layout change,
    error page) the page is skipped instead of raising IndexError.
    """
    urls = ['http://31f.cn/http-proxy/']
    request = WebRequest()
    for url in urls:
        r = request.get(url, timeout=10)
        # isolate the proxy <table> first, then pull ip/port pairs from it
        tables = re.findall(r'table table-striped([\s\S]+?)</table>', r.text)
        if not tables:
            continue  # original indexed [0] unconditionally -> IndexError
        # raw string: '\d' in a plain literal is an invalid escape sequence
        proxies = re.findall(
            r'<td>\d{1,2}</td>[^<]<td> *([\d\.]+) *</td>[^<]<td> *([\d]+) *</td>',
            tables[0])
        for proxy in proxies:
            yield ':'.join(proxy)
def run(self):
    """Crawl www.ip3366.net/free/ and yield 'ip:port' strings."""
    pages = ['http://www.ip3366.net/free/']
    web = WebRequest()
    pair_re = re.compile(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
    for page in pages:
        resp = web.get(page)
        for ip, port in pair_re.findall(resp.text):
            yield "{}:{}".format(ip, port)
def freeProxyWallSecond():
    """proxy-list.org (outside-the-wall): proxies are base64 inside Proxy('...') calls."""
    import base64
    web = WebRequest()
    for page_no in range(1, 10):
        resp = web.get('https://proxy-list.org/english/index.php?p=%s' % page_no)
        for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
            yield base64.b64decode(encoded).decode()
def myFreeProxy2():
    """Scrape a large set of foreign free-proxy pages; yields 'ip:port'."""
    page_urls = [
        "http://www.aliveproxy.com/high-anonymity-proxy-list/",
        "http://www.aliveproxy.com/anonymous-proxy-list/",
        "http://www.aliveproxy.com/fastest-proxies/",
        "http://www.aliveproxy.com/us-proxy-list/",
        "http://www.aliveproxy.com/gb-proxy-list/",
        "http://www.aliveproxy.com/fr-proxy-list/",
        "http://www.aliveproxy.com/de-proxy-list/",
        "http://www.aliveproxy.com/jp-proxy-list/",
        "http://www.aliveproxy.com/ca-proxy-list/",
        "http://www.aliveproxy.com/ru-proxy-list/",
        "http://www.aliveproxy.com/proxy-list-port-80/",
        "http://www.aliveproxy.com/proxy-list-port-81/",
        "http://www.aliveproxy.com/proxy-list-port-3128/",
        "http://www.aliveproxy.com/proxy-list-port-8000/",
        "http://www.aliveproxy.com/proxy-list-port-8080/",
        "http://webanetlabs.net/publ/24",
        "http://www.proxz.com/proxy_list_high_anonymous_0.html",
        "http://www.proxz.com/proxy_list_anonymous_us_0.html",
        "http://www.proxz.com/proxy_list_uk_0.html",
        "http://www.proxz.com/proxy_list_ca_0.html",
        "http://www.proxz.com/proxy_list_cn_ssl_0.html",
        "http://www.proxz.com/proxy_list_jp_0.html",
        "http://www.proxz.com/proxy_list_fr_0.html",
        "http://www.proxz.com/proxy_list_port_std_0.html",
        "http://www.proxz.com/proxy_list_port_nonstd_0.html",
        "http://www.proxz.com/proxy_list_transparent_0.html",
        "http://www.proxylists.net/",
        "https://www.my-proxy.com/free-proxy-list.html",
        "https://www.my-proxy.com/free-elite-proxy.html",
        "https://www.my-proxy.com/free-anonymous-proxy.html",
        "https://www.my-proxy.com/free-transparent-proxy.html",
        "https://jffjdjkbfek.000webhostapp.com/proxy.txt",
        "https://cyber-hub.net/proxy/http.txt",
        "http://spys.one/en/https-ssl-proxy",
        "http://spys.one/en/http-proxy-list/",
        "https://hidemy.name/en/proxy-list/",
        "http://www.httptunnel.ge/ProxyListForFree.aspx",
        "https://list.proxylistplus.com/SSL-List-1",
        "https://www.torvpn.com/en/proxy-list",
    ]
    web = WebRequest()
    # generic "two adjacent <td> cells: an IPv4 then a number" matcher
    pair_re = re.compile(
        r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>'
        r'[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>')
    for page in page_urls:
        body = web.get(page, timeout=10).text
        for ip, port in pair_re.findall(body):
            yield ip + ':' + port
def freeProxy12th():
    """ip181.com paged proxy lists (pages 1-10); yields 'ip:port'.

    Uses the WebRequest instance it creates (the original constructed it
    and then called requests.get directly) and matches against r.text --
    a str pattern against r.content (bytes) raises TypeError on Python 3.
    """
    urls = ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)]
    request = WebRequest()
    for url in urls:
        r = request.get(url)
        proxies = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]*?<td>(\d+)</td>',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)
def freeProxyNight():
    """mimiip.com elite-proxy pages 1-9; yields 'ip:port'.

    Uses the WebRequest instance it creates (the original constructed it
    and then called requests.get directly) and matches against r.text --
    a str pattern against r.content (bytes) raises TypeError on Python 3.
    """
    urls = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)]
    request = WebRequest()
    for url in urls:
        r = request.get(url)
        proxies = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)
def freeProxyEight():
    """cn-proxy.com pages; yields 'ip:port'.

    Uses the WebRequest instance it creates (the original constructed it
    and then called requests.get directly) and matches against r.text --
    a str pattern against r.content (bytes) raises TypeError on Python 3.
    """
    urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
    request = WebRequest()
    for url in urls:
        r = request.get(url)
        proxies = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)
def freeProxyFifteen():
    """proxylists.net XML feed (<prx:ip>/<prx:port> pairs); yields 'ip:port'."""
    feed = WebRequest().get('http://www.proxylists.net/proxylists.xml').text
    for ip, port in re.findall('<prx:ip>(.*?)<.*?port>(.*?)<', feed):
        yield ip + ':' + port
def get_proxy_two():
    """Collect from http://www.xdaili.cn/freeproxy (JSON API).

    :return: generator of 'ip:port' strings
    """
    api = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
    payload = WebRequest().get(api).json()
    for entry in payload['RESULT']['rows']:
        yield '{}:{}'.format(entry['ip'], entry['port'])
def freeProxySeventeen():
    """89ip.cn bulk-extract endpoint (http://www.89ip.cn/tqdl.html?num=9999)."""
    body = WebRequest().get("http://www.89ip.cn/tqdl.html?num=9999").text
    for candidate in re.findall(r">\s+([\d\.:]*?)<br>", body, re.S):
        yield candidate
def freeProxy_ZGIP():
    """cn-proxy.com sortable tables; yields 'ip:port'."""
    urls = ['https://cn-proxy.com/']
    request = WebRequest()
    for url in urls:
        r = request.get(url, timeout=10)
        # pull each sortable table out first, then scan it for ip/port pairs
        p_raw = re.findall(r'<table class="sortable">([\s\S]+?)</table>', r.text)
        for proxy_table in p_raw:
            # raw string: '\d'/'\.' in a plain literal are invalid escape
            # sequences (SyntaxWarning on Python 3.12+)
            proxies = re.findall(r'<td> ?([\d\.]+) ?</td>[^<]<td> ?([\d]+)',
                                 proxy_table)
            for proxy in proxies:
                yield ':'.join(proxy)
def run(self):
    """Crawl cn-proxy.com pages and yield 'ip:port' strings."""
    pages = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
    web = WebRequest()
    pair_re = re.compile(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>')
    for page in pages:
        for ip, port in pair_re.findall(web.get(page).text):
            yield ip + ':' + port
def freeProxyWallFourth():
    """free-proxy-list.net (outside-the-wall); yields de-duplicated 'ip:port'.

    Robust against layout changes: a missing <tbody> or a row without two
    text cells is skipped instead of raising IndexError/TypeError.
    """
    urls = ['https://free-proxy-list.net']
    request = WebRequest()
    for url in urls:
        r = request.get(url)
        html = etree.HTML(r.text)
        tbodies = html.xpath('//tbody')
        if not tbodies:
            continue  # original indexed [0] unconditionally -> IndexError
        proxies = set()  # de-duplicate, as the original did via set()
        for tr in tbodies[0]:
            # ':'.join would raise TypeError if a cell's .text is None
            if len(tr) > 1 and tr[0].text and tr[1].text:
                proxies.add(':'.join([tr[0].text, tr[1].text]))
        for proxy in proxies:
            yield proxy
def run(self):
    """Crawl proxy.coderbusy.com (classical, cn, page 1); yields 'ip:port'."""
    urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
    request = WebRequest()
    for url in urls:
        r = request.get(url)
        # raw string: '\d' in a plain literal is an invalid escape
        # sequence (SyntaxWarning on Python 3.12+)
        proxies = re.findall(
            r'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)
def freeProxySixth(): """ 讯代理 http://www.xdaili.cn/ :return: """ url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' request = WebRequest() # try: res = request.get(url).json() for row in res['RESULT']['rows']: yield '{}:{}'.format(row['ip'], row['port'])
def freeProxy15():
    """xiladaili.com lists (regular/elite/http/https); yields 'ip:port'."""
    pages = ['http://www.xiladaili.com/putong/',
             "http://www.xiladaili.com/gaoni/",
             "http://www.xiladaili.com/http/",
             "http://www.xiladaili.com/https/"]
    web = WebRequest()
    addr_re = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}")
    for page in pages:
        body = web.get(page, timeout=10).text
        for addr in addr_re.findall(body):
            yield addr.strip()
def getCheckerproxy():
    """checkerproxy.net daily archive API; yields proxy address strings.

    Only rows with type == 2 and kind == 2 are kept (site-specific codes
    carried over from the original -- meaning not documented, TODO confirm).
    Best-effort source: failures are swallowed.
    """
    # renamed from `str`, which shadowed the builtin
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    url = 'https://checkerproxy.net/api/archive/{}'.format(today)
    request = WebRequest()
    try:
        res = request.get(url, timeout=10).json()
        for row in res:
            if row['type'] == 2 and row['kind'] == 2:
                yield row['addr']
    except Exception:
        # deliberately silent (unused `e` binding removed)
        pass
def freeProxyWallFirst():
    """cn-proxy (outside-the-wall site).

    :return: generator of 'ip:port' strings
    """
    pages = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
    web = WebRequest()
    pair_re = re.compile(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>')
    for page in pages:
        for ip, port in pair_re.findall(web.get(page).text):
            yield ip + ':' + port
def freeProxyTen(): """ 云代理 http://www.ip3366.net/free/ :return: """ urls = ['http://www.ip3366.net/free/'] request = WebRequest() for url in urls: r = request.get(url) proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text) for proxy in proxies: yield ":".join(proxy)
def freeProxyTen(): """ 云代理 http://www.ip3366.net/free/ :return: """ urls = ['http://www.ip3366.net/free/?stype=1&page={}'.format(str(i)) for i in range(1, 4)] request = WebRequest() for url in urls: r = request.get(url) proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text) for proxy in proxies: yield ":".join(proxy)
def freeProxyNinth():
    """Coder-busy proxy (码农代理) https://proxy.coderbusy.com/ -- marked
    discontinued (已停用) upstream.

    :return: generator of 'ip:port' strings
    """
    urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
    request = WebRequest()
    for url in urls:
        r = request.get(url, timeout=10)
        # raw string: '\d' in a plain literal is an invalid escape
        # sequence (SyntaxWarning on Python 3.12+)
        proxies = re.findall(
            r'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)
def freeProxySecond(proxy_number=100):
    """Scrape proxy66 (代理66) http://www.66ip.cn/ bulk-export endpoint.

    :param proxy_number: number of proxies to request
    :return: generator of 'ip:port' strings
    """
    export_url = ("http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip="
                  "&sxa=&submit=%CC%E1++%C8%A1&textarea=").format(proxy_number)
    body = WebRequest().get(export_url).text
    for addr in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', body):
        yield addr
def freeProxyEight():
    """mimiip.com lists: gngao (elite), gnpu (anonymous), gntou (transparent)."""
    # same page order as listing each category's pages 1-9 in turn
    pages = ['http://www.mimiip.com/%s/%s' % (category, page_no)
             for category in ('gngao', 'gnpu', 'gntou')
             for page_no in range(1, 10)]
    web = WebRequest()
    pair_re = re.compile(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>')
    for page in pages:
        for ip, port in pair_re.findall(web.get(page).text):
            yield ip + ':' + port
def run(self):
    """Crawl proxy-list.org pages 1-9; stops at the first non-200 response.

    Proxies appear base64-encoded inside Proxy('...') javascript calls.
    """
    import base64
    web = WebRequest()
    for page_no in range(1, 10):
        resp = web.get('https://proxy-list.org/english/index.php?p=%s' % page_no)
        if resp.status_code != 200:
            break
        for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
            yield base64.b64decode(encoded).decode()