def freeProxy09(page_count=4):
    """Jiangxianli free proxy library — http://ip.jiangxianli.com/?page=

    Crawls the anonymity/protocol listing pages and yields "ip:port"
    strings taken from the first two table cells of each data row.

    :param page_count: number of listing pages to fetch per URL pattern
    :return: generator of "ip:port" strings
    """
    url_pattern = [
        "http://ip.jiangxianli.com/?anonymity=1&page={}",
        "http://ip.jiangxianli.com/?anonymity=2&page={}",
        "http://ip.jiangxianli.com/?protocol=http&page={}",
        "http://ip.jiangxianli.com/?protocol=https&page={}",
    ]
    # Build the full crawl list up-front: every pattern for every page.
    url_list = [
        pattern.format(page_index)
        for page_index in range(1, page_count + 1)
        for pattern in url_pattern
    ]
    for url in url_list:
        html_tree = WebRequest().get(url).tree
        # First <tr> is the table header; skip it.
        for tr in html_tree.xpath("//table//tr")[1:]:
            yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
def freeProxy01():
    """Mimvp proxy — https://proxy.mimvp.com/

    The port is hidden behind an image; the image URL encodes it, and
    the known encodings are mapped back to port numbers here.
    :return: generator of "ip:port" strings
    """
    port_img_map = {
        'DMxMjg': '3128', 'Dgw': '80', 'DgwODA': '8080',
        'DgwOA': '808', 'DgwMDA': '8000', 'Dg4ODg': '8888',
        'DgwODE': '8081', 'Dk5OTk': '9999'
    }
    pages = [
        'https://proxy.mimvp.com/freeopen',
        'https://proxy.mimvp.com/freeopen?proxy=in_tp'
    ]
    row_xpath = ".//table[@class='mimvp-tbl free-proxylist-tbl']/tbody/tr"
    for page in pages:
        dom = WebRequest().get(page).tree
        for row in dom.xpath(row_xpath):
            try:
                ip = ''.join(row.xpath('./td[2]/text()'))
                img_src = ''.join(row.xpath('./td[3]/img/@src'))
                token = img_src.split("port=")[-1]
                # Decode: drop the 14-char prefix and the 'O0O' padding,
                # then look the remainder up in the known-port table.
                port = port_img_map.get(token[14:].replace('O0O', ''))
                if port:
                    yield '%s:%s' % (ip, port)
            except Exception as e:
                print(e)
def freeProxy01():
    """Data5u proxy — http://www.data5u.com/ (almost nothing usable).

    The visible port is a decoy; the real port is encoded in the second
    CSS class of the port element — each letter is one decimal digit
    (its index in the key "ABCDEFGHIZ") and the decoded number is then
    right-shifted by 3 (divided by 8).
    :return: generator of "ip:port" strings
    """
    key = 'ABCDEFGHIZ'
    pages = [
        'http://www.data5u.com/',
    ]
    for page in pages:
        dom = WebRequest().get(page).tree
        for ul in dom.xpath('//ul[@class="l2"]'):
            try:
                ip = ul.xpath('./span[1]/li/text()')[0]
                cls = ul.xpath('./span[2]/li/attribute::class')[0]
                encoded_name = cls.split(' ')[1]
                number = 0
                for ch in encoded_name:
                    number = number * 10 + key.index(ch)
                yield '{}:{}'.format(ip, number >> 3)
            except Exception as e:
                print(e)
def freeProxy20():
    """premproxy.com ip-port lists (fetched through MAINPROXY).

    Yields Proxy objects tagged with the source site.  If a page fails
    to load (tree is None) the generator stops, matching the original
    behaviour.
    """
    source = 'premproxy.com'
    urls = [
        'https://premproxy.com/list/ip-port/1.htm',
        'https://premproxy.com/list/ip-port/2.htm',
        'https://premproxy.com/list/ip-port/3.htm',
    ]
    proxies = {'http': MAINPROXY, 'https': MAINPROXY}
    for url in urls:
        tree = WebRequest().get(url, proxies=proxies).tree
        if tree is None:
            # Abort the whole crawl on the first failed fetch.
            return None
        for item in tree.xpath('//ul[@id="ipportlist"]/li'):
            try:
                # The inner <li> text ends with a separator char; drop it.
                ip = item.xpath('./li/text()')[0][:-1]
                port = item.xpath('./li/span/text()')[0]
                protocol = 'https'
                yield Proxy(f'{protocol}://{ip}:{port}', source=source)
            except Exception as e:
                print(type(e), e)
def freeProxy04():
    """Goubanjia (whole-network proxy) — http://www.goubanjia.com/

    The IP cell contains hidden junk digits/dots inside display:none
    nodes, which the xpath filters out along with the port span.  The
    real port is encoded in a class-attribute suffix: each letter is a
    decimal digit (its index in "ABCDEFGHIZ"), and the whole number is
    right-shifted by 3.
    :return: generator of "ip:port" strings
    """
    page = "http://www.goubanjia.com/"
    dom = WebRequest().get(page).tree
    # Keep only visible text nodes that are not part of the port span.
    visible_text_xpath = """.//*[not(contains(@style, 'display: none'))
                                and not(contains(@style, 'display:none'))
                                and not(contains(@class, 'port'))
                                ]/text()
    """

    def _decode_port(cls_suffix):
        # One letter per decimal digit, then divide by 8 via shift.
        digits = [str("ABCDEFGHIZ".find(ch)) for ch in cls_suffix]
        return int("".join(digits)) >> 0x3

    for cell in dom.xpath('//td[@class="ip"]'):
        try:
            addr = ''.join(cell.xpath(visible_text_xpath))
            cls_attr = cell.xpath(
                ".//span[contains(@class, 'port')]/@class")[0]
            port = _decode_port(cls_attr.split()[-1].strip())
            yield '{}:{}'.format(addr, int(port))
        except Exception:
            pass
def freeProxy04():
    """Goubanjia — http://www.goubanjia.com/

    The IP cell contains hidden decoy nodes (display:none) that must be
    filtered out.  The port shown in the HTML is random; the real port
    is encoded in the element's class suffix, e.g.
    <span class="port CFACE">9054</span> where "CFACE" decodes to 3128:
    each letter is a decimal digit — its index in the key "ABCDEFGHIZ" —
    and the resulting number is divided by 8.
    :return: generator of "ip:port" strings
    """
    url = "http://www.goubanjia.com/"
    tree = WebRequest().get(url).tree
    proxy_list = tree.xpath('//td[@class="ip"]')
    # Filter out <p style="display:none;"> decoys and the port span.
    xpath_str = """.//*[not(contains(@style, 'display: none'))
                        and not(contains(@style, 'display:none'))
                        and not(contains(@class, 'port'))
                        ]/text()
    """
    key = "ABCDEFGHIZ"
    for each_proxy in proxy_list:
        try:
            # ':' sits bare in the td; other fragments are in div/span/p.
            ip_addr = ''.join(each_proxy.xpath(xpath_str))
            cls = each_proxy.xpath(
                ".//span[contains(@class, 'port')]/attribute::class"
            )[0].replace("port ", "")
            port = 0
            for letter in cls:
                # BUG FIX: decode via the key, not ord(c) - ord('A') —
                # 'Z' must map to 9 (its index in "ABCDEFGHIZ"), not 25.
                port = port * 10 + key.index(letter)
            port >>= 3  # integer divide by 8, no float rounding
            yield '{}:{}'.format(ip_addr, int(port))
        except Exception:
            pass
def freeProxy01():
    """Mimvp proxy — https://proxy.mimvp.com/

    Port numbers are rendered as images; each image is cached on disk
    and OCR'd with pytesseract to recover the digits.
    :return: generator of "ip:port" strings
    """
    pages = [
        'https://proxy.mimvp.com/freeopen?proxy=in_hp',
        'https://proxy.mimvp.com/freeopen?proxy=out_hp'
    ]
    row_xpath = ".//table[@class='mimvp-tbl free-proxylist-tbl']/tbody/tr"
    for page in pages:
        dom = WebRequest().get(page).tree
        for row in dom.xpath(row_xpath):
            try:
                ip = ''.join(row.xpath('./td[2]/text()'))
                img_url = 'https://proxy.mimvp.com' + (''.join(
                    row.xpath('./td[3]/img/@src')))
                img_name = img_url.split('port=')[-1]
                img_path = os.path.abspath(
                    os.path.join(
                        os.path.dirname(__file__),
                        "../cache/freeopen_port_image/%s.png" % img_name))
                # Download each port image only once; reuse the cache.
                if not os.path.exists(img_path):
                    urllib.request.urlretrieve(img_url, filename=img_path)
                port = pytesseract.image_to_string(img_path)
                if port:
                    yield '%s:%s' % (ip, port)
            except Exception as e:
                print(e)
def freeProxy04():
    """Dieniao IP — https://www.dieniao.com/FreeProxy.html"""
    page = "https://www.dieniao.com/FreeProxy.html"
    dom = WebRequest().get(page, verify=False).tree
    rows = dom.xpath(
        "//div[@class='free-main col-lg-12 col-md-12 col-sm-12 col-xs-12']"
        "/ul/li")
    # First <li> is the header row; data rows follow.
    for row in rows[1:]:
        host = "".join(row.xpath('./span[1]/text()')).strip()
        port = "".join(row.xpath('./span[2]/text()')).strip()
        yield "%s:%s" % (host, port)
def freeProxy06():
    """PROXY11 — https://proxy11.com"""
    api = "https://proxy11.com/api/demoweb/proxy.json?country=hk&speed=2000"
    try:
        payload = WebRequest().get(api).json
        for entry in payload.get("data", []):
            yield "%s:%s" % (entry.get("ip", ""), entry.get("port", ""))
    except Exception as e:
        print(e)
def freeProxy03():
    """Kaixin proxy — http://www.kxdaili.com/"""
    pages = [
        "http://www.kxdaili.com/dailiip.html",
        "http://www.kxdaili.com/dailiip/2/1.html",
    ]
    for page in pages:
        dom = WebRequest().get(page).tree
        rows = dom.xpath("//table[@class='active']//tr")
        for row in rows[1:]:  # row 0 is the header
            host = "".join(row.xpath('./td[1]/text()')).strip()
            port = "".join(row.xpath('./td[2]/text()')).strip()
            yield "%s:%s" % (host, port)
def freeProxy09(page_count=1):
    """Jiangxianli free proxy library (China-only listing).

    :param page_count: number of listing pages to crawl
    :return: generator of "ip:port" strings
    """
    for page in range(1, page_count + 1):
        url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(page)
        dom = WebRequest().get(url).tree
        # Skip the header row of the listing table.
        for row in dom.xpath("//table//tr")[1:]:
            yield ":".join(row.xpath("./td/text()")[0:2]).strip()
def freeProxy12():
    """ProxyListPlus — https://list.proxylistplus.com/"""
    pages = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
    fetcher = WebRequest()
    # IP cell followed (non-greedily, across newlines) by the port cell.
    pattern = re.compile(
        r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
    for page in pages:
        resp = fetcher.get(page, timeout=10)
        for pair in pattern.findall(resp.text):
            yield ':'.join(pair)
def freeProxy02():
    """Proxy 66 — http://www.66ip.cn/"""
    dom = WebRequest().get("http://www.66ip.cn/", timeout=10).tree
    rows = dom.xpath("(//table)[3]//tr")
    for row in rows[1:]:  # skip the header row
        host = "".join(row.xpath("./td[1]/text()")).strip()
        port = "".join(row.xpath("./td[2]/text()")).strip()
        yield "%s:%s" % (host, port)
def freeProxy06():
    """FateZero — http://proxylist.fatezero.org/

    The endpoint returns one JSON object per line; yields "host:port"
    for records located in China.
    """
    url = "http://proxylist.fatezero.org/proxy.list"
    try:
        resp_text = WebRequest().get(url).text
    except Exception as e:
        print(e)
        return
    for each in resp_text.split("\n"):
        # BUG FIX: guard each record separately.  The trailing empty
        # line produced by split("\n") (or any one malformed record)
        # used to raise inside the single shared try-block and silently
        # drop every remaining proxy.
        if not each.strip():
            continue
        try:
            json_info = json.loads(each)
        except ValueError as e:
            print(e)
            continue
        if json_info.get("country") == "CN":
            yield "%s:%s" % (json_info.get("host", ""),
                             json_info.get("port", ""))
def freeProxy04():
    """Shenji proxy — http://www.shenjidaili.com/

    :return: generator of "ip:port" strings (the site lists the full
    address in the first table cell)
    """
    page = "http://www.shenjidaili.com/product/open/"
    dom = WebRequest().get(page).tree
    table_xpath = ("//table[@class='table table-hover text-white "
                   "text-center table-borderless']")
    for table in dom.xpath(table_xpath):
        # First <tr> of each table is its header.
        for row in table.xpath("./tr")[1:]:
            addr = ''.join(row.xpath("./td[1]/text()"))
            yield addr.strip()
def freeProxy06():
    """Coderbusy proxy — https://proxy.coderbusy.com/"""
    for page in ['https://proxy.coderbusy.com/']:
        dom = WebRequest().get(page).tree
        rows = dom.xpath('.//table//tr')[1:]  # drop the header row
        for row in rows:
            yield ':'.join(row.xpath('./td/text()')[0:2])
def freeProxy10():
    """cn-proxy (site blocked in CN) — http://cn-proxy.com/

    :return: generator of "ip:port" strings
    """
    urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
    request = WebRequest()
    for url in urls:
        r = request.get(url, timeout=10)
        # BUG FIX: the old pattern used [\w\W] — exactly ONE character —
        # between the ip cell and the port cell, so any row with more
        # than one char of markup/whitespace between the <td>s was
        # silently skipped.  Use a non-greedy any-char run, as the
        # sibling freeProxy12 does.
        proxies = re.findall(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]*?<td>(\d+)</td>',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)
def freeProxy05():
    """Kuaidaili — https://www.kuaidaili.com"""
    pages = [
        'https://www.kuaidaili.com/free/inha/',
        'https://www.kuaidaili.com/free/intr/'
    ]
    for page in pages:
        dom = WebRequest().get(page).tree
        rows = dom.xpath('.//table//tr')
        # Throttle between pages — without the sleep the second request
        # comes back without data.
        sleep(1)
        for row in rows[1:]:
            yield ':'.join(row.xpath('./td/text()')[0:2])
def freeProxy06():
    """Proxy box (coderbusy CN listing) — https://proxy.coderbusy.com/

    :return: generator of "ip:port" strings
    """
    urls = ['https://proxy.coderbusy.com/zh-hans/ops/country/cn.html']
    for url in urls:
        tree = WebRequest().get(url).tree
        proxy_list = tree.xpath('.//table//tr')
        for tr in proxy_list[1:]:
            ip = "".join(tr.xpath("./td[1]/text()")).strip()
            port = "".join(tr.xpath("./td[2]//text()")).strip()
            # BUG FIX: the old `if proxy:` check was always true because
            # the formatted string always contains at least ":".  Only
            # yield rows where both fields were actually extracted.
            if ip and port:
                yield '{}:{}'.format(ip, port)
def freeProxy06(page=2):
    """Superfast IP — https://www.superfastip.com/

    :param page: number of API pages to fetch
    :return: generator of "ip:port" strings
    """
    api = "https://api.superfastip.com/ip/freeip?page={page}"
    for page_no in range(1, page + 1):
        try:
            payload = WebRequest().get(api.format(page=page_no)).json
            for entry in payload.get("freeips", []):
                yield "%s:%s" % (entry.get("ip", ""), entry.get("port", ""))
        except Exception as e:
            print(e)
def freeProxy06():
    """Coderbusy proxy — https://proxy.coderbusy.com/

    Yields full "http://ip:port" URLs; rows missing either cell are
    skipped.
    """
    for page in ['https://proxy.coderbusy.com/']:
        dom = WebRequest().get(page).tree
        for row in dom.xpath('.//table//tr')[1:]:
            try:
                host = row.xpath('./td[1]/text()')[0]
                port = row.xpath('./td[2]/a/text()')[0]
            except IndexError:
                continue
            yield 'http://' + host + ':' + port
def freeProxy26():
    """cool-proxy — https://cool-proxy.net/

    The endpoint returns a JSON array of {ip, port, ...} records.
    :return: generator of "ip:port" strings
    """
    urls = ['http://cool-proxy.net/proxies.json']
    for url in urls:
        r = WebRequest().get(url, timeout=10)
        # CONSISTENCY FIX: every other spider in this module reads the
        # parsed body as the `.json` attribute (see freeProxy06 /
        # proxy11); calling it as a method was inconsistent with the
        # WebRequest API used elsewhere.
        proxy_json = r.json
        ips = []
        for a in proxy_json:
            ips += [a.get('ip') + ':' + str(a.get('port'))]
        for ip in ips:
            yield ip.strip()
def freeProxy09(page_count=10):
    """Jiangxianli free proxy library — paginated China listing.

    Stops early once a page comes back with no data rows.
    :param page_count: maximum number of pages to crawl
    :return: generator of "ip:port" strings
    """
    for page in range(1, page_count + 1):
        url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(page)
        rows = WebRequest().get(url).tree.xpath("//table//tr")
        if len(rows) <= 1:
            # Header only — no more data; stop paging.
            break
        for row in rows[1:]:
            yield ":".join(row.xpath("./td/text()")[0:2]).strip()
        sleep(1)
def freeProxy04():
    """FreeProxyList — https://www.freeproxylists.net/zh/

    The IP is hidden inside a url-encoded <script> blob; decode it and
    pull the dotted quad back out with a regex.
    """
    url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50"
    tree = WebRequest().get(url, verify=False).tree
    from urllib import parse

    def extract_ip(encoded):
        # Un-escape the script source, then grab the first IPv4 literal.
        found = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',
                           parse.unquote(encoded))
        return found[0] if found else None

    rows = tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']")
    for row in rows:
        host = extract_ip("".join(row.xpath('./td[1]/script/text()')).strip())
        port = "".join(row.xpath('./td[2]/text()')).strip()
        if host:
            yield "%s:%s" % (host, port)
def freeProxy30():
    """xroxy — http://www.xroxy.com/ (VPN needed).

    Pages through the reliability-sorted listing.  Relies on the
    module-level `proxies` mapping for the outbound request.
    :return: generator of "ip:port" strings
    """
    urls = [
        'http://www.xroxy.com/proxylist.php?port=&type=&ssl=&country=&latency=&reliability=&'
        'sort=reliability&desc=true&pnum=%s#table' % i for i in range(20)
    ]
    for url in urls:
        r = WebRequest().get(url, timeout=10, proxies=proxies)
        ips = list()
        html = r.tree
        # Data rows alternate between the .row1 and .row0 CSS classes.
        infos = list()
        for x in html.xpath('//tr'):
            infos += x.cssselect('.row1') + x.cssselect('.row0')
        for info in infos:
            links = info.cssselect('a')
            proxy_ip = links[0].text.replace('\n', '').replace('\r', '')
            proxy_port = links[1].text
            ips.append(proxy_ip + ':' + proxy_port)
        for ip in ips:
            yield ip.strip()
def freeProxy34():
    """proxy-list.org — http://proxy-list.org/ (VPN needed).

    Each entry is wrapped in a Proxy('<base64>') script call; strip the
    wrapper and base64-decode the payload.
    :return: generator of "ip:port" strings

    NOTE: the docstring used to sit AFTER `import base64`, which made
    it a dead expression statement instead of a docstring — fixed by
    moving it first.
    """
    import base64
    urls = [
        'https://proxy-list.org/english/index.php?p=%s' % i
        for i in range(1, 11)
    ]
    for url in urls:
        r = WebRequest().get(url, timeout=10, proxies=proxies)
        html = etree.HTML(
            r.text.replace('<script type="text/javascript">Proxy(\'', '')
            .replace('\')</script>', ''))
        infos = html.xpath('//*[@id="proxy-table"]/div[2]/div')[0].xpath('ul')
        ips = list()
        for info in infos:
            ips.append(
                base64.b64decode(info.cssselect('li')[0].text).decode('ascii'))
        for ip in ips:
            yield ip.strip()
def freeProxy39():
    """proxynova — https://www.proxynova.com/ (VPN needed).

    The IP cell is emitted via document.write(); strip the script
    wrapper before parsing.  Leftover debug printing and no-op `pass`
    statements from the original were removed.
    :return: generator of "ip:port" strings
    """
    urls = [
        'https://www.proxynova.com/proxy-server-list/country-cn/',
        'https://www.proxynova.com/proxy-server-list/'
    ]
    for url in urls:
        r = WebRequest().get(url, timeout=10, proxies=proxies)
        rows = etree.HTML(
            r.text.replace('<script>document.write(\'', '')
            .replace('\');</script>', '')
            .replace('\n', '')).cssselect('tr')
        rows.pop(0)  # header row
        ips = []
        for row in rows:
            cells = row.cssselect('td')
            if len(cells) < 3:
                continue
            proxy_ip = cells[0].cssselect('abbr')[0].text.replace(' ', '')
            proxy_port = cells[1].text.replace(' ', '')
            ips.append("{0}:{1}".format(proxy_ip, proxy_port))
        for ip in ips:
            yield ip.strip()
def freeProxy16():
    """free-proxy.cz socks5 listing (fetched through MAINPROXY).

    The IP is hidden as a Base64 blob inside a <script> tag; decode it
    and yield Proxy objects tagged with the source site.
    """
    proxies = {'http': MAINPROXY, 'https': MAINPROXY}
    source = 'free-proxy.cz'
    urls = [
        'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all',
        'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/2',
        'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/3',
        'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/4',
        'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/5',
    ]
    for url in urls:
        r = WebRequest().get(url, proxies=proxies)
        if r.response.status_code == 200:
            ret = r.tree
            for tr in ret.xpath('//table[@id="proxy_list"]//tr')[1:]:
                try:
                    ip_script = tr.xpath('./td[1]/script/text()')[0]
                    # FIX: raw string — '\w' in a plain literal is an
                    # invalid escape (DeprecationWarning today, a
                    # SyntaxError in future Python versions).
                    ip_base64 = re.search(r'(?:")([\w=]+)(?:")',
                                          ip_script).groups()[0]
                    ip = base64.b64decode(ip_base64).decode('utf8')
                    port = tr.xpath('./td[2]/span/text()')[0]
                    protocol = ''.join(tr.xpath('./td[3]/small/text()'))
                    yield Proxy(f'{protocol}://{ip}:{port}', source=source)
                except Exception as e:
                    print(e)
def freeProxy11():
    """proxy-list.org — https://proxy-list.org/english/index.php

    Entries appear as Proxy('<base64>') calls in the page source;
    capture and decode each payload.
    """
    import base64
    page_urls = [
        'https://proxy-list.org/english/index.php?p=%s' % n
        for n in range(1, 10)
    ]
    fetcher = WebRequest()
    for page_url in page_urls:
        resp = fetcher.get(page_url, timeout=10)
        for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
            yield base64.b64decode(encoded).decode()
def freeProxy16():
    """fate0 proxylist mirror — one JSON record per line.

    :return: generator of "ip:port" strings
    """
    url = 'http://raw.staticdn.net/fate0/proxylist/master/proxy.list'
    r = WebRequest().get(url, timeout=10)
    # Drop the trailing empty element left by the final newline.
    lines = [item for item in r.text.split('\n')][0:-1]
    records = [json.loads(item, strict=False) for item in lines]
    for record in records:
        # BUG FIX: host and port were concatenated with no ':' between
        # them, producing unusable entries like "1.2.3.48080"; every
        # sibling spider yields "ip:port".
        yield str(record['host']) + ':' + str(record['port'])