import re
import time


def freeProxyRay01(page_count=1):
    """
    http://www.xsdaili.com/dayProxy/2020/1/1.html free proxy list
    /html/body/div[5]/div/div[2]/div/div/div/div[2]/div/div[2]/div[1]/div[1]/a
    :return:
    """
    try:
        base_url = 'http://www.xsdaili.com'
        nowt = time.localtime(time.time())
        url = '{}/dayProxy/{}/{}/1.html'.format(base_url, nowt.tm_year, nowt.tm_mon)
        html_tree = getHtmlTree(url)
        today_urls = []
        for aurl in html_tree.xpath("//a"):
            if isinstance(aurl.text, str) and "代理IP" in aurl.text and time.strftime(
                    "%Y年%m月%d日", nowt) in aurl.text:
                today_urls.append('/'.join(
                    [base_url.strip('/'), aurl.attrib['href'].strip('/')]))
        for purl in today_urls:
            html_tree = getHtmlTree(purl)
            for abr in html_tree.xpath("//br"):
                try:
                    searchObj = re.search(
                        r'[1-9]{1}[0-9]{0,2}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,5}',
                        abr.tail.strip())
                    if searchObj:
                        yield searchObj.group()
                except Exception:
                    pass
    except Exception:
        pass
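# Every fetcher in this module relies on a getHtmlTree(url, ...) helper that is not
# shown here. A minimal sketch of what such a helper could look like, assuming the
# project fetches pages with requests and parses them with lxml (the real helper may
# add retries, random User-Agents, or proxy support):
import requests
from lxml import etree


def getHtmlTree_sketch(url, header=None, timeout=10):
    """Hypothetical stand-in for getHtmlTree: fetch a page and parse it into an lxml tree."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    if header:
        headers.update(header)
    resp = requests.get(url, headers=headers, timeout=timeout)
    return etree.HTML(resp.content)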
def freeProxyNinth():
    """
    Zdaye proxy (站大爷) http://ip.zdaye.com/
    :return:
    """
    url = 'http://ip.zdaye.com/'
    html_tree = getHtmlTree(url)
    item_list = html_tree.xpath('//div[@class="Loglist"]/div[2]/div[@class="panel-body"]//a/text()')
    for item in item_list:
        try:
            yield item.split('@')[0].strip()
        except Exception as e:
            print(e)
    header = {
        'Referer': 'http://ip.zdaye.com/',
    }
    new_urls = html_tree.xpath('//div[@class="Loglist"]/div[1]/div[@class="panel-body"]//a/@href')
    for new_url in new_urls:
        try:
            new_html_tree = getHtmlTree(url + new_url, header=header)
            new_item_list = new_html_tree.xpath('//div[@class="cont"]/text()')
            for new_item in new_item_list:
                try:
                    yield new_item.split('@')[0].strip()
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def freeProxyEight():
    """
    Xsdaili (小舒代理) http://www.xsdaili.com/
    """
    url = 'http://www.xsdaili.com/'
    html_tree = getHtmlTree(url)
    new_url = url + html_tree.xpath('//div[@class="col-md-12"]/div[1]//a[1]/@href')[0]
    new_html_tree = getHtmlTree(new_url)
    proxy_list = new_html_tree.xpath('//div[@class="cont"]/text()')
    for proxy in proxy_list:
        try:
            yield proxy.split('@')[0].strip()
        except Exception as e:
            print(e)
def freeProxyFifth():
    """
    Fetch goubanjia http://www.goubanjia.com/free/gngn/index.shtml
    :return:
    """
    url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
    for page in range(1, 10):
        page_url = url.format(page=page)
        tree = getHtmlTree(page_url)
        proxy_list = tree.xpath('//td[@class="ip"]')
        # The site injects hidden digits as interference, so extra digits or '.' characters
        # can be scraped by mistake; content inside <p style="display:none;"> must be filtered out.
        xpath_str = """.//*[not(contains(@style, 'display: none'))
                            and not(contains(@style, 'display:none'))
                            and not(contains(@class, 'port'))
                            ]/text()
                    """
        for each_proxy in proxy_list:
            try:
                # The ':' sits bare under the td, while the other pieces live in div/span/p
                # elements, so extract the ip first and then look up the port.
                ip_addr = ''.join(each_proxy.xpath(xpath_str))
                port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
                yield '{}:{}'.format(ip_addr, port)
            except Exception as e:
                pass
def freeProxyFirst(page=10):
    """
    data5u (无忧代理) http://www.data5u.com/

    data5u has an anti-scraping mechanism:
    take the element's classname, map each character to its index in `key`
    to build an integer, then shift that integer right by 3 bits to obtain
    the real port number.
    :param page: number of pages
    :return:
    """
    url_list = [
        'http://www.data5u.com/',
        'http://www.data5u.com/free/gngn/index.shtml',
        'http://www.data5u.com/free/gnpt/index.shtml'
    ]
    key = 'ABCDEFGHIZ'
    for url in url_list:
        html_tree = getHtmlTree(url)
        ul_list = html_tree.xpath('//ul[@class="l2"]')
        for ul in ul_list:
            try:
                ip = ul.xpath('./span[1]/li/text()')[0]
                classnames = ul.xpath('./span[2]/li/attribute::class')[0]
                classname = classnames.split(' ')[1]
                port_sum = 0
                for c in classname:
                    port_sum *= 10
                    port_sum += key.index(c)
                port = port_sum >> 3
                yield '{}:{}'.format(ip, port)
            except Exception as e:
                print(e)
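# The classname-to-port decoding above can be checked in isolation. A small worked
# example with an illustrative classname (not an actual value scraped from data5u):
# 'GEGEA' maps to the digits 6,4,6,4,0 via key='ABCDEFGHIZ', i.e. 64640, and
# 64640 >> 3 == 8080.
def _decode_data5u_port(classname, key='ABCDEFGHIZ'):
    port_sum = 0
    for c in classname:
        port_sum = port_sum * 10 + key.index(c)
    return port_sum >> 3


assert _decode_data5u_port('GEGEA') == 8080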
def freeProxyFifth():
    """
    Fetch goubanjia http://www.goubanjia.com/free/gngn/index.shtml
    :return:
    """
    url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
    for page in range(1, 10):
        page_url = url.format(page=page)
        tree = getHtmlTree(page_url)
        proxy_list = tree.xpath('//td[@class="ip"]')
        # The site injects hidden digits as interference, so extra digits or '.' characters
        # can be scraped by mistake; content inside <p style="display:none;"> must be filtered out.
        xpath_str = """.//*[not(contains(@style, 'display: none'))
                            and not(contains(@style, 'display:none'))
                            and not(contains(@class, 'port'))
                            ]/text()
                    """
        for each_proxy in proxy_list:
            try:
                # The ':' sits bare under the td, while the other pieces live in div/span/p
                # elements, so extract the ip first and then look up the port.
                ip_addr = ''.join(each_proxy.xpath(xpath_str))
                port = each_proxy.xpath(
                    ".//span[contains(@class, 'port')]/text()")[0]
                yield '{}:{}'.format(ip_addr, port)
            except Exception as e:
                # this is a plain function (no self), so report the failure with print
                print("fetch proxy failed: " + str(e))
def freeProxy04():
    """
    goubanjia http://www.goubanjia.com/
    :return:
    """
    url = "http://www.goubanjia.com/"
    tree = getHtmlTree(url)
    proxy_list = tree.xpath('//td[@class="ip"]')
    # The site injects hidden digits as interference, so extra digits or '.' characters
    # can be scraped by mistake; content inside <p style="display:none;"> must be filtered out.
    xpath_str = """.//*[not(contains(@style, 'display: none'))
                        and not(contains(@style, 'display:none'))
                        and not(contains(@class, 'port'))
                        ]/text()
                """
    for each_proxy in proxy_list:
        try:
            # The ':' sits bare under the td, while the other pieces live in div/span/p
            # elements, so extract the ip first and then look up the port.
            ip_addr = ''.join(each_proxy.xpath(xpath_str))
            # The port shown in the HTML is a random number; the real port is encoded
            # in the letters that follow "port" in the class attribute, e.g.
            #   <span class="port CFACE">9054</span>
            # where CFACE decodes to 3128.
            port = 0
            for _ in each_proxy.xpath(".//span[contains(@class, 'port')]"
                                      "/attribute::class")[0]. \
                    replace("port ", ""):
                port *= 10
                port += (ord(_) - ord('A'))
            port /= 8
            yield '{}:{}'.format(ip_addr, int(port))
        except Exception as e:
            pass
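# The letter encoding mentioned above can be verified on its own: 'CFACE' maps to the
# digits 2,5,0,2,4, i.e. 25024, and 25024 / 8 == 3128. A small standalone check
# (not part of the scraper itself):
def _decode_goubanjia_port(letters):
    port = 0
    for ch in letters:
        port = port * 10 + (ord(ch) - ord('A'))
    return port // 8


assert _decode_goubanjia_port('CFACE') == 3128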
def free_proxy_xici():
    """
    Xici proxy (西刺代理) http://www.xicidaili.com
    :return:
    """
    url_list = [{
        "url": "http://www.xicidaili.com/nn/",
        "page_count": 10
    }, {
        "url": "http://www.xicidaili.com/nt/",
        "page_count": 10
    }, {
        "url": "http://www.xicidaili.com/wn/",
        "page_count": 10
    }]
    for task in url_list:
        each_url = task['url']
        page_count = task['page_count']
        for i in range(1, page_count + 1):
            page_url = each_url + str(i)
            tree = getHtmlTree(page_url)
            proxy_list = tree.xpath(
                './/table[@id="ip_list"]//tr[position()>1]')
            for proxy in proxy_list:
                try:
                    yield ':'.join(proxy.xpath('./td/text()')[0:2])
                except Exception as e:
                    pass
def freeProxy01():
    """
    data5u (无忧代理) http://www.data5u.com/
    Almost none of these proxies work.
    :return:
    """
    url_list = [
        'http://www.data5u.com/',
        'http://www.data5u.com/free/gngn/index.shtml',
        'http://www.data5u.com/free/gnpt/index.shtml'
    ]
    key = 'ABCDEFGHIZ'
    for url in url_list:
        html_tree = getHtmlTree(url)
        ul_list = html_tree.xpath('//ul[@class="l2"]')
        for ul in ul_list:
            try:
                ip = ul.xpath('./span[1]/li/text()')[0]
                classnames = ul.xpath('./span[2]/li/attribute::class')[0]
                classname = classnames.split(' ')[1]
                port_sum = 0
                for c in classname:
                    port_sum *= 10
                    port_sum += key.index(c)
                port = port_sum >> 3
                yield '{}:{}'.format(ip, port)
            except Exception as e:
                print(e)
def freeProxy_xiladaili():
    urls = [
        'http://www.xiladaili.com/gaoni/%d/',
        'http://www.xiladaili.com/http/%d/',
        'http://www.xiladaili.com/https/%d/'
    ]
    for url in urls:
        # fetch 20 pages per url
        for i in range(1, 21):
            new_url = url % i
            dom = getHtmlTree(new_url)
            for item in dom.xpath('//tr'):
                ip = item.xpath('./td[1]/text()')
                if not len(ip):
                    continue
                ip = item.xpath('./td[1]/text()')[0]
                protocol = item.xpath('./td[2]/text()')[0]
                if "," in protocol:
                    # both http and https are supported
                    yield 'http://' + ip.strip()
                    yield 'https://' + ip.strip()
                elif "HTTPS" in protocol:
                    yield 'https://' + ip.strip()
                else:
                    yield 'http://' + ip.strip()
def run(self):
    url = 'http://www.89ip.cn/tqdl.html?num=2500&address=&kill_address=&port=&kill_port=&isp='
    html_tree = getHtmlTree(url)
    data_warp = html_tree.xpath(
        "//div[@class='fly-panel']/div[@style='padding-left:20px;']//text()"
    )
    for data in data_warp:
        if ':' in data:
            yield data.strip()
def freeProxyTwelve(page_count=8):
    for i in range(1, page_count + 1):
        url = 'http://ip.jiangxianli.com/?page={}'.format(i)
        html_tree = getHtmlTree(url)
        tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
        if len(tr_list) == 0:
            continue
        for tr in tr_list:
            yield tr.xpath("./td[2]/text()")[0] + ":" + tr.xpath("./td[3]/text()")[0]
def freeProxyThird(days=1):
    url = 'http://www.ip181.com/'
    html_tree = getHtmlTree(url)
    try:
        tr_list = html_tree.xpath('//tr')[1:]
        for tr in tr_list:
            yield ':'.join(tr.xpath('./td/text()')[0:2])
    except Exception as e:
        pass
def freeProxyFifth():
    """
    Fetch goubanjia http://www.goubanjia.com/free/gngn/index.shtml
    :return:
    """
    url = "http://www.goubanjia.com/free/gngn/index.shtml"
    tree = getHtmlTree(url)
    proxy_list = tree.xpath('.//td[@class="ip"]')
    for proxy in proxy_list:
        yield ''.join(proxy.xpath('.//text()'))
def freeProxyKuaidaili():
    # Kuaidaili (快代理) http://www.kuaidaili.com/free/inha/1/
    url = "http://www.kuaidaili.com/free/inha/{}/"
    for page in range(1, 10):
        page_url = url.format(page)
        tree = getHtmlTree(page_url)
        ip_list = tree.xpath('//td[@data-title="IP"]/text()')
        port_list = tree.xpath('//td[@data-title="PORT"]/text()')
        for index, ip in enumerate(ip_list):
            yield '{}:{}'.format(ip, port_list[index])
def freeProxyThirteen():
    """
    Feiyi proxy (飞蚁代理) http://www.feiyiproxy.com/?page_id=1457
    :return:
    """
    url = 'http://www.feiyiproxy.com/?page_id=1457'
    html_tree = getHtmlTree(url)
    tr_list = html_tree.xpath('//div[@class="et_pb_code et_pb_module et_pb_code_1"]//tr[position()>1]')
    for tr in tr_list:
        yield tr.xpath('./td[1]/text()')[0].strip() + ':' + tr.xpath('./td[2]/text()')[0].strip()
def freeProxySeventh():
    """
    Kuaidaili free proxies https://www.kuaidaili.com/free/inha/1/
    """
    url = 'https://www.kuaidaili.com/free/inha/{page}/'
    for page in range(1, 10):
        page_url = url.format(page=page)
        tree = getHtmlTree(page_url)
        proxy_list = tree.xpath('.//table//tr')
        for tr in proxy_list[1:]:
            yield ':'.join(tr.xpath('./td/text()')[0:2])
def run(self):
    end = 10
    url = 'http://www.nimadaili.com/{col}/{page}/'
    col_list = ['gaoni', 'http', 'https']
    for col in col_list:
        for i in range(1, end):
            html_tree = getHtmlTree(url.format(page=i, col=col))
            data_list = html_tree.xpath('//tr//td[1]//text()')
            for data in data_list:
                if ':' in data:
                    yield data.strip()
def freeProxyThird(self, days=1):
    """
    Fetch ip181 http://www.ip181.com/
    :param days:
    :return:
    """
    url = 'http://www.ip181.com/'
    html_tree = getHtmlTree(url)
    tr_list = html_tree.xpath('//tr')[1:]
    for tr in tr_list:
        yield ':'.join(tr.xpath('./td/text()')[0:2])
def freeProxyFourteen():
    """
    Qiyun proxy (旗云代理) http://www.qydaili.com/free/?action=china&page=
    :return:
    """
    urls = ['http://www.qydaili.com/free/?action=china&page={}'.format(page) for page in range(1, 4)]
    for url in urls:
        html_tree = getHtmlTree(url)
        tr_list = html_tree.xpath('//table[@class="table table-bordered table-striped"]//tbody//tr')
        for tr in tr_list:
            yield tr.xpath('./td[1]/text()')[0].strip() + ':' + tr.xpath('./td[2]/text()')[0].strip()
def freeProxy06():
    """
    Coderbusy proxy (码农代理) https://proxy.coderbusy.com/
    :return:
    """
    urls = ['https://proxy.coderbusy.com/']
    for url in urls:
        tree = getHtmlTree(url)
        proxy_list = tree.xpath('.//table//tr')
        for tr in proxy_list[1:]:
            yield ':'.join(tr.xpath('./td/text()')[0:2])
def freeProxySecond(area=33, page=1):
    area = 33 if area > 33 else area
    for area_index in range(1, area + 1):
        for i in range(1, page + 1):
            url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i)
            html_tree = getHtmlTree(url)
            tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]")
            if len(tr_list) == 0:
                continue
            for tr in tr_list:
                yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0]
            # stop paging an area once a non-empty page has been scraped
            break
def freeProxySeventh():
    url_list = [
        'https://www.kuaidaili.com/free/inha/{page}/',
        'https://www.kuaidaili.com/free/intr/{page}/'
    ]
    for url in url_list:
        for page in range(1, 5):
            page_url = url.format(page=page)
            tree = getHtmlTree(page_url)
            proxy_list = tree.xpath('.//table//tr')
            for tr in proxy_list[1:]:
                yield ':'.join(tr.xpath('./td/text()')[0:2])
def freeProxyFifth(self):
    """
    Fetch goubanjia http://www.goubanjia.com/free/gngn/index.shtml
    :return:
    """
    url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
    for page in range(1, 10):
        page_url = url.format(page=page)
        tree = getHtmlTree(page_url)
        proxy_list = tree.xpath('//td[@class="ip"]')
        for each_proxy in proxy_list:
            yield ''.join(each_proxy.xpath('.//text()'))
def freeProxyThirteen():
    """
    https://ip.ihuan.me/address/5Lit5Zu9.html
    Strict anti-scraping; a very large free proxy list.
    :return:
    """
    url_list = ['https://ip.ihuan.me/']
    for url in url_list:
        tree = getHtmlTree(url)
        proxy_list = tree.xpath('.//table/tbody/tr')
        for tr in proxy_list[1:]:
            yield ':'.join(tr.xpath('./td/text()')[0:2])
def freeProxyFifth():
    """
    Fetch goubanjia http://www.goubanjia.com/free/gngn/index.shtml
    :return:
    """
    url = "http://www.goubanjia.com/free/gngn/index.shtml"
    tree = getHtmlTree(url)
    # the site now lists at most 15 proxies per day (a single page)
    for i in range(15):
        d = tree.xpath(
            './/table[@class="table"]/tbody/tr[{}]/td'.format(i + 1))[0]
        o = d.xpath('.//span/text() | .//div/text()')
        yield ''.join(o[:-1]) + ':' + o[-1]
def freeProxyProxydb():
    # Proxydb http://proxydb.net/?protocol=http&protocol=https&country=&offset=0
    url = "http://proxydb.net/?protocol=http&country=&offset={}"
    for offset in range(0, 150, 15):
        page_url = url.format(offset)
        tree = getHtmlTree(page_url)
        proxy_list = tree.xpath('//table//script/text()')
        for item in proxy_list:
            # the ip:port is assembled by an inline script; strip whitespace and
            # split on '=', ';' and quotes to recover the obfuscated pieces
            parts = re.split(r"=|;|'", "".join(item.split()))
            ip = parts[2][::-1] + parts[10]
            port = eval(parts[13])
            yield '{}:{}'.format(ip, port)
        time.sleep(2)
def freeProxyFourth():
    """
    Fetch Xici proxies http://api.xicidaili.com/free2016.txt
    :return:
    """
    url_list = ['http://www.xicidaili.com/nn',  # high anonymity
                'http://www.xicidaili.com/nt',  # transparent
                ]
    for each_url in url_list:
        tree = getHtmlTree(each_url)
        proxy_list = tree.xpath('.//table[@id="ip_list"]//tr')
        for proxy in proxy_list:
            yield ':'.join(proxy.xpath('./td/text()')[0:2])
def freeProxyPingRui():
    # PingRui
    url_list = ['http://pingrui.net/wn/',
                'http://pingrui.net/wt/']
    for each_url in url_list:
        tree = getHtmlTree(each_url)
        proxy_list = tree.xpath('.//table[@id="ip_list"]//tr')
        for proxy in proxy_list:
            try:
                yield ':'.join(proxy.xpath('./td/text()')[0:2])
            except Exception as e:
                pass
        time.sleep(2)
def freeProxy09(page_count=1):
    """
    http://ip.jiangxianli.com/?page= free proxy list
    :return:
    """
    for i in range(1, page_count + 1):
        url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
        html_tree = getHtmlTree(url)
        for index, tr in enumerate(html_tree.xpath("//table//tr")):
            if index == 0:
                continue
            yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
def freeProxySeventh():
    """
    Kuaidaili (快代理) https://www.kuaidaili.com
    """
    url_list = [
        'https://www.kuaidaili.com/free/inha/',
        'https://www.kuaidaili.com/free/intr/'
    ]
    for url in url_list:
        tree = getHtmlTree(url)
        proxy_list = tree.xpath('.//table//tr')
        for tr in proxy_list[1:]:
            yield ':'.join(tr.xpath('./td/text()')[0:2])
def freeProxyThird(days=1):
    """
    Fetch ip181 http://www.ip181.com/
    :param days:
    :return:
    """
    url = 'http://www.ip181.com/'
    html_tree = getHtmlTree(url)
    try:
        tr_list = html_tree.xpath('//tr')[1:]
        for tr in tr_list:
            yield ':'.join(tr.xpath('./td/text()')[0:2])
    except Exception as e:
        pass
def freeProxyFourth():
    """
    Fetch Xici proxies http://api.xicidaili.com/free2016.txt
    :return:
    """
    url_list = ['http://www.xicidaili.com/nn',  # high anonymity
                'http://www.xicidaili.com/nt',  # transparent
                ]
    for each_url in url_list:
        tree = getHtmlTree(each_url)
        proxy_list = tree.xpath('.//table[@id="ip_list"]//tr')
        for proxy in proxy_list:
            try:
                yield ':'.join(proxy.xpath('./td/text()')[0:2])
            except Exception as e:
                pass
def freeProxyFirst(page=10):
    """
    Fetch data5u (无忧代理) http://www.data5u.com/
    :param page: number of pages
    :return:
    """
    url_list = ['http://www.data5u.com/',
                'http://www.data5u.com/free/',
                'http://www.data5u.com/free/gngn/index.shtml',
                'http://www.data5u.com/free/gnpt/index.shtml']
    for url in url_list:
        html_tree = getHtmlTree(url)
        ul_list = html_tree.xpath('//ul[@class="l2"]')
        for ul in ul_list:
            try:
                yield ':'.join(ul.xpath('.//li/text()')[0:2])
            except Exception as e:
                pass
def freeProxySecond(area=33, page=1):
    """
    66ip proxy (代理66) http://www.66ip.cn/
    :param area: number of area index pages to crawl (area_index=1 Beijing, 2 Shanghai, ...)
    :param page: pages per area
    :return:
    """
    area = 33 if area > 33 else area
    for area_index in range(1, area + 1):
        for i in range(1, page + 1):
            url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i)
            html_tree = getHtmlTree(url)
            tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]")
            if len(tr_list) == 0:
                continue
            for tr in tr_list:
                yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0]
            # stop paging an area once a non-empty page has been scraped
            break
def freeProxyFourth(page_count=2):
    """
    Xici proxy (西刺代理) http://www.xicidaili.com
    :return:
    """
    url_list = [
        'http://www.xicidaili.com/nn/',  # high anonymity
        'http://www.xicidaili.com/nt/',  # transparent
    ]
    for each_url in url_list:
        for i in range(1, page_count + 1):
            page_url = each_url + str(i)
            tree = getHtmlTree(page_url)
            proxy_list = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]')
            for proxy in proxy_list:
                try:
                    yield ':'.join(proxy.xpath('./td/text()')[0:2])
                except Exception as e:
                    pass
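# A minimal way to exercise these fetchers. The helper below and the function
# selection in the usage comment are illustrative only (any subset of the
# generators above can be plugged in); it is not part of the original project.
def collect_proxies(fetchers):
    """Run each fetcher generator and return a de-duplicated list of proxies."""
    seen = set()
    for fetch in fetchers:
        try:
            for proxy in fetch():
                if proxy and proxy not in seen:
                    seen.add(proxy)
        except Exception as e:
            print('fetcher {} failed: {}'.format(getattr(fetch, '__name__', fetch), e))
    return list(seen)


# Example usage (functions defined above):
# proxies = collect_proxies([freeProxy04, freeProxyKuaidaili, freeProxy_xiladaili])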