def crawl_data5u(self):
    start_url = 'http://www.data5u.com/free/gngn/index.shtml'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
        'Host': 'www.data5u.com',
        'Referer': 'http://www.data5u.com/free/index.shtml',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    }
    html = get_page(start_url, options=headers)
    if html:
        # raw string so \d and \. are regex metacharacters, not string escapes
        ip_address = re.compile(
            r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')

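# Every crawler in this section leans on a shared get_page(url, options=None)
# helper that is not shown here. A minimal sketch of what it is assumed to do:
# fetch the URL with a browser-like User-Agent, merge any extra headers passed
# via `options`, and return the response body on HTTP 200 (None otherwise).
# This is an illustration of the assumed contract, not the source's own code.

import requests

BASE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
}

def get_page(url, options=None):
    # merge per-site headers over the defaults
    headers = dict(BASE_HEADERS, **(options or {}))
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.exceptions.RequestException:
        print('Failed to get', url)
    return None
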
def crawl_goubanjia(self):
    start_url = 'http://www.goubanjia.com/'
    html = get_page(start_url)
    if html:
        doc = pq(html)
        tds = doc('td.ip').items()
        for td in tds:
            # the ip cell is padded with decoy <p> nodes; drop them before reading the text
            td.find('p').remove()
            yield td.text().replace(' ', '')

def crawl_ip3366(self):
    for page in range(1, 4):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
        html = get_page(start_url)
        if html:
            # \s* also matches the newlines between the table cells
            ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')

def crawl_ip3366(self):
    # xpath variant; requires `from lxml import etree` at module level
    for i in range(1, 4):
        start_url = "http://www.ip3366.net/?stype=1&page={}".format(i)
        html_str = get_page(start_url)
        if html_str:
            html = etree.HTML(html_str)
            tr_list = html.xpath("//*[@id='list']/table//tbody/tr")
            for tr in tr_list:
                ip = tr.xpath("./td[1]/text()")[0]
                port = tr.xpath("./td[2]/text()")[0]
                result = ip + ":" + port
                yield result.replace(' ', '')

def crawl_kuaidaili(self):
    for i in range(1, 4):
        start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
        html = get_page(start_url)
        if html:
            find_ip = re.compile('<td data-title="IP">(.*?)</td>')
            re_ip_address = find_ip.findall(html)
            find_port = re.compile('<td data-title="PORT">(.*?)</td>')
            re_port = find_port.findall(html)
            # zip pairs the IP and port lists positionally; both come from the same rows
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')

def crawl_xicidaili(self):
    for i in range(1, 3):
        start_url = "https://www.xicidaili.com/nn/{}".format(i)
        html_str = get_page(start_url)
        if html_str:
            html = etree.HTML(html_str)
            tr_list = html.xpath("//*[@id='ip_list']//tr")[1:]  # skip the header row
            for tr in tr_list:
                ip = tr.xpath(".//td[2]/text()")[0]
                port = tr.xpath(".//td[3]/text()")[0]
                yield ":".join([ip, port])

def get_raw_proxies(self, page_count=5):
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = pq(html)
            # tr:gt(0) skips the header row
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])

def crawl_89ip(self):
    for i in range(1, 8):
        start_url = 'http://www.89ip.cn/index_{}.html'.format(i)
        html = get_page(start_url)
        if html:
            find_address_ports = re.compile(
                r'(\d+\.\d+\.\d+\.\d+).*?</td>.*?<td>.*?(\d+).*?</td>', re.S)
            address_ports = find_address_ports.findall(html)
            for address, port in address_ports:
                result = address + ":" + port
                yield result
        time.sleep(5)

def crawl_goubanjia(self):
    """
    Fetch proxies from Goubanjia
    :return: proxy
    """
    start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
    html = get_page(start_url)
    if html:
        doc = pq(html)
        tds = doc('td.ip').items()
        for td in tds:
            td.find('p').remove()
            yield td.text().replace(' ', '')

def crawl_ip3366(self):
    for i in range(1, 4):
        start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
        html = get_page(start_url)
        if html:
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            for s in range(1, len(trs)):
                find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile(r'<td>(\d+)</td>')
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

def crawl_proxy360(self):
    """
    Fetch proxies from Proxy360
    :return: proxy
    """
    start_url = 'http://www.proxy360.cn/Region/China'
    print('Crawling', start_url)
    html = get_page(start_url)
    if html:
        doc = pq(html)
        lines = doc('div[name="list_proxy_ip"]').items()
        for line in lines:
            ip = line.find('.tbBottomLine:nth-child(1)').text()
            port = line.find('.tbBottomLine:nth-child(2)').text()
            yield ':'.join([ip, port])

def crawl_iphai(self):
    start_url = 'http://www.iphai.com/'
    html = get_page(start_url)
    if html:
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
            find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
            re_port = find_port.findall(trs[s])
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')

def crawl(page=3):
    crawl_url = 'https://www.kuaidaili.com/free/inha/{}/'
    proxies = list()
    for i in range(1, page):
        soup = get_page(crawl_url.format(i))
        if soup:
            trs = soup.find('table', {'class': 'table table-bordered table-striped'}).find('tbody')
            for tr in trs.find_all('tr'):
                tmp = tr.find_all('td')
                proxy = ':'.join([tmp[0].get_text(), tmp[1].get_text()])
                print('crawl proxy...', proxy)
                proxies.append(proxy)
        time.sleep(1)  # pause between pages
    return proxies

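# Unlike the other snippets, crawl() above treats the return value of
# get_page() as a parsed BeautifulSoup document (it calls .find() on it), not
# as an HTML string. If you mix it with the text-returning helper sketched
# earlier, parse the result first; a hypothetical adapter:

from bs4 import BeautifulSoup

def get_soup(url):
    html = get_page(url)
    return BeautifulSoup(html, 'lxml') if html else None
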
def crawl_iphai(self):
    start_url = "http://www.iphai.com/"
    html = get_page(start_url)
    if html:
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for tr in range(1, len(trs)):
            find_ip = re.compile(r'<td>\s*(\d+\.\d+\.\d+\.\d+)\s*</td>', re.S)
            find_port = re.compile(r'<td>\s*(\d+)\s*</td>', re.S)
            re_ip = find_ip.findall(trs[tr])
            re_port = find_port.findall(trs[tr])
            for ip, port in zip(re_ip, re_port):
                ip_port = ip + ":" + port
                yield ip_port.replace(' ', '')

def crawl_deili66(self, page_count=4):
    """
    Fetch proxies from 66ip
    :param page_count: number of pages to crawl
    :return: proxy
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])

def crawl_ip336(self):
    for page in range(1, 4):
        start_url = "http://www.ip3366.net/free/?stype=1&page={}".format(page)
        html_str = get_page(start_url)
        if html_str:
            # \s* also matches the newlines between the table cells
            ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(html_str)
            for ip, port in re_ip_address:
                result = ip + ":" + port
                yield result.replace(' ', '')

def crawl_ip3366(self):
    """
    Fetch proxies from ip3366
    :return: proxy
    """
    for page in range(1, 6):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
        headers = {
            'Host': 'www.ip3366.net',
            'Upgrade-Insecure-Requests': '1',
        }
        time.sleep(5)
        html = get_page(start_url, headers)
        if html:
            # \s* also matches the newlines between the table cells
            ip_address = re.compile(
                r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
            ip_addresses = ip_address.findall(html)
            for address, port in ip_addresses:
                result = address + ":" + port
                yield result.replace(" ", "")

def crawl_xicidaili(self):
    for i in range(1, 10):
        start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = get_page(start_url, options=headers)
        if html:
            find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
            trs = find_trs.findall(html)
            for tr in trs:
                find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(tr)
                find_port = re.compile(r'<td>(\d+)</td>')
                re_port = find_port.findall(tr)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
        time.sleep(5)

def crwal_kuaidaili(self):
    # note: hard-coded session cookies like these expire quickly
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Cookie": "channelid=0; sid=1552453215605710; _ga=GA1.2.1976991072.1552454708; _gid=GA1.2.1280637029.1553239774; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1552454708,1553239774; _gat=1; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1553241731",
        'Host': 'www.kuaidaili.com',
        "Accept-Language": "zh-CN,zh;q=0.9",
        'Referer': 'https://www.kuaidaili.com/free/inha/3',
        'Upgrade-Insecure-Requests': '1',
    }
    for i in range(1, 4):
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i)
        html = get_page(start_url, options=headers)
        if html:
            find_ip = re.compile('<td data-title="IP">(.*?)</td>')
            find_port = re.compile('<td data-title="PORT">(.*?)</td>')
            re_ip = find_ip.findall(html)
            re_port = find_port.findall(html)
            for ip, port in zip(re_ip, re_port):
                result = ip + ":" + port
                yield result.replace(' ', '')

def crawl_xicidaili(self):
    for i in range(1, 3):
        start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = get_page(start_url, options=headers)
        if html:
            find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
            trs = find_trs.findall(html)
            for tr in trs:
                find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(tr)
                find_port = re.compile(r'<td>(\d+)</td>')
                re_port = find_port.findall(tr)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

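# The crawl_* generators above are typically attached to a single class and
# drained by a driver into a deduplicated pool. A hypothetical sketch of such
# a driver (the Crawler class and method discovery are assumptions, not the
# source's own design):

class Crawler:
    def get_proxies(self):
        proxies = set()
        # discover every crawl_* generator method defined on the class
        for name in dir(self):
            if name.startswith('crawl_'):
                # each crawl_* method yields 'ip:port' strings
                for proxy in getattr(self, name)():
                    proxies.add(proxy)
        return list(proxies)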