import re
import json
import time

from bs4 import BeautifulSoup
from lxml import etree

# GETTER (a retrying HTTP fetcher), ConnMysql (a thin MySQL wrapper) and
# BloomFilter (Redis-backed dedup on key 'allip') are project-local helpers
# assumed to be importable from elsewhere in this repo.


class TXTIPPage(object):
    def __init__(self):
        """Spider for plain-text proxy lists."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url_list = [
            'https://www.rmccurdy.com/scripts/proxy/output/http/ALL',
            'https://www.rmccurdy.com/scripts/proxy/output/socks/ALL',
            'https://www.rmccurdy.com/scripts/proxy/proxylist.txt',
            'http://www.proxylists.net/http_highanon.txt',
            'http://ab57.ru/downloads/proxyold.txt'
        ]

    def run(self):
        for url in self.url_list:
            data = self.getter.rget_data(url)
            ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', data)
            sql_list = list()
            for ipport in ip_list:
                ip, port = ipport.split(":")
                if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
            for sql in sql_list:  # flush all inserts for this source at once
                self.cm.exe(sql)
        self.cm.close()
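# The INSERT statements in these spiders are assembled with str.format, which
# breaks (and is injectable) if a scraped field ever contains a quote. Below
# is a minimal parameterized sketch; it assumes direct access to a pymysql
# connection, since the ConnMysql wrapper's interface is not shown here and
# may not support placeholders.
def insert_proxy_parameterized(conn, ip, port, lx):
    """Insert one proxy row using placeholders instead of string formatting.

    `conn` is assumed to be a pymysql connection object.
    """
    with conn.cursor() as cursor:
        cursor.execute(
            "INSERT INTO allip (`ip`, `port`, `type`) VALUES (%s, %s, %s)",
            (ip, port, lx))
    conn.commit()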
class YouDaiLi(object):
    def __init__(self):
        """Spider for youdaili.net."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.youdaili.net/Daili/http/"

    def parser(self):
        # Follow the first entry on the index page to the newest list.
        url = etree.HTML(self.getter.rget_data(
            self.url)).xpath('//div[@class="chunlist"]/ul/li[1]/p/a/@href')[0]
        time.sleep(2)
        html = self.getter.rget_data(url)
        soup = BeautifulSoup(html, 'lxml')
        p_tag = soup.find_all('p')
        sql_list = list()
        for p in p_tag:
            ip_list = re.findall(
                '(.*?) ———— (.*?) ———— (.*?) ———— (.*?) ', p.get_text())
            if ip_list:
                # e.g. [('61.130.226.39', '20753', '浙江湖州', 'HTTPS')]
                ip = ip_list[0][0]
                port = ip_list[0][1]
                lx = ip_list[0][3]
                if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
        for sql in sql_list:  # flush all inserts at once
            # print(sql)
            self.cm.exe(sql)
        self.cm.close()
class KuaiDaiLi(object):
    def __init__(self):
        """Spider for kuaidaili.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        # The second-to-last pager link holds the total page count.
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)[0])
        time.sleep(2)
        for pageNum in range(1, total + 1):  # include the last page
            url = format_url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # brief pause so the site doesn't ban us
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find(
                'table', {
                    'class': 'table table-bordered table-striped'
                }).find('tbody')
            sql_list = list()
            for proxy in proxy_list.find_all('tr'):
                tmp = proxy.find_all('td')
                ip = tmp[0].get_text()
                port = tmp[1].get_text()
                lx = tmp[3].get_text().lower()
                if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
            for sql in sql_list:  # flush all inserts for this page at once
                self.cm.exe(sql)

    def run(self):
        # Domestic high-anonymity proxies
        self.parser("https://www.kuaidaili.com/free/inha/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/inha/{}/")
        time.sleep(3)
        # Domestic regular proxies
        self.parser("https://www.kuaidaili.com/free/intr/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/intr/{}/")
        self.cm.close()  # close the DB connection
class ListProxy(object):
    def __init__(self):
        """Spider for proxylistplus.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-{}"

    def parser(self):
        # The last pager cell reads like "[6]"; strip the brackets.
        total = int(
            etree.HTML(self.getter.rget_data(self.url.format(1))).xpath(
                '//div[@id="page"]/table[3]/tr/td[1]/a[last()]/text()')
            [0].strip('[]'))
        time.sleep(3)
        for pageNum in range(1, total + 1):  # include the last page
            url = self.url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(3)  # brief pause so the site doesn't ban us
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find("table", 'bg').find_all('tr')
            sql_list = list()
            for proxy in proxy_list:
                tmp = proxy.find_all("td")
                if tmp:  # skip header rows, which have no <td> cells
                    ip = tmp[1].get_text()
                    port = tmp[2].get_text()
                    # Column 7 flags HTTPS support as "yes"/"no".
                    lx = 'https' if tmp[6].get_text() == "yes" else 'http'
                    if not self.bf.isContains(ip):  # Bloom-filter dedup
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, lx))
                        self.bf.insert(ip)
            for sql in sql_list:  # flush all inserts for this page at once
                self.cm.exe(sql)
        self.cm.close()
class SixSixIP(object):
    def __init__(self):
        """Spider for 66ip.cn."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.66ip.cn/{}.html"

    def run(self):
        total = int(
            etree.HTML(
                self.getter.rget_data("http://www.66ip.cn/1.html")).xpath(
                    '//div[@id="PageList"]/a[last()-1]/text()')[0])
        time.sleep(3)
        # Hard-coded resume point left over from an interrupted run;
        # use range(1, total + 1) for a full crawl.
        for pageNum in range(1200, total):
            url = self.url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(3)  # brief pause so the site doesn't ban us
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find('table', {"border": "2px"})
            sql_list = list()
            for proxy in proxy_list.find_all('tr')[1:]:  # skip the header row
                ip = proxy.find_all('td')[0].get_text()    # the IP
                port = proxy.find_all('td')[1].get_text()  # the port
                if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
            for sql in sql_list:  # flush all inserts for this page at once
                self.cm.exe(sql)
        self.cm.close()
class Data5U(object):
    def __init__(self):
        """Spider for data5u.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.data5u.com/free/index.shtml"

    def parser(self):
        html = self.getter.rget_data(self.url)
        soup = BeautifulSoup(html, "lxml")
        proxy_list = soup.find_all('ul', {'class': "l2"})
        sql_list = list()
        for proxy in proxy_list:
            tmp = proxy.find_all('li')
            ip = tmp[0].get_text()
            # The port is obfuscated as a letter-coded CSS class on the
            # second <li>; decode it with mk_port below.
            port_zimu = list(tmp[1].attrs.values())[0][1]
            lx = tmp[3].get_text()
            port = self.mk_port(port_zimu)
            if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
        for sql in sql_list:  # flush all inserts at once
            self.cm.exe(sql)
        self.cm.close()  # close the DB connection

    def mk_port(self, port_word):
        """Decode the letter-coded port: each letter's index in 'ABCDEFGHIZ'
        is one digit; join the digits, then shift right by 3 (divide by 8)."""
        num_list = [str('ABCDEFGHIZ'.find(item)) for item in port_word]
        return int("".join(num_list)) >> 0x3
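# A quick sanity check of the mk_port decoding above: each letter maps to its
# index in 'ABCDEFGHIZ' (A=0 ... I=8, Z=9), the digits are concatenated, and
# the result is shifted right by 3 bits (integer division by 8). With the
# hypothetical encoded value 'GEA': G,E,A -> "6","4","0" -> 640 >> 3 == 80.
#
#     >>> int("".join(str('ABCDEFGHIZ'.find(c)) for c in 'GEA')) >> 3
#     80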
class IP181(object):
    def __init__(self):
        """Spider for ip181.com, which serves its list as JSON."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = 'http://www.ip181.com/'

    def parser(self):
        js_data = self.getter.rget_data(self.url)
        sql_list = list()
        for proxy in json.loads(js_data).get("RESULT"):
            ip = proxy.get('ip')
            port = proxy.get('port')
            if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, 'http'))
                self.bf.insert(ip)
        for sql in sql_list:  # flush all inserts at once
            self.cm.exe(sql)
        self.cm.close()  # close the DB connection
class GouBanJia(object):
    def __init__(self):
        """Spider for goubanjia.com ("whole-network proxy IP")."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.goubanjia.com/"

    def parser(self):
        """
        Parsing approach adapted from
        https://blog.csdn.net/weixin_37586648/article/details/78868015
        """
        html = self.getter.rget_data(self.url)
        soup = BeautifulSoup(html, "lxml")
        # Added on top of the referenced post: collect the proxy type from
        # the third column of each row.
        lx_list = list()
        ip_port_list = list()
        for tr in soup.find_all('tr'):
            temp = tr.find_all('td')
            if temp:
                lx_list.append(temp[2].get_text())
        # Each "ip" cell mixes visible fragments with decoys hidden via CSS;
        # keep only the visible children and decode the port class.
        td_list = soup.select('td[class="ip"]')
        for td in td_list:
            child_list = td.find_all()  # all child tags of the current td
            ip_port = ""
            for child in child_list:
                if 'style' in child.attrs.keys():
                    if child.attrs['style'].replace(
                            ' ', '') == "display:inline-block;":
                        if child.string is not None:
                            ip_port = ip_port + child.string
                # Pick the port out of the letter-coded class attribute.
                elif 'class' in child.attrs.keys():
                    class_list = child.attrs['class']
                    if 'port' in class_list:
                        port = self.mk_port(class_list[1])
                        ip_port = ip_port + ":" + str(port)
                else:
                    if child.string is not None:
                        ip_port = ip_port + child.string
            ip_port_list.append(ip_port)
        return lx_list, ip_port_list

    def run(self):
        lx_list, ip_port_list = self.parser()
        sql_list = list()
        # zip keeps types and addresses aligned even when an ip:port string
        # repeats (list.index would always return the first occurrence).
        for lx, ip_port in zip(lx_list, ip_port_list):
            ip, port = ip_port.split(":")
            if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
        for sql in sql_list:  # flush all inserts at once
            self.cm.exe(sql)
        self.cm.close()  # close the DB connection

    def mk_port(self, port_word):
        """Same letter-to-digit decode as Data5U.mk_port."""
        num_list = [str('ABCDEFGHIZ'.find(item)) for item in port_word]
        return int("".join(num_list)) >> 0x3
class XiCiDaiLi(object):
    def __init__(self):
        """Spider for xicidaili.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        # The second-to-last pager link holds the total page count.
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)[0])
        time.sleep(2)
        for pageNum in range(1, total + 1):  # include the last page
            url = format_url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(3)  # brief pause so the site doesn't ban us
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find('table', {
                'id': 'ip_list'
            }).find_all('tr')[1:]  # skip the header row
            sql_list = list()
            for proxy in proxy_list:
                tmp = proxy.find_all('td')
                ip = tmp[1].get_text()
                port = tmp[2].get_text()
                lx = tmp[5].get_text().lower()
                if "socks" in lx:  # store socks entries as plain http
                    lx = "http"
                if not self.bf.isContains(ip):  # Bloom-filter dedup on the IP
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
            for sql in sql_list:  # flush all inserts for this page at once
                self.cm.exe(sql)

    def run(self):
        # Domestic high-anonymity proxies
        self.parser("http://www.xicidaili.com/nn/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/nn/{}")
        time.sleep(2)
        # Domestic transparent proxies
        self.parser("http://www.xicidaili.com/nt/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/nt/{}")
        time.sleep(2)
        # HTTPS proxies
        self.parser("http://www.xicidaili.com/wn/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/wn/{}")
        time.sleep(2)
        # HTTP proxies
        self.parser("http://www.xicidaili.com/wt/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/wt/{}")
        self.cm.close()
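# A minimal driver sketch that runs every spider in sequence. The entry-point
# block and the serial ordering are assumptions, not part of the original
# code; some classes expose run() while the rest expose parser(), so we pick
# whichever exists.
if __name__ == '__main__':
    spiders = [TXTIPPage(), YouDaiLi(), KuaiDaiLi(), ListProxy(),
               SixSixIP(), Data5U(), IP181(), GouBanJia(), XiCiDaiLi()]
    for spider in spiders:
        entry = getattr(spider, 'run', None) or spider.parser
        entry()
        time.sleep(3)  # pause between sources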