class ThreeFourSixFour(object):
    def __init__(self):
        """Crawler for www.3464.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.3464.com/data/Proxy/http/"

    def parser(self):
        html = self.getter.rget_data(self.url)
        html_ele = etree.HTML(html)
        tr_list = html_ele.xpath(
            '//div[@class="CommonBody"]/table[6]//table//tr')[1:]
        sql_list = list()
        for tr in tr_list:
            try:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
            except Exception:
                continue
            # skip IPs we have already seen
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, "http"))
                self.bf.insert(ip)
        for sql in sql_list:  # write to the database in one batch
            self.cm.exe(sql)

class YouDaiLi(object):
    def __init__(self):
        """Crawler for www.youdaili.net."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.youdaili.net/Daili/http/"

    def parser(self):
        url = etree.HTML(self.getter.rget_data(
            self.url)).xpath('//div[@class="chunlist"]/ul/li[1]/p/a/@href')[0]
        time.sleep(2)
        html = self.getter.rget_data(url)
        soup = BeautifulSoup(html, 'lxml')
        p_tag = soup.find_all('p')
        sql_list = list()
        for p in p_tag:
            ip_list = re.findall(
                '(.*?) ———— (.*?) ———— (.*?) ———— (.*?) ', p.get_text())
            if ip_list:
                # e.g. [('61.130.226.39', '20753', '浙江湖州', 'HTTPS')]
                ip = ip_list[0][0]
                port = ip_list[0][1]
                lx = ip_list[0][3]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
        for sql in sql_list:  # write to the database in one batch
            # print(sql)
            self.cm.exe(sql)
        self.cm.close()

class ThreeThreeSixSix(object):
    def __init__(self):
        """Crawler for www.ip3366.net."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.ip3366.net/?stype={}&page={}"

    def parser(self):
        for stype in range(1, 6):
            for page in range(1, 11):
                url = self.url.format(stype, page)
                time.sleep(2)
                try:
                    html = self.getter.rget_data(url)
                except Exception:
                    continue
                html_ele = etree.HTML(html)
                tr_list = html_ele.xpath('//table/tbody/tr')
                sql_list = list()
                for tr in tr_list:
                    ip = tr.xpath('./td[1]/text()')[0]
                    port = tr.xpath('./td[2]/text()')[0]
                    lx = tr.xpath('./td[4]/text()')[0]
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, lx))
                        self.bf.insert(ip)
                for sql in sql_list:  # write to the database in one batch
                    self.cm.exe(sql)

class TXTIPPage(object):
    def __init__(self):
        """Crawler for plain-text proxy list pages."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url_list = [
            'https://www.rmccurdy.com/scripts/proxy/output/http/ALL',
            'https://www.rmccurdy.com/scripts/proxy/output/socks/ALL',
            'https://www.rmccurdy.com/scripts/proxy/proxylist.txt',
            'http://www.proxylists.net/http_highanon.txt',
            'http://ab57.ru/downloads/proxyold.txt'
        ]

    def run(self):
        for url in self.url_list:
            data = self.getter.rget_data(url)
            ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', data)
            temp_l = [[ipport.split(":")[0], ipport.split(":")[1]]
                      for ipport in ip_list]
            sql_list = list()
            for temp in temp_l:
                ip = temp[0]
                port = temp[1]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
        self.cm.close()

class CoderBusy(object):
    def __init__(self):
        """Crawler for proxy.coderbusy.com (码农很忙)."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = [
            'https://proxy.coderbusy.com/',  # home page
            'https://proxy.coderbusy.com/classical/https-ready.aspx?page={}',  # HTTPS proxies
            'https://proxy.coderbusy.com/classical/post-ready.aspx?page={}',  # proxies that support POST
            'https://proxy.coderbusy.com/classical/anonymous-type/transparent.aspx?page={}',  # transparent proxies
            'https://proxy.coderbusy.com/classical/anonymous-type/anonymous.aspx?page={}',  # anonymous proxies
            'https://proxy.coderbusy.com/classical/anonymous-type/highanonymous.aspx?page={}',  # high-anonymity proxies
        ]

    def parser(self, page_lx):
        page = 1
        while True:
            try:
                html = self.getter.rget_data(page_lx.format(page))
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            next_page = etree.HTML(html).xpath(
                '//nav[@class="text-center"]/ul/li[@title="下一页"]/a/@href')
            soup = BeautifulSoup(html, 'lxml')
            proxies_list = soup.find('table', 'table').find_all('tr')
            sql_list = list()
            for proxy in proxies_list:
                temp = proxy.find_all('td')
                if temp:
                    # extract the IP
                    ip = temp[0].get_text().strip()
                    # extract the port: start from the data-i attribute and
                    # subtract every octet of the IP
                    port = int(temp[2].get("data-i"))
                    for num in ip.split('.'):
                        port -= int(num)
                    # extract the type
                    if temp[8].find('i'):
                        lx = 'https'
                    else:
                        lx = 'http'
                    # skip IPs we have already seen
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, lx))
                        self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def run(self):
        for page_lx in self.url:
            time.sleep(2)
            self.parser(page_lx)

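# A minimal, standalone sketch of the port arithmetic used in CoderBusy.parser
# above: the real port is the td's data-i attribute minus the sum of the four
# IP octets. The sample IP and data-i value below are made up for illustration,
# not taken from the live site.
def decode_coderbusy_port(ip, data_i):
    """Return data_i minus the sum of the IP's octets."""
    return data_i - sum(int(octet) for octet in ip.split('.'))

# e.g. if data-i were 8475 for IP 1.2.3.4, the decoded port would be
# 8475 - (1 + 2 + 3 + 4) = 8465
assert decode_coderbusy_port("1.2.3.4", 8475) == 8465
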
class Horocn(object):
    def __init__(self):
        """
        Crawler for proxy.horocn.com (蜻蜓代理)
        https://proxy.horocn.com/free-proxy.html?page={}
        """
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.img = ImagePort()
        self.port = "12345"
        self.url = "https://proxy.horocn.com/free-proxy.html?page={}"

    def parser(self):
        # page = 1
        page = 3000
        while True:
            try:
                html = self.getter.rget_data(self.url.format(page))
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            html_ele = etree.HTML(html)
            next_page = html_ele.xpath(
                '//ul[@class="pager"]//a[text()="下一页 →"]/@href')[0]
            tr_list = html_ele.xpath('//table/tbody/tr')
            sql_list = list()
            path_list = list()
            for tr in tr_list:
                ip = tr.xpath('./th[1]/text()')[0]
                # the port is rendered as a base64-encoded JPEG; save it to disk
                base_port_image = tr.xpath('./th[2]/img/@src')[0]
                photo = base64.b64decode(
                    re.search(r"data:image/jpeg;base64,(.*)",
                              base_port_image).group(1))
                path = "./{}.jpg".format(tr_list.index(tr))
                path_list.append(path)  # collect paths so they can be removed in one pass
                with open(path, "wb") as f:
                    f.write(photo)
                # OCR the port image, retrying up to 10 times
                for times in range(10):
                    try:
                        self.port = int(self.img.run(path))
                    except Exception:
                        continue
                    else:
                        break
                # skip IPs we have already seen
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, self.port, "http"))
                    self.bf.insert(ip)
            for path in path_list:
                os.remove(path)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
            if next_page != "javascript:;":
                page += 1
            else:
                break

class MiMi(object):
    def __init__(self):
        """Crawler for www.mimiip.com (秘密代理)."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = [
            "http://www.mimiip.com/gngao/{}",  # high-anonymity proxies
            "http://www.mimiip.com/gnpu/{}",   # ordinary-anonymity proxies
            "http://www.mimiip.com/gntou/{}",  # transparent proxies
            "http://www.mimiip.com/hw/{}"      # foreign proxies
        ]

    def parser(self, page_lx):
        page = 1
        while True:
            try:
                html = self.getter.rget_data(page_lx.format(page))
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            next_page = etree.HTML(html).xpath(
                '//div[@class="pagination"]//*[text()="下一页 ›"]/@href')
            soup = BeautifulSoup(html, 'lxml')
            proxies_list = soup.find('table', 'list').find_all('tr')
            sql_list = list()
            for proxy in proxies_list:
                temp = proxy.find_all('td')
                if temp:
                    ip = temp[0].get_text()          # extract the IP
                    port = temp[1].get_text()        # extract the port
                    lx = temp[4].get_text().lower()  # extract the type
                    # skip IPs we have already seen
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, lx))
                        self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def run(self):
        for page_lx in self.url:
            time.sleep(2)
            self.parser(page_lx)

class KuaiDaiLi(object):
    def __init__(self):
        """Crawler for www.kuaidaili.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)[0])
        time.sleep(2)
        for pageNum in range(1, total):
            url = format_url.format(pageNum)  # build the URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find(
                'table', {
                    'class': 'table table-bordered table-striped'
                }).find('tbody')
            sql_list = list()
            for proxy in proxy_list.find_all('tr'):
                tmp = proxy.find_all('td')
                ip = tmp[0].get_text()
                port = tmp[1].get_text()
                lx = tmp[3].get_text().lower()
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)

    def run(self):
        # domestic high-anonymity proxies
        self.parser("https://www.kuaidaili.com/free/inha/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/inha/{}/")
        # domestic ordinary proxies
        time.sleep(3)
        self.parser("https://www.kuaidaili.com/free/intr/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/intr/{}/")
        self.cm.close()  # close the database connection

class HinkyDink(object):
    def __init__(self):
        """Crawler for Hinky Dink's proxy list (www.mrhinkydink.com)."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)
            [0].strip("[").strip("]"))
        time.sleep(2)
        for pageNum in range(1, total):
            if pageNum == 1:
                url = total_url
            else:
                url = format_url.format(pageNum)  # build the URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            html_ele = etree.HTML(html)
            tr_list = html_ele.xpath(
                '//table[2]//tr[2]/td[3]/table//tr/td//table//tr[@class="text"]'
            )
            sql_list = list()
            for tr in tr_list:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)

    def run(self):
        self.parser(
            "http://www.mrhinkydink.com/proxies.htm",  # first page URL
            '//table[2]//tr[2]/td[3]/table//tr/td//table//tr[last()]/td/a[last()]/text()',
            "http://www.mrhinkydink.com/proxies{}.htm"  # format URL for page 2 onward
        )

class ThreeSixZero(object):
    def __init__(self):
        """Crawler for www.swei360.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = [
            "http://www.swei360.com/free/?page={}",          # domestic high-anonymity proxies
            "http://www.swei360.com/free/?stype=2&page={}",  # domestic ordinary proxies
            "http://www.swei360.com/free/?stype=3&page={}",  # foreign high-anonymity proxies
            "http://www.swei360.com/free/?stype=4&page={}"   # foreign ordinary proxies
        ]

    def parser(self, format_url):
        for pageNum in range(1, 8):  # the site only has 7 pages
            url = format_url.format(pageNum)  # build the URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            html_ele = etree.HTML(html)
            tr_list = html_ele.xpath('//table/tbody/tr')
            sql_list = list()
            for tr in tr_list:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
                lx = tr.xpath('./td[4]/text()')[0]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)

    def run(self):
        for format_url in self.url:
            time.sleep(2)
            self.parser(format_url)

class SixSixIP(object):
    def __init__(self):
        """Crawler for www.66ip.cn."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.66ip.cn/{}.html"

    def run(self):
        total = int(
            etree.HTML(
                self.getter.rget_data("http://www.66ip.cn/1.html")).xpath(
                    '//div[@id="PageList"]/a[last()-1]/text()')[0])
        time.sleep(3)
        # for pageNum in range(1, total):
        # for pageNum in range(1176, total):
        for pageNum in range(1200, total):
            url = self.url.format(pageNum)  # build the URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(3)  # sleep a few seconds to avoid getting blocked
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find('table', {"border": "2px"})
            sql_list = list()
            for proxy in proxy_list.find_all('tr')[1:]:
                ip = proxy.find_all('td')[0].get_text()    # extract the IP
                port = proxy.find_all('td')[1].get_text()  # extract the port
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
        self.cm.close()

class ListProxy(object):
    def __init__(self):
        """Crawler for list.proxylistplus.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-{}"

    def parser(self):
        total = int(
            etree.HTML(self.getter.rget_data(self.url.format(1))).xpath(
                '//div[@id="page"]/table[3]/tr/td[1]/a[last()]/text()')
            [0].strip('[').strip(']'))
        time.sleep(3)
        for pageNum in range(1, total):
            url = self.url.format(pageNum)  # build the URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(3)  # sleep a few seconds to avoid getting blocked
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find("table", 'bg').find_all('tr')
            sql_list = list()
            for proxy in proxy_list:
                tmp = proxy.find_all("td")
                if tmp:
                    ip = tmp[1].get_text()
                    port = tmp[2].get_text()
                    lx = tmp[6].get_text()
                    if lx == "yes":
                        lx = 'https'
                    else:
                        lx = 'http'
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, lx))
                        self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
        self.cm.close()

class Data5U(object):
    def __init__(self):
        """Crawler for www.data5u.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.data5u.com/free/index.shtml"

    def parser(self):
        html = self.getter.rget_data(self.url)
        soup = BeautifulSoup(html, "lxml")
        proxy_list = soup.find_all('ul', {'class': "l2"})
        sql_list = list()
        for proxy in proxy_list:
            tmp = proxy.find_all('li')
            ip = tmp[0].get_text()
            port_zimu = list(tmp[1].attrs.values())[0][1]  # the letter-encoded port
            lx = tmp[3].get_text()
            port = self.mk_port(port_zimu)
            # dedupe IPs with the Bloom filter
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
        for sql in sql_list:  # write to the database in one batch
            self.cm.exe(sql)
        self.cm.close()  # close the database connection

    def mk_port(self, port_word):
        """Decode the letter-encoded port: map each letter to its index in
        'ABCDEFGHIZ', join the digits, then shift the number right by 3 bits."""
        word = list(port_word)
        num_list = []
        for item in word:
            num = 'ABCDEFGHIZ'.find(item)
            num_list.append(str(num))
        port = int("".join(num_list)) >> 0x3
        return port

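# A standalone sketch of the Data5U port decoding in mk_port above, with a
# made-up example value: each letter maps to its index in 'ABCDEFGHIZ', the
# resulting digits are concatenated, and the number is shifted right by 3 bits.
def decode_data5u_port(port_word):
    digits = "".join(str('ABCDEFGHIZ'.find(ch)) for ch in port_word)
    return int(digits) >> 3

# 'GEA' -> digits "640" -> 640 >> 3 == 80
assert decode_data5u_port("GEA") == 80
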
class IP181(object):
    def __init__(self):
        """Crawler for www.ip181.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = 'http://www.ip181.com/'

    def parser(self):
        js_data = self.getter.rget_data(self.url)
        sql_list = list()
        for proxy in json.loads(js_data).get("RESULT"):
            ip = proxy.get('ip')
            port = proxy.get('port')
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, 'http'))
                self.bf.insert(ip)
        for sql in sql_list:  # write to the database in one batch
            self.cm.exe(sql)
        self.cm.close()  # close the database connection

class ProxyDB(object):
    def __init__(self):
        """Crawler for proxydb.net."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://proxydb.net/?offset={}"

    def parser(self, page_lx):
        # page = 0
        page = 150
        while True:
            try:
                html = self.getter.rget_data(page_lx.format(page * 15))
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(3)  # sleep a few seconds to avoid getting blocked
            html_ele = etree.HTML(html)
            next_page = html_ele.xpath('//nav/ul/li[2]/a/@href')
            add_num = html_ele.xpath('//div[@style="display:none"]/@*')[1]
            td_list = html_ele.xpath(
                '//table[contains(@class, "table")]/tbody/tr/td[1]/script/text()'
            )
            lx_list = html_ele.xpath(
                '//table[contains(@class, "table")]/tbody/tr/td[5]/text()')
            sql_list = list()
            for td in td_list:
                ip_h_reve = re.search(r"'(.*?)'.split", td).group(1)  # extract the reversed head of the IP
                ip_t_b64 = re.search(r"atob\('(.*?)'.replace", td).group(1)  # extract the base64-encoded tail
                p = re.search(r"pp = \((\d+) - \(", td).group(1)  # extract the base port to be adjusted
                ip, port = self.mk_ip_port(ip_h_reve, ip_t_b64, p, add_num)
                lx = lx_list[td_list.index(td)].strip().lower()
                if "socket" in lx:
                    lx = "http"
                # skip IPs we have already seen
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def mk_ip_port(self, ip_h_reve, ip_t_b64, p, add_n):
        """
        Assemble the IP and port from the obfuscated parameters scraped off the page.
        :param ip_h_reve: the reversed first part of the IP
        :param ip_t_b64: the base64-encoded second part of the IP
        :param p: the base port scraped from the page, still to be adjusted
        :param add_n: the value to add to the base port
        :return: (ip, port)
        """
        l_ip_head = list(ip_h_reve)
        l_ip_head.reverse()
        ip_head = ""
        for char in l_ip_head:
            ip_head += char
        # the codecs.getdecoder("unicode_escape")(ip_t_b64)[0] call below is
        # crucial: it un-escapes the scraped string before base64-decoding it
        ip_tail = base64.b64decode(
            codecs.getdecoder("unicode_escape")(ip_t_b64)[0]).decode()
        ip = ip_head + ip_tail
        port = int(p) + int(add_n)
        return ip, port

    def run(self):
        self.parser(self.url)

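# A standalone sketch of the ProxyDB reconstruction in mk_ip_port above, using
# fabricated inputs rather than values scraped from proxydb.net: the first half
# of the IP arrives reversed, the second half is base64-encoded, and the real
# port is the scraped base port plus a hidden offset from the page.
import base64

def decode_proxydb(ip_head_reversed, ip_tail_b64, base_port, add_num):
    ip = ip_head_reversed[::-1] + base64.b64decode(ip_tail_b64).decode()
    return ip, int(base_port) + int(add_num)

# ".981.39" reversed is "93.189.", "OC44MA==" decodes to "8.80",
# and 3100 + 28 gives port 3128
assert decode_proxydb(".981.39", "OC44MA==", "3100", "28") == ("93.189.8.80", 3128)
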
class CoolProxy(object):
    def __init__(self):
        """Crawler for www.cool-proxy.net."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "https://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:{}"

    def parser(self):
        page = 1
        while True:
            try:
                html = self.getter.rget_data(self.url.format(page))
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            next_page = etree.HTML(html).xpath(
                '//table//tr[last()]//span[last()]/a')
            soup = BeautifulSoup(html, 'lxml')
            tr_list = soup.find('table').find_all('tr')
            sql_list = list()
            for tr in tr_list:
                temp = tr.find_all('td')
                if temp:
                    try:
                        ip = self.mk_ip(
                            re.search(
                                r"str_rot13\(\"(.*?)\"\)",
                                temp[0].find('script').get_text()).group(1))
                    except Exception:
                        continue  # some rows are decoys with obfuscated content
                    port = temp[1].get_text()
                    # skip IPs we have already seen
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, "http"))
                        self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def mk_ip(self, en_ip):
        """
        Decode an IP such as `ZGH5Ywt5YwVlBF42At==` into a usable IP, e.g. 159.89.229.66.
        :param en_ip: the obfuscated IP scraped from the page
        :return: the decoded IP
        """
        letter_str = ""
        for char in en_ip:
            if char in "0123456789==":
                # digits and '=' are used as filler; append them unchanged
                letter_str += char
            else:
                head = ord(char[0])  # the character's Unicode code point
                tail = 13 if char.lower() < 'n' else -13  # ROT13 offset
                letter_str += chr(head + tail)  # append the un-rotated character
        return base64.b64decode(letter_str).decode()  # base64-decode the assembled string

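# A standalone sketch of the cool-proxy.net decoding in mk_ip above, checking
# the example given in its docstring: letters are ROT13-rotated while digits
# and '=' pass through unchanged, and the result is base64-decoded.
import base64

def decode_coolproxy_ip(en_ip):
    out = ""
    for ch in en_ip:
        if ch.isalpha():
            offset = 13 if ch.lower() < 'n' else -13  # ROT13
            out += chr(ord(ch) + offset)
        else:
            out += ch  # digits and padding pass through unchanged
    return base64.b64decode(out).decode()

# the docstring example: "ZGH5Ywt5YwVlBF42At==" -> "159.89.229.66"
assert decode_coolproxy_ip("ZGH5Ywt5YwVlBF42At==") == "159.89.229.66"
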
class XiCiDaiLi(object):
    def __init__(self):
        """Crawler for www.xicidaili.com."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)[0])
        time.sleep(2)
        for pageNum in range(1, total):
            url = format_url.format(pageNum)  # build the URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error: {}".format(e))
                continue
            time.sleep(3)  # sleep a few seconds to avoid getting blocked
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find('table', {
                'id': 'ip_list'
            }).find_all('tr')[1:]
            sql_list = list()
            for proxy in proxy_list:
                tmp = proxy.find_all('td')
                ip = tmp[1].get_text()
                port = tmp[2].get_text()
                lx = tmp[5].get_text().lower()
                if "socks" in lx:
                    lx = "http"
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
            for sql in sql_list:  # write to the database in one batch
                self.cm.exe(sql)

    def run(self):
        # domestic high-anonymity proxies
        self.parser("http://www.xicidaili.com/nn/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/nn/{}")
        time.sleep(2)
        # domestic transparent proxies
        self.parser("http://www.xicidaili.com/nt/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/nt/{}")
        time.sleep(2)
        # HTTPS proxies
        self.parser("http://www.xicidaili.com/wn/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/wn/{}")
        time.sleep(2)
        # HTTP proxies
        self.parser("http://www.xicidaili.com/wt/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/wt/{}")
        self.cm.close()

class GouBanJia(object):
    def __init__(self):
        """Crawler for www.goubanjia.com (全网代理IP)."""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.goubanjia.com/"

    def parser(self):
        """
        This function is adapted from:
        https://blog.csdn.net/weixin_37586648/article/details/78868015
        """
        html = self.getter.rget_data(self.url)
        # parse the html
        soup = BeautifulSoup(html, "lxml")
        # ---------↓ added: extract the proxy type ↓---------
        lx_list = list()
        ip_port_list = list()
        for tr in soup.find_all('tr'):
            temp = tr.find_all('td')
            if temp:
                lx = temp[2].get_text()
                lx_list.append(lx)
        # ---------↑ added: extract the proxy type ↑---------
        # get all the <td> cells that hold IPs
        td_list = soup.select('td[class="ip"]')
        for td in td_list:
            # get all child tags of the current td
            child_list = td.find_all()
            ip_port = ""
            for child in child_list:
                if 'style' in child.attrs.keys():
                    if child.attrs['style'].replace(
                            ' ', '') == "display:inline-block;":
                        if child.string is not None:
                            ip_port = ip_port + child.string
                # filter out the port number
                elif 'class' in child.attrs.keys():
                    class_list = child.attrs['class']
                    if 'port' in class_list:
                        port = self.mk_port(class_list[1])
                        # append the decoded port
                        ip_port = ip_port + ":" + str(port)
                else:
                    if child.string is not None:
                        ip_port = ip_port + child.string
            # from here on the code is my own
            ip_port_list.append(ip_port)
        return lx_list, ip_port_list

    def run(self):
        lx_list, ip_port_list = self.parser()
        sql_list = list()
        for ip_port in ip_port_list:
            lx = lx_list[ip_port_list.index(ip_port)]
            ip = ip_port.split(":")[0]
            port = ip_port.split(":")[1]
            # dedupe IPs with the Bloom filter
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
        for sql in sql_list:  # write to the database in one batch
            self.cm.exe(sql)
        self.cm.close()  # close the database connection

    def mk_port(self, port_word):
        """Decode the letter-encoded port (same scheme as Data5U.mk_port)."""
        word = list(port_word)
        num_list = []
        for item in word:
            num = 'ABCDEFGHIZ'.find(item)
            num_list.append(str(num))
        port = int("".join(num_list)) >> 0x3
        return port