Example #1

import re


class TXTIPPage(object):
    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url_list = [
            'https://www.rmccurdy.com/scripts/proxy/output/http/ALL',
            'https://www.rmccurdy.com/scripts/proxy/output/socks/ALL',
            'https://www.rmccurdy.com/scripts/proxy/proxylist.txt',
            'http://www.proxylists.net/http_highanon.txt',
            'http://ab57.ru/downloads/proxyold.txt'
        ]

    def run(self):
        for url in self.url_list:
            data = self.getter.rget_data(url)
            ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', data)
            temp_l = [ipport.split(":") for ipport in ip_list]
            sql_list = list()
            for temp in temp_l:
                ip = temp[0]
                port = temp[1]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
            for sql in sql_list:  # execute all inserts in one batch
                self.cm.exe(sql)
        self.cm.close()
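
All nine examples lean on three project helpers that are not shown on this page: GETTER (an HTTP fetcher with retries), ConnMysql (a thin MySQL wrapper), and BloomFilter (a Redis-backed dedup filter). Their real implementations are not included in the source, so the sketch below is an assumption: it matches only the interface the crawlers call (rget_data, exe, close, isContains, insert) and uses pymysql, redis, and requests with made-up connection settings.

import hashlib

import pymysql
import redis
import requests


class GETTER(object):
    """Assumed helper: GET a URL, retrying up to rtimes before failing."""

    def __init__(self, rtimes=10):
        self.rtimes = rtimes

    def rget_data(self, url):
        for _ in range(self.rtimes):
            try:
                resp = requests.get(url, timeout=10)
                resp.raise_for_status()
                return resp.text
            except requests.RequestException:
                continue
        raise IOError("failed to fetch {}".format(url))


class ConnMysql(object):
    """Assumed helper: thin wrapper over a pymysql connection."""

    def __init__(self):
        # Connection settings are placeholders, not from the source.
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='', db='proxy', charset='utf8')

    def exe(self, sql):
        with self.conn.cursor() as cur:
            cur.execute(sql)
        self.conn.commit()

    def close(self):
        self.conn.close()


class BloomFilter(object):
    """Assumed helper: Bloom filter over a Redis bitmap under `key`."""

    def __init__(self, key):
        self.key = key
        self.r = redis.StrictRedis()

    def _offsets(self, value):
        # Derive four bit positions from the md5 digest of the value.
        h = hashlib.md5(value.encode('utf-8')).hexdigest()
        return [int(h[i:i + 8], 16) % (1 << 25) for i in range(0, 32, 8)]

    def isContains(self, value):
        return all(self.r.getbit(self.key, o) for o in self._offsets(value))

    def insert(self, value):
        for o in self._offsets(value):
            self.r.setbit(self.key, o, 1)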
Example #2

import re
import time

from bs4 import BeautifulSoup
from lxml import etree


class YouDaiLi(object):
    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.youdaili.net/Daili/http/"

    def parser(self):
        url = etree.HTML(self.getter.rget_data(
            self.url)).xpath('//div[@class="chunlist"]/ul/li[1]/p/a/@href')[0]
        time.sleep(2)
        html = self.getter.rget_data(url)
        soup = BeautifulSoup(html, 'lxml')
        p_tag = soup.find_all('p')
        sql_list = list()
        for p in p_tag:
            ip_list = re.findall(
                '(.*?)    ————    (.*?)    ————    (.*?)    ————    (.*?)    ',
                p.get_text())
            if ip_list:
                # [('61.130.226.39', '20753', '浙江湖州', 'HTTPS')]
                ip = ip_list[0][0]
                port = ip_list[0][1]
                lx = ip_list[0][3]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
        for sql in sql_list:  # execute all inserts in one batch
            self.cm.exe(sql)
        self.cm.close()
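
The long separator in the findall pattern is the literal text the site renders between fields. A quick standalone check against a made-up line that mirrors the sample tuple in the comment above:

import re

# Made-up input mirroring the site's layout; values are illustrative.
line = "61.130.226.39    ————    20753    ————    浙江湖州    ————    HTTPS    "
print(re.findall(
    '(.*?)    ————    (.*?)    ————    (.*?)    ————    (.*?)    ', line))
# [('61.130.226.39', '20753', '浙江湖州', 'HTTPS')]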
Example #3

import time

from bs4 import BeautifulSoup
from lxml import etree


class KuaiDaiLi(object):
    def __init__(self):
        """快代理的爬虫"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)[0])
        time.sleep(2)
        for pageNum in range(1, total + 1):
            url = format_url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error occurred: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting banned
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find(
                'table', {
                    'class': 'table table-bordered table-striped'
                }).find('tbody')
            sql_list = list()
            for proxy in proxy_list.find_all('tr'):
                tmp = proxy.find_all('td')
                ip = tmp[0].get_text()
                port = tmp[1].get_text()
                lx = tmp[3].get_text().lower()
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
            for sql in sql_list:  # execute all inserts in one batch
                self.cm.exe(sql)

    def run(self):
        # domestic high-anonymity proxies
        self.parser("https://www.kuaidaili.com/free/inha/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/inha/{}/")
        # domestic regular proxies
        time.sleep(3)
        self.parser("https://www.kuaidaili.com/free/intr/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/intr/{}/")
        self.cm.close()  # close the database connection
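
The pagination idiom in parser (read the next-to-last page-nav link to learn the total page count, then format every page URL) can be exercised on its own; the HTML snippet here is invented for illustration:

from lxml import etree

# Invented nav fragment standing in for the real kuaidaili page footer.
sample = ('<div id="listnav"><ul><li><a>1</a></li><li><a>2</a></li>'
          '<li><a>3573</a></li><li><a>&gt;</a></li></ul></div>')
total = int(etree.HTML(sample).xpath(
    '//div[@id="listnav"]/ul/li[last()-1]/a/text()')[0])
urls = ["https://www.kuaidaili.com/free/inha/{}/".format(n)
        for n in range(1, total + 1)]
print(total, urls[0], urls[-1])  # 3573 .../inha/1/ .../inha/3573/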
Example #4

import time

from bs4 import BeautifulSoup
from lxml import etree


class ListProxy(object):

    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-{}"

    def parser(self):
        total = int(
            etree.HTML(self.getter.rget_data(self.url.format(1))).xpath(
                '//div[@id="page"]/table[3]/tr/td[1]/a[last()]/text()')
            [0].strip('[').strip(']'))
        time.sleep(3)
        for pageNum in range(1, total + 1):
            url = self.url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error occurred: {}".format(e))
                continue
            time.sleep(3)  # sleep three seconds to avoid getting banned
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find("table", 'bg').find_all('tr')
            sql_list = list()
            for proxy in proxy_list:
                tmp = proxy.find_all("td")
                if tmp:
                    ip = tmp[1].get_text()
                    port = tmp[2].get_text()
                    lx = tmp[6].get_text()
                    if lx == "yes":
                        lx = 'https'
                    else:
                        lx = 'http'
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, lx))
                        self.bf.insert(ip)
            for sql in sql_list:  # execute all inserts in one batch
                self.cm.exe(sql)
        self.cm.close()
Example #5

import time

from bs4 import BeautifulSoup
from lxml import etree


class SixSixIP(object):
    def __init__(self):
        """66ip代理的爬虫"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.66ip.cn/{}.html"

    def run(self):
        total = int(
            etree.HTML(
                self.getter.rget_data("http://www.66ip.cn/1.html")).xpath(
                    '//div[@id="PageList"]/a[last()-1]/text()')[0])
        time.sleep(3)
        # resume from page 1200; earlier pages were crawled in previous runs
        for pageNum in range(1200, total + 1):
            url = self.url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error occurred: {}".format(e))
                continue
            time.sleep(3)  # sleep three seconds to avoid getting banned
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find('table', {"border": "2px"})
            sql_list = list()
            for proxy in proxy_list.find_all('tr')[1:]:
                ip = proxy.find_all('td')[0].get_text()  # extract the IP
                port = proxy.find_all('td')[1].get_text()  # extract the port
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
            for sql in sql_list:  # execute all inserts in one batch
                self.cm.exe(sql)
        self.cm.close()
Example #6

from bs4 import BeautifulSoup


class Data5U(object):
    def __init__(self):
        """data5u代理的爬虫"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.data5u.com/free/index.shtml"

    def parser(self):
        html = self.getter.rget_data(self.url)
        soup = BeautifulSoup(html, "lxml")
        proxy_list = soup.find_all('ul', {'class': "l2"})
        sql_list = list()  # collect inserts to run in one batch
        for proxy in proxy_list:
            tmp = proxy.find_all('li')
            ip = tmp[0].get_text()
            port_zimu = list(tmp[1].attrs.values())[0][1]  # obfuscated port word
            lx = tmp[3].get_text()
            port = self.mk_port(port_zimu)
            # Bloom-filter dedup on the IP
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
        for sql in sql_list:  # execute all inserts in one batch
            self.cm.exe(sql)
        self.cm.close()  # close the database connection

    def mk_port(self, port_word):
        # Each letter encodes one digit: A=0, B=1, ..., I=8, Z=9.
        word = list(port_word)
        num_list = []
        for item in word:
            num = 'ABCDEFGHIZ'.find(item)
            num_list.append(str(num))
        # The decoded number is the real port shifted left by 3 bits,
        # so shift right by 3 to recover the port.
        port = int("".join(num_list)) >> 0x3
        return port
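
To make the decoding concrete: port 80 would be served as the letter word GEA, since G, E, A map to the digits 6, 4, 0 and 640 >> 3 == 80. A standalone check ('GEA' is an illustrative value, not scraped from the page):

# Standalone version of the decode; 'GEA' is an assumed sample word.
def decode_port(port_word):
    digits = "".join(str('ABCDEFGHIZ'.find(c)) for c in port_word)
    return int(digits) >> 3

assert decode_port('GEA') == 80  # 'GEA' -> '640'; 640 >> 3 == 80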
Example #7

import json


class IP181(object):
    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = 'http://www.ip181.com/'

    def parser(self):
        js_data = self.getter.rget_data(self.url)
        sql_list = list()  # collect inserts to run in one batch
        for proxy in json.loads(js_data).get("RESULT"):
            ip = proxy.get('ip')
            port = proxy.get('port')
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, 'http'))
                self.bf.insert(ip)
        for sql in sql_list:  # execute all inserts in one batch
            self.cm.exe(sql)
        self.cm.close()  # close the database connection
Example #8

from bs4 import BeautifulSoup


class GouBanJia(object):
    def __init__(self):
        """全网代理IP的爬虫"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.goubanjia.com/"

    def parser(self):
        """
        Adapted from:
        https://blog.csdn.net/weixin_37586648/article/details/78868015
        """
        html = self.getter.rget_data(self.url)
        # parse the HTML
        soup = BeautifulSoup(html, "lxml")
        # --- added by the author: collect the proxy type column ---
        lx_list = list()
        ip_port_list = list()
        for tr in soup.find_all('tr'):
            temp = tr.find_all('td')
            if temp:
                lx = temp[2].get_text()
                lx_list.append(lx)
        # --- end of the added type collection ---
        # all <td> cells that hold an IP
        td_list = soup.select('td[class="ip"]')
        for td in td_list:
            # all child tags of the current td
            child_list = td.find_all()
            ip_port = ""
            for child in child_list:
                if 'style' in child.attrs.keys():
                    if child.attrs['style'].replace(
                            ' ', '') == "display:inline-block;":
                        if child.string is not None:
                            ip_port = ip_port + child.string
                # pick out the port number
                elif 'class' in child.attrs.keys():
                    class_list = child.attrs['class']
                    if 'port' in class_list:
                        port = self.mk_port(class_list[1])
                        # append the port
                        ip_port = ip_port + ":" + str(port)
                else:
                    if child.string is not None:
                        ip_port = ip_port + child.string
            # from here on, own code again
            ip_port_list.append(ip_port)
        return lx_list, ip_port_list

    def run(self):
        lx_list, ip_port_list = self.parser()
        sql_list = list()
        for lx, ip_port in zip(lx_list, ip_port_list):
            ip = ip_port.split(":")[0]
            port = ip_port.split(":")[1]
            # Bloom-filter dedup on the IP
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
        for sql in sql_list:  # execute all inserts in one batch
            self.cm.exe(sql)
        self.cm.close()  # close the database connection

    def mk_port(self, port_word):
        # Same letter-to-digit decoding as Data5U.mk_port:
        # A=0 ... I=8, Z=9, then shift right by 3 bits.
        word = list(port_word)
        num_list = []
        for item in word:
            num = 'ABCDEFGHIZ'.find(item)
            num_list.append(str(num))
        port = int("".join(num_list)) >> 0x3
        return port
Example #9

import time

from bs4 import BeautifulSoup
from lxml import etree


class XiCiDaiLi(object):
    def __init__(self):
        """西刺代理爬虫"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)[0])
        time.sleep(2)
        for pageNum in range(1, total + 1):
            url = format_url.format(pageNum)  # build the page URL
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error occurred: {}".format(e))
                continue
            time.sleep(3)  # sleep three seconds to avoid getting banned
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find('table', {
                'id': 'ip_list'
            }).find_all('tr')[1:]
            sql_list = list()
            for proxy in proxy_list:
                tmp = proxy.find_all('td')
                ip = tmp[1].get_text()
                port = tmp[2].get_text()
                lx = tmp[5].get_text().lower()
                if "socks" in lx:
                    lx = "http"
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
                else:
                    pass
            for sql in sql_list:  # 一次性操作数据库
                self.cm.exe(sql)

    def run(self):
        # domestic high-anonymity proxies
        self.parser("http://www.xicidaili.com/nn/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/nn/{}")
        time.sleep(2)
        # domestic transparent proxies
        self.parser("http://www.xicidaili.com/nt/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/nt/{}")
        time.sleep(2)
        # HTTPS proxies
        self.parser("http://www.xicidaili.com/wn/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/wn/{}")
        time.sleep(2)
        # HTTP proxies
        self.parser("http://www.xicidaili.com/wt/1",
                    '//div[@id="body"]/div[2]/a[last()-1]/text()',
                    "http://www.xicidaili.com/wt/{}")
        self.cm.close()
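
A closing note that applies to every class above: the INSERT statements are assembled with str.format, which breaks (and is injectable) if a scraped field ever contains a quote. If the ConnMysql wrapper exposed its underlying pymysql connection, the per-page loops could hand their rows to one parameterized executemany instead; this is a suggested alternative under that assumption, not what the source does:

# Hedged alternative: batch parameterized inserts via pymysql.
# Assumes `conn` is a pymysql connection exposed by ConnMysql.
def save_batch(conn, rows):
    """rows: iterable of (ip, port, type) tuples."""
    sql = "INSERT INTO allip (`ip`, `port`, `type`) VALUES (%s, %s, %s)"
    with conn.cursor() as cur:
        cur.executemany(sql, rows)
    conn.commit()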