def get_proxies(self):
        # Load the logging configuration
        get_log_config()

        proxy_model_list = []

        print('Crawling data5u proxies...')

        response = super(Data5uSpider, self).get_proxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//ul[@class="l2"]')

        for info in infos:
            try:
                # Query relative to the current <ul> row instead of re-running
                # an absolute XPath over the whole document on every pass.
                ip = info.xpath('./span[1]/li/text()')[0]  # ip
                port = info.xpath('./span[2]/li/text()')[0]  # port
                anonymity = info.xpath('./span[3]/li/a/text()')[0]  # anonymity
                http_type = info.xpath('./span[4]/li/a/text()')[0]  # type
                area = info.xpath('./span[6]/li/a[1]/text()')[0]  # region: province
                area = area + info.xpath('./span[6]/li/a[2]/text()')[0]  # region: city
                speed = info.xpath('./span[8]/li/text()')[0]  # speed

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed + " | ")

                if http_type in ('http', 'https'):
                    proxy = Proxy()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    proxy.set_http_type(http_type)
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    proxy.set_survival_time("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                logging.debug(e)

        logging.debug("抓取 " + self.agent + " 网站共计 " +
                      str(len(proxy_model_list)) + " 个代理")

        return proxy_model_list
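Every example on this page fills a Proxy model through setter calls, but the model itself never appears. A minimal sketch of what it could look like, assuming plain attribute-backed setters (the field names mirror the calls above; everything else here is an assumption):

class Proxy:
    """Hypothetical proxy model; one field per setter used in the spiders."""

    def __init__(self):
        self.ip = ''
        self.port = ''
        self.http_type = ''
        self.anonymity = ''
        self.area = ''
        self.speed = ''
        self.agent = ''
        self.survival_time = ''

    def set_ip(self, ip):
        self.ip = ip

    def set_port(self, port):
        self.port = port

    def set_http_type(self, http_type):
        self.http_type = http_type

    def set_anonymity(self, anonymity):
        self.anonymity = anonymity

    def set_area(self, area):
        self.area = area

    def set_speed(self, speed):
        self.speed = speed

    def set_agent(self, agent):
        self.agent = agent

    def set_survival_time(self, survival_time):
        self.survival_time = survival_time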
Example #2

    def get_proxies(self):

        # Load the logging configuration
        get_log_config()

        proxy_model_list = []

        print('Crawling kuaidaili proxies...')

        response = super(KuaidailiSpider, self).get_proxies()

        # Raw strings, so \s is a regex escape rather than an invalid
        # Python string escape.
        pattern = re.compile(
            r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
            r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
            re.S)

        infos = re.findall(pattern, response.text)

        for item in infos:
            try:
                ip = item[0]  # ip
                port = item[1]  # port
                anonymity = item[2]  # anonymity
                http_type = item[3]  # type
                area = item[4]  # region
                speed = item[5]  # speed

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed)

                if http_type in ('HTTP', 'HTTPS'):
                    proxy = Proxy()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    proxy.set_http_type(http_type.lower())
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    proxy.set_survival_time("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                logging.debug(e)

        logging.debug("抓取 " + self.agent + " 网站共计 " +
                      str(len(proxy_model_list)) + " 个代理")

        return proxy_model_list
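The regex above assumes the seven <td> cells of each row appear in a fixed order. A quick self-contained check of that assumption against a hand-written sample row (the HTML below is invented for illustration; the live kuaidaili markup may differ):

import re

pattern = re.compile(
    r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
    r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
    re.S)

sample = '''<tr>
<td>1.2.3.4</td>
<td>8080</td>
<td>高匿名</td>
<td>HTTP</td>
<td>Beijing</td>
<td>0.5s</td>
<td>2023-01-01</td>
</tr>'''

# One tuple per row, one group per cell:
# [('1.2.3.4', '8080', '高匿名', 'HTTP', 'Beijing', '0.5s', '2023-01-01')]
print(re.findall(pattern, sample))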
Example #3
    def get_proxies(self):
        # Load the logging configuration
        get_log_config()

        proxy_model_list = []

        print('Crawling xici proxies...')

        response = super(XiciSpider, self).get_proxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//tr[@class="odd"]')

        for info in infos:
            try:
                ip = info.xpath('./td[2]/text()')[0]  # ip
                port = info.xpath('./td[3]/text()')[0]  # port
                anonymity = info.xpath('./td[5]/text()')[0]  # anonymity
                http_type = info.xpath('./td[6]/text()')[0]  # type
                area = info.xpath('./td[4]/a/text()')[0]  # region
                speed = info.xpath('./td[7]/div/@title')[0]  # speed
                survival_time = info.xpath('./td[9]/text()')[0]  # survival time

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed + " | " +
                      survival_time)

                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type)
                proxy.set_anonymity(anonymity)
                # Handle a missing region
                if area is None:
                    proxy.set_area('')
                else:
                    proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time(survival_time)
                proxy_model_list.append(proxy)

            except Exception as e:
                logging.debug(e)

        logging.debug("抓取 " + self.agent + " 网站共计 " +
                      str(len(proxy_model_list)) + " 个代理")

        return proxy_model_list
Example #4
    def __init__(self):
        # Load the logging configuration
        get_log_config()

        self.connect = pymysql.connect(
            host='127.0.0.1',  # database host
            port=3306,  # database port
            db='doctor',  # database name
            user='******',  # database user
            passwd='liu998wei',  # database password
            charset='utf8',  # character encoding
            use_unicode=True)

        # Execute inserts, deletes, queries, and updates through the cursor
        self.cursor = self.connect.cursor()
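With the connection and cursor in place, writes would go through parameterized queries followed by an explicit commit. A hedged sketch of such a method (the doctor database's table and column names are not shown anywhere on this page, so doctor_info, name, and hospital below are pure assumptions):

    def insert_doctor(self, name, hospital):
        # Parameterized query: pymysql escapes the values itself,
        # which avoids SQL injection from scraped input.
        sql = 'INSERT INTO doctor_info (name, hospital) VALUES (%s, %s)'
        try:
            self.cursor.execute(sql, (name, hospital))
            # pymysql does not autocommit by default.
            self.connect.commit()
        except Exception as e:
            self.connect.rollback()
            logging.debug(e)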
Example #5
    def get_proxies(self):

        get_log_config()

        proxy_model_list = []

        print('Crawling xici proxies...')

        response = super(XiciSpider, self).get_proxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//tr[@class="odd"]')

        for info in infos:
            try:
                ip = info.xpath('./td[2]/text()')[0]
                port = info.xpath('./td[3]/text()')[0]
                anonymity = info.xpath('./td[5]/text()')[0]
                http_type = info.xpath('./td[6]/text()')[0]
                area = info.xpath('./td[4]/a/text()')[0]
                speed = info.xpath('./td[7]/div/@title')[0]
                survival_time = info.xpath('./td[9]/text()')[0]

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed + " | " +
                      survival_time)

                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type)
                proxy.set_anonymity(anonymity)
                if area is None:
                    proxy.set_area("")
                else:
                    proxy.set_area(area)

                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time(survival_time)
                proxy_model_list.append(proxy)
                print(len(proxy_model_list))
            except Exception as e:
                logging.debug(e)

        logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")

        return proxy_model_list
Example #6
    def get_proxies(self):

        # Load the logging configuration
        get_log_config()

        proxy_model_list = []

        print('Crawling ip181 proxies...')

        response = super(Ip181Spider, self).get_proxies()
        # This site is encoded in gb2312
        response.encoding = 'gb2312'
        selector = etree.HTML(response.text)

        infos = selector.xpath('//div[@class="col-md-12"]/table/tbody/tr')
        for i, info in enumerate(infos):
            try:
                ip = info.xpath('./td[1]/text()')[0]  # ip
                port = info.xpath('./td[2]/text()')[0]  # port
                anonymity = info.xpath('./td[3]/text()')[0]  # anonymity
                http_type = info.xpath('./td[4]/text()')[0]  # type
                speed = info.xpath('./td[5]/text()')[0]  # speed
                area = info.xpath('./td[6]/text()')[0]  # region
                # print(ip + " | " + port + " | " + anonymity + " | " + http_type + " | " + speed + " | " + area)

                if i == 1:
                    # Skip the table header row
                    continue

                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                if http_type == 'HTTP,HTTPS':
                    # Dual-protocol proxies are listed as a single row
                    proxy.set_http_type('http')
                else:
                    proxy.set_http_type(http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
            except Exception as e:
                logging.debug(e)
        logging.debug("抓取 " + self.agent + " 网站共计 " +
                      str(len(proxy_model_list)) + " 个代理")

        return proxy_model_list
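The gb2312 fix above hard-codes the site's encoding. A hedged alternative, assuming a plain requests.Response, is to re-detect the charset from the body when the HTTP headers do not declare one (the URL below is a guess based on the spider's name):

import requests

response = requests.get('http://www.ip181.com/')
# With no charset in the Content-Type header, requests defaults to
# ISO-8859-1; apparent_encoding re-detects it from the body bytes.
if response.encoding in (None, 'ISO-8859-1'):
    response.encoding = response.apparent_encoding
print(response.text[:200])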
Example #7
    def __init__(self):

        get_log_config()

        self.__proxy_table = 'proxy'

        self.conn = pymysql.connect(host=config.MYSQL_HOST,
                                    port=3307,
                                    db=config.MYSQL_DBNAME,
                                    user=config.MYSQL_USER,
                                    passwd=config.MYSQL_PASSWORD,
                                    charset='utf8',
                                    use_unicode=False)

        # Create the cursor directly: wrapping this in "with self.conn:"
        # would end the transaction (and, on recent pymysql versions,
        # close the connection) as soon as __init__ returns.
        self.cursor = self.conn.cursor()
Example #8
    def get_proxies(self):

        get_log_config()

        proxy_model_list = []

        print('Crawling kuaidaili proxies...')

        response = super(KuaidailiSpider, self).get_proxies()

        pattern = re.compile(
            r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
            r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
            re.S
        )

        infos = re.findall(pattern, response.text)

        for item in infos:
            try:
                ip = item[0]
                port = item[1]
                anonymity = item[2]
                http_type = item[3]
                area = item[4]
                speed = item[5]

                print(ip + " | " + port + " | " + anonymity + " | " + http_type + " | " + area + " | " + speed)

                if http_type in ('HTTP', 'HTTPS'):
                    proxy = Proxy()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    proxy.set_http_type(http_type.lower())
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    proxy.set_survival_time("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                logging.debug(e)

        logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")

        return proxy_model_list
Example #9
    def __init__(self):

        # Load the logging configuration
        get_log_config()

        self.__proxy_table = 'proxy'

        self.conn = pymysql.connect(
            host=config.MYSQL_HOST,
            db=config.MYSQL_DBNAME,
            user=config.MYSQL_USER,
            passwd=config.MYSQL_PASSWORD,
            charset='utf8',  # Set the charset, or Chinese text may come back garbled
            use_unicode=False)

        # Create the cursor directly: "with self.conn:" would end the
        # transaction (and, on recent pymysql versions, close the
        # connection) as soon as __init__ returns.
        self.cursor = self.conn.cursor()
Example #10
    Stop the spider and the proxy pool
    """

    def stop(self):
        self.isRunning = False
        # Release resources
        get_proxy_pool_worker().stop_work()

    """
    启动爬虫和代理池
    """

    def start(self):
        self.start_proxy_pool()
        self.start_spider()


if __name__ == '__main__':
    # Load the logging configuration
    get_log_config()

    manager = SpiderManager()
    """
    Demospider() 是 Scrapy 项目 spider 目录下的爬虫脚本名字
    这里需要更换成 你项目的 爬虫名
    """
    spider_list = [
        PengpaiSpider(),
    ]
    manager.start()