Example #1
    def __init__(self):

        self.__first = True
        # Connect to the database
        self.dbmanager = ProxyDBManager()
        # Create the proxy table
        self.dbmanager.create_proxy_table()
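ProxyDBManager itself is defined elsewhere in the project. A minimal sketch of what such a wrapper might look like, assuming sqlite3 as the backend (the real project's backend, schema, and column names may differ; the fields below are inferred from the ProxyModel setters used in Example #2):

import sqlite3

class ProxyDBManager(object):

    def __init__(self, db_path='proxy.db'):
        # One connection shared by all table operations
        self.conn = sqlite3.connect(db_path)

    def create_proxy_table(self):
        # Columns mirror the ProxyModel fields set in Example #2
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS proxy ('
            'ip TEXT PRIMARY KEY, port INTEGER, type TEXT, '
            'anonymity TEXT, area TEXT, speed TEXT, agent TEXT, '
            'survival_time TEXT, failed_time INTEGER DEFAULT 0)'
        )
        self.conn.commit()

    def close_connection(self):
        self.conn.close()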
Example #2
def case_10():

    dao = ProxyDBManager()
    dao.create_proxy_table()

    proxy = ProxyModel()

    ip = '125.115.141.6'
    port = 8118
    http_type = 'HTTPS'
    anonymity = '高匿'        # "high anonymity"
    area = '浙江宁波'         # "Ningbo, Zhejiang"
    speed = '0.148秒'         # "0.148 seconds"
    agent = 'agent'
    survival_time = '4小时'   # "4 hours"

    proxy.set_ip(ip)
    proxy.set_port(port)
    proxy.set_type(http_type)
    proxy.set_anonymity(anonymity)
    # Handle a missing area value
    if area is None:
        proxy.set_area('')
    else:
        proxy.set_area(area)
    proxy.set_speed(speed)
    proxy.set_agent(agent)
    proxy.set_survival_time(survival_time)

    dao.insert_proxy_table(proxy)

    proxy_address = dao.select_random_proxy()
    print(proxy_address)

    if 'http://' in proxy_address:
        proxy_address = proxy_address.replace('http://', '')
    else:
        proxy_address = proxy_address.replace('https://', '')

    old_ip = proxy_address.split(':')[0]
    print('old IP : ', old_ip)
    dao.plus_proxy_faild_time(old_ip)
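After the scheme is stripped, proxy_address is a bare 'ip:port' string. A short sketch of how a consumer might plug it into the requests library (the test URL and timeout are illustrative assumptions, not part of the original example):

import requests

proxies = {
    'http': 'http://' + proxy_address,
    'https': 'https://' + proxy_address,
}
try:
    # httpbin.org/ip echoes the caller's IP, confirming the proxy is in use
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(resp.text)
except requests.RequestException:
    # The proxy failed; bump its failure count so the pool can drop it
    dao.plus_proxy_faild_time(old_ip)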
Example #3
import random

from apscheduler.schedulers.background import BackgroundScheduler

# Project-internal dependencies (ProxyDBManager, the spider classes,
# requestEnginer) are imported from elsewhere in the repository.


class ProxyPoolWorker(object):

    __MIN_PROXY_NUM = 15

    def __init__(self):

        self.__first = True
        # Connect to the database
        self.dbmanager = ProxyDBManager()
        # Drop and recreate the proxy table
        self.dbmanager.drop_proxy_table()
        self.dbmanager.create_proxy_table()

    """ 
    把 ProxyPoolWorker 实现为单例 
    """

    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '__instance'):
            new = super(ProxyPoolWorker, cls)
            cls.__instance = new.__new__(cls, *args)
        return cls.__instance

    """ 
    开始爬取 IP 代理 
    """

    def start_work(self):
        self.crawl_proxy_web()

        scheduler = BackgroundScheduler()  # background scheduler
        # Run the task every 10 seconds:
        # scheduler.add_job(self.__timedTask, 'interval', seconds=10)
        # Re-check proxy availability every 10 minutes
        scheduler.add_job(self.__check_ip_availability_task,
                          'interval',
                          minutes=10)
        scheduler.start()

    """ 
    检查 IP 是否可用 
    """

    def __check_ip_availability_task(self):
        pass

    def crawl_proxy_web(self):

        spiders = [
            XiciSpider,
            Data5uSpider,
            KuaidailiSpider,
        ]

        # for spider in spiders:
        # Crawl one randomly chosen proxy site instead of all of them
        spider = random.choice(spiders)
        models = spider.get_proxies()
        filtered_models = requestEnginer.filter_unavailable_proxy(models)
        for each in filtered_models:
            self.dbmanager.insert_proxy_table(each)

    """
    随机获取一个 IP 代理地址
    """

    def select_proxy_data(self):
        proxy = self.dbmanager.select_random_proxy()
        if proxy is not '':
            proxy = self.dbmanager.select_random_proxy()
            return proxy

    """
    代理地址失效, 数据库连接失败次数 +1
    """

    def plus_proxy_faild_time(self, ip):
        self.dbmanager.plus_proxy_faild_time(ip)

    """
    停止爬取 IP 代理
    """

    def stop_work(self):
        self.dbmanager.close_connection()
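Because __new__ caches the instance on the class, every construction returns the same object. A short usage sketch (note that __init__ still runs on each call, so the proxy table is dropped and recreated every time the constructor is invoked):

worker_a = ProxyPoolWorker()
worker_b = ProxyPoolWorker()
assert worker_a is worker_b  # both names refer to the cached singleton

worker_a.start_work()  # crawl once, then re-check the pool every 10 minutes
proxy = worker_a.select_proxy_data()
worker_a.stop_work()   # close the database connection when finished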