Example #1
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies2 = self.crawler.get_proxies(callback)
                print(proxies2)
                for i in proxies2:
                    print('__________________________')
                    self.redis.add(i)
Example #2
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """ Check if the mount of proxies is over threshold."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started.')
        if PRIVATE_PROXY_ENABLE:
            proxies = PrivateProxy().get_proxies()
            for proxy in proxies:
                print('Add private proxy {}'.format(proxy))
                self.redis.add(proxy)
        else:
            if not self.is_over_threshold():
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    for proxy in proxies:
                        self.redis.add(proxy)
Example #3
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # self.crawler.__CrawlFuncCount__ is the number of crawl-prefixed
            # functions collected by the metaclass (see the sketch after this example)
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
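
The __CrawlFunc__ and __CrawlFuncCount__ attributes used above are never defined in these snippets; they are injected by a metaclass on Crawler (shown as ProxyMetaclass in Examples #22, #24 and #25). A minimal sketch of what such a metaclass could look like, assuming it simply records the names of every crawl_-prefixed method:

class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        # Collect the names of all methods starting with 'crawl_' at class-creation time
        crawl_funcs = [key for key in attrs if key.startswith('crawl_')]
        attrs['__CrawlFunc__'] = crawl_funcs            # list of crawl method names
        attrs['__CrawlFuncCount__'] = len(crawl_funcs)  # how many crawl methods exist
        return super().__new__(mcs, name, bases, attrs)

With such a metaclass, Crawler.get_proxies(callback) presumably looks the method up by name (for example via getattr(self, callback)()) and collects the proxies it yields.
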
Example #4
class Getter(object):
    """
    代理IP获取器
    """
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_limit(self):
        """
        检测是否超过代理的最大限制
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """
        通过python定义的元类可以顺序执行以crawl_开头的函数
        :return:
        """
        print("获取器开始运行,爬取免费代理")
        if not self.is_over_limit():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # Call the proxy-fetching function
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy=proxy)
Example #5
class GetterProxy(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print("获取器开始执行")
        if not self.is_over_threshold():
            for callback_index in range(Crawler.__CrawlFuncCount__):
                # Look up the crawl method
                callback = self.crawler.__CrawlFunc__[callback_index]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                # Add the proxies
                for proxy in proxies:
                    self.redis.add(proxy)
Example #6
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()  # print() does not write immediately; output is buffered first. Flushing stdout here makes log lines appear right away instead of waiting for the buffer to fill.
                for proxy in proxies:
                    self.redis.add(proxy)
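
As the comment above explains, standard output is buffered. A small self-contained illustration of the same idea (not part of the proxy pool code): without the flush, the partial lines below may only appear once the loop finishes.

import sys
import time

for i in range(3):
    print('heartbeat', i, end=' ')
    sys.stdout.flush()  # push the buffered text out immediately
    # print('heartbeat', i, end=' ', flush=True) achieves the same in one call
    time.sleep(1)
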
Example #7
    def handle_getter(self):
        """
        Crawl proxies
        """
        crawler = Crawler()
        client = RedisClient()
        while True:
            for proxy in crawler.start_crawl():
                client.add(proxy)
            sleep(20)
Example #8
class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def run(self):
        if self.redis.count() < POOL_UPPER_THRESHOLD:
            #for crawl_func_label in range(self.crawler.__CrawlFuncCount__):
            for crawl_func in self.crawler.__CrawlFunc__:
                #crawl_func = self.crawler.__CrawlFunc__[crawl_func_label]
                proxies = self.crawler.start_crawl_func(crawl_func)
                print(crawl_func, 'is crawling proxies')
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
        proxy_sum = self.redis.count()
        print('Current number of proxies:', proxy_sum)
Example #9
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def log(self):
        if not os.path.exists('log2'):
            os.mkdir('log2')
        log_file_name = 'log2/' + LOG_PATH
        log_file_1 = logging.FileHandler(log_file_name, 'a', encoding='utf-8')
        fmt = logging.Formatter(
            fmt=
            "%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s")
        log_file_1.setFormatter(fmt)
        logger1 = logging.Logger('run_log', level=logging.DEBUG)
        logger1.addHandler(log_file_1)

        return logger1

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """爬取到代理设置初始分数,直接存入redis"""
        print('Getter started')
        if not self.is_over_threshold():
            try:
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if not proxies:
                        self.log().error('Proxy crawl failed, crawl function: %s' % callback)
                        continue
                    for proxy in proxies:
                        self.redis.add(proxy)
            except Exception as e:
                self.log().exception(e)
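
Note that log() above builds a fresh Logger and FileHandler on every call. A common alternative, shown here only as a sketch (the log_dir and log_name defaults stand in for the original LOG_PATH setting), reuses logging.getLogger so repeated calls do not attach duplicate handlers:

import logging
import os

def get_run_logger(log_dir='log2', log_name='run.log'):
    """Return a shared logger, attaching its file handler only once."""
    os.makedirs(log_dir, exist_ok=True)
    logger = logging.getLogger('run_log')
    if not logger.handlers:  # skip if a handler was already attached
        handler = logging.FileHandler(os.path.join(log_dir, log_name), 'a', encoding='utf-8')
        handler.setFormatter(logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger
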
Example #10
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        # Note: despite the name, this returns True while the pool is still UNDER the threshold
        if self.redis.get_count() < MAX_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        if self.is_over_threshold():
            for i in range(self.crawler.__CrawlCount__):
                proxies = self.crawler.get_proxies(
                    self.crawler.__CrawlFunc__[i])
                for proxy in proxies:
                    self.redis.add(proxy)
Example #11
class ValidityTester(object):
    # Check whether each proxy works and save the usable ones.
    def __init__(self):
        self._raw_proxies=None
    def set_raw_proxies(self,proxies):
        self._raw_proxies=proxies
        self._redis=RedisClient()
    async def check_single_proxy(self,proxy):
        # Check a single proxy
        if isinstance(proxy,bytes):
            proxy=proxy.decode('utf-8')
        real_proxy='http://'+proxy
        try:
            async with aiohttp.ClientSession() as session:

                try:
                    async with session.get(TEST_API,proxy=real_proxy,timeout=PROXY_TIMEOUT) as response:
                        print('Check proxy',proxy)
                        if response.status==200:
                            self._redis.add(proxy)
                            print('Add to redis',proxy)
                except (ProxyConnectionError,TimeoutError):
                    print('Not adding proxy', proxy)
                    await session.close()

        except(ServerDisconnectedError, ClientResponseError,ClientConnectorError,Exception) as s:
            print(s)
            await session.close()

    def check_some_proxies(self):
        '''
        Run an event loop that checks every proxy in _raw_proxies.
        If _raw_proxies is empty or None, return without doing anything.
        '''
        if not self._raw_proxies:
            return
        try:
            print('Check_some_proxies Ing')
            loop=asyncio.get_event_loop()
            tasks=[self.check_single_proxy(task) for task in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except Exception as e:
            print('Check_some_proxies Error', e)
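
A minimal usage sketch for this tester (the proxy strings below are hypothetical; TEST_API, PROXY_TIMEOUT and RedisClient are assumed to be configured elsewhere in the project):

tester = ValidityTester()
tester.set_raw_proxies(['127.0.0.1:8080', '10.0.0.2:3128'])  # hypothetical raw proxies
tester.check_some_proxies()  # proxies that return HTTP 200 from TEST_API are added to Redis
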
Example #12
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = CrawlerProxy()

    def is_over_threshold(self):
        if self.redis.count() > POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('***** Getter started *****')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback=callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #13
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print("the getter programmer started!")
        if not self.is_over_thershold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #14
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    # Check whether the proxy pool has reached its limit
    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for index in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[index]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #15
class Getter:
    def __init__(self):
        self._conn = RedisClient()
        self._crawler = Crawler()

    def is_over_threshold(self):
        if self._conn.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print("获取器开始运行")
        if not self.is_over_threshold():
            for callback_index in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_index]
                proxies = self._crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self._conn.add(proxy)
Example #16
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # Loop over the crawl-function counter
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # Use the index to take the crawl function out of the function list;
                # the crawler returns a generator of proxies
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                try:
                    '''
                    get_proxies iterates over the crawler's results, collects
                    the values into a proxy list and returns it
                    '''
                    proxies = self.crawler.get_proxies(callback)
                    # proxies: list
                except Exception:
                    print("\033[1;31;40mSomething went wrong here...\033[0m")
                    print(f'Crawler {callback} raised an error and needs debugging')
                    continue  # skip this crawler, otherwise proxies would be undefined below
                # Flush the output buffer so results are printed continuously
                sys.stdout.flush()
                # Iterate over the proxies and add them to the database
                for proxy in proxies:
                    try:
                        self.redis.add(proxy)
                    except OSError as e:
                        print(f"\033[1;31;40mAn error occurred... {e}\033[0m")
Example #17
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    if self.first_test(proxy):  # if the quick test succeeded
                        print('Adding proxy:', proxy)
                        self.redis.add(proxy)  # add it to the pool

    def first_test(self, proxy):
        print('Pre-testing proxy:', proxy)
        proxies = {
            "http": "http://{}".format(proxy),
        }
        try:
            r = requests.get(TEST_URL, proxies=proxies, timeout=4)
            if r.status_code == 200:
                return True
        except requests.RequestException:
            print('Test failed, discarding proxy', proxy)
        return False
Example #18
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到代理池的界限
        """
        if self.redis.count() >= POOL_UPPER_LIMIT:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()  # flush buffered output so the fetched proxy info is printed immediately
                for proxy in proxies:
                    self.redis.add(proxy)
Example #19
class Getter(object):
    def __init__(self):
        """
        初始化数据库与爬虫
        """
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_full(self):
        """
        判断代理数目是否达上限
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        logger.info('Getter started......')
        if not self.is_full():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #20
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Judge whether the threshold has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Start execution')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
Example #21
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()
    
    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False
    
    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
Example #22
# Built with ProxyMetaclass, so the class gets the __CrawlFunc__ and __CrawlFuncCount__ attributes
class Crawler(object, metaclass=ProxyMetaclass):
    def __init__(self):
        self.redis = RedisClient()

    def save_to_db(self, proxies):
        for proxy in proxies:
            sys.stdout.flush()
            print('Successfully fetched proxy', proxy)
            self.redis.add(proxy)

    async def crawl_daili66(self):
        """获取代理66, 外国ip多"""
        urls = [
            'http://www.66ip.cn/{}.html'.format(page) for page in range(1, 5)
        ]
        print('Crawling')
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    ip_port = ':'.join([ip, port])
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366(self):
        """云代理index"""
        urls = [
            'http://www.ip3366.net/?stype=1&page={}'.format(page)
            for page in range(1, 5)
        ]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(page)
                for s in range(1, len(trs)):
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https': 'http://' + ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366_(self):
        """云代理free"""
        urls = [
            'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            for page in range(1, 5)
        ]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(
                    '<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                # ip_port = [(re_ip_address[i] + ':' + re_port[i]).replace(' ', '') for i in range(len(re_port))]
                # proxies.append(ip_port)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_kuaidaili(self):
        """快代理(都是http的)"""
        urls = [
            'http://www.kuaidaili.com/free/inha/{}/'.format(page)
            for page in range(1, 5)
        ]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(page)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xicidaili(self):
        """西刺代理"""
        urls = [
            'http://www.xicidaili.com/nn/{}'.format(page)
            for page in range(1, 4)
        ]
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie':
            '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            for page in html:
                find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(page)
                for tr in trs:
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https': 'http://' + ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_iphai(self):
        """ip海"""
        urls = ['http://www.iphai.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html[0])
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>',
                                     re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_data5u(self):
        """data5u"""
        urls = ['http://www.data5u.com/free/gngn/index.shtml']
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Cookie':
            'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host':
            'www.data5u.com',
            'Referer':
            'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            ip_address = re.compile(
                '<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>',
                re.S)
            re_ip_address = ip_address.findall(html[0])
            for address, port in re_ip_address:
                result = address + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

# Recently modified

    async def crawl_goubanjia(self):
        """全网ip"""
        urls = ['http://www.goubanjia.com']
        html = await get_page(urls)
        if html:
            proxies = []
            doc = pq(html[0])
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                ip_port = td.text().replace('\n', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_89ip(self):
        """89ip"""
        urls = [
            'http://www.89ip.cn/index_{}.html'.format(page)
            for page in range(1, 4)
        ]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                ips = doc('tr td:nth-child(1)').items()
                ports = doc('tr td:nth-child(2)').items()
                for ip, port in zip(ips, ports):
                    result = ip.text() + ':' + port.text()
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip181(self):
        """讯代理api接口"""
        urls = ['http://www.ip181.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            # NOTE: eval() on remote content is unsafe; json.loads would be preferable if the response is valid JSON
            json_ = eval(html[0])
            RESULT = json_.get('RESULT')
            for i in RESULT:
                ip = i.get('ip')
                port = i.get('port')
                result = ip + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_premproxy(self):
        """premproxy"""
        urls = [
            'https://premproxy.com/proxy-by-country/{}.htm'.format(country)
            for country in ('China-01', 'China-02', 'China-03', 'China-04',
                            'Taiwan-01')
        ]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address_port in re_ip_address:
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xroxy(self):
        """xroxy 换了网址不挂代理, 访问很慢"""
        urls = [
            'https://www.xroxy.com/proxy-country-{}'.format(country)
            for country in ('cn', 'tw')
        ]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address1 = re.compile(
                    '<td class="sorting_1">(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address1 = ip_address1.findall(page)
                print(re_ip_address1)
                ip_address2 = re.compile("<td>(\d[3-5])</td>")  # capture the port digits instead of the whole <td> tag
                re_ip_address2 = ip_address2.findall(page)
                print(re_ip_address2)
                for address, port in zip(re_ip_address1, re_ip_address2):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)
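
Every crawl_* coroutine above awaits a get_page helper that is not shown in these snippets. Judging from how it is called (a list of URLs in, a list of page bodies out, optional request headers via options), a minimal sketch could look like the following, assuming aiohttp; the project's real helper may differ:

import aiohttp

async def get_page(urls, options=None):
    """Fetch each URL and return the response bodies as a list of strings."""
    results = []
    async with aiohttp.ClientSession(headers=options) as session:
        for url in urls:
            try:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
                    if resp.status == 200:
                        results.append(await resp.text())
            except Exception:
                # Skip unreachable pages; callers check for an empty result with `if html:`
                continue
    return results
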
Example #23
class Getter():
    crawler_list = [
        "crawl_ip3366", "crawl_kuaidaili", "crawl_ip3366_new", "crawl_iphai",
        "crawl_data5u"
    ]

    def __init__(self):
        self.spider_log = logging.getLogger(GETTERLOGGER)
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self, mode=None):
        """
        判断是否达到了代理池限制
        """
        if mode is None:
            rediskey = REDIS_KEY
        else:
            rediskey = mode

        if self.redis.count(rediskey) >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        self.spider_log.info('Scheduled getter run starting')
        httpflag = 0
        httpsflag = 0
        if not self.is_over_threshold(REDIS_HTTP):
            httpflag = 1
        if not self.is_over_threshold(REDIS_HTTPS):
            httpsflag = 1
        try:
            if httpflag == 1 or httpsflag == 1:
                self.spider_log.info("获取器开始执行,http:" +
                                     str(self.redis.count(REDIS_HTTP)) +
                                     ";https:" +
                                     str(self.redis.count(REDIS_HTTPS)))
                # if True:
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Fetch proxies
                    if callback not in Getter.crawler_list:
                        continue
                    self.spider_log.info('Start fetching: ' + callback)
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if httpflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTP)
                    if httpsflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTPS)
            else:
                self.spider_log.info("获取器无需执行,http:" +
                                     str(self.redis.count(REDIS_HTTP)) +
                                     ";https:" +
                                     str(self.redis.count(REDIS_HTTPS)))

        except Exception as e:
            self.spider_log.error('Getter error: ' + str(e.args))
            self.spider_log.error('traceback:' + traceback.format_exc())
Example #24
# Built with ProxyMetaclass, so the class gets the __CrawlFunc__ and __CrawlFuncCount__ attributes
class Crawler(object, metaclass=ProxyMetaclass):
    def __init__(self):
        self.redis = RedisClient()

    def save_to_db(self, proxies):
        for proxy in proxies:
            sys.stdout.flush()
            print('Successfully fetched proxy', proxy)
            if not re.match("http\://\d+\.\d+\.\d+\.\d+\:\d+", proxy['https']):
                print('Proxy does not match the expected format', proxy['https'], 'discarding')
            else:
                self.redis.add(str(proxy))

    async def crawl_daili66(self):
        """获取代理66, 外国ip多"""
        urls = [
            'http://www.66ip.cn/{}.html'.format(page) for page in range(1, 5)
        ]
        print('Crawling')
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    ip_port = ':'.join([ip, port])
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

#     async def crawl_ip3366(self):
#         """云代理index"""
#         urls = ['http://www.ip3366.net/?stype=1&page={}'.format(page) for page in range(1, 5)]
#         html = await get_page(urls)
#         if html:
#             proxies = []
#             for page in html:
#                 find_tr = re.compile('<tr>(.*?)</tr>', re.S)
#                 trs = find_tr.findall(page)
#                 for s in range(1, len(trs)):
#                     find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
#                     re_ip_address = find_ip.findall(trs[s])
#                     find_port = re.compile('<td>(\d+)</td>')
#                     re_port = find_port.findall(trs[s])
#                     for address, port in zip(re_ip_address, re_port):
#                         address_port = address+':'+port
#                         ip_port = address_port.replace(' ','')
#                         proxy = {'https':'http://'+ip_port}
#                         proxies.append(proxy)
#             self.save_to_db(proxies)
#

    async def crawl_ip3366_(self):
        """云代理free"""
        urls = [
            'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            for page in range(1, 5)
        ]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(
                    '<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                # ip_port = [(re_ip_address[i] + ':' + re_port[i]).replace(' ', '') for i in range(len(re_port))]
                # proxies.append(ip_port)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)
Example #25
class Crawler(object, metaclass=ProxyMetaclass):  # built with ProxyMetaclass, so it has the __CrawlFunc__ and __CrawlFuncCount__ attributes
    def __init__(self):
        self.redis = RedisClient()

    def save_to_db(self, proxies):
        for proxy in proxies:
            sys.stdout.flush()
            print('Successfully fetched proxy', proxy)
            self.redis.add(proxy)

    async def crawl_daili66(self):
        """获取代理66, 外国ip多"""
        urls = ['http://www.66ip.cn/{}.html'.format(page) for page in range(1, 5)]
        print('Crawling')
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    ip_port = ':'.join([ip, port])
                    proxy = {'https':'http://'+ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366(self):
        """云代理index"""
        urls = ['http://www.ip3366.net/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(page)
                for s in range(1, len(trs)):
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address+':'+port
                        ip_port = address_port.replace(' ','')
                        proxy = {'https':'http://'+ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366_(self):
        """云代理free"""
        urls = ['http://www.ip3366.net/free/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                # ip_port = [(re_ip_address[i] + ':' + re_port[i]).replace(' ', '') for i in range(len(re_port))]
                # proxies.append(ip_port)
                for address, port in re_ip_address:
                    result = address+':'+ port
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_kuaidaili(self):
        """快代理(都是http的)"""
        urls = ['http://www.kuaidaili.com/free/inha/{}/'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(page)
                for address,port in zip(re_ip_address, re_port):
                    address_port = address+':'+port
                    ip_port = address_port.replace(' ','')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xicidaili(self):
        """西刺代理"""
        urls = ['http://www.xicidaili.com/nn/{}'.format(page) for page in range(1, 4)]
        headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie':'_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host':'www.xicidaili.com',
            'Referer':'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests':'1',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            for page in html:
                find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(page)
                for tr in trs:
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address,port in zip(re_ip_address, re_port):
                        address_port = address+':'+port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https':'http://'+ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_iphai(self):
        """ip海"""
        urls = ['http://www.iphai.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html[0])
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address,port in zip(re_ip_address, re_port):
                    address_port = address+':'+port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_data5u(self):
        """data5u"""
        urls = ['http://www.data5u.com/free/gngn/index.shtml']
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html[0])
            for address, port in re_ip_address:
                result = address + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

# Recently modified

    async def crawl_goubanjia(self):
        """全网ip"""
        urls = ['http://www.goubanjia.com']
        html = await get_page(urls)
        if html:
            proxies = []
            doc = pq(html[0])
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                ip_port = td.text().replace('\n', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_89ip(self):
        """89ip"""
        urls = ['http://www.89ip.cn/index_{}.html'.format(page) for page in range(1, 4)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                ips = doc('tr td:nth-child(1)').items()
                ports = doc('tr td:nth-child(2)').items()
                for ip, port in zip(ips, ports):
                    result = ip.text() + ':' + port.text()
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip181(self):
        """讯代理api接口"""
        urls = ['http://www.ip181.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            # NOTE: eval() on remote content is unsafe; json.loads would be preferable if the response is valid JSON
            json_ = eval(html[0])
            RESULT = json_.get('RESULT')
            for i in RESULT:
                ip = i.get('ip')
                port = i.get('port')
                result = ip + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_premproxy(self):
        """premproxy"""
        urls = ['https://premproxy.com/proxy-by-country/{}.htm'.format(country) for country in ('China-01','China-02','China-03','China-04','Taiwan-01')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address_port in re_ip_address:
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xroxy(self):
        """xroxy 换了网址不挂代理, 访问很慢"""
        urls = ['https://www.xroxy.com/proxy-country-{}'.format(country) for country in ('cn','tw')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address1 = re.compile('<td class="sorting_1">(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address1 = ip_address1.findall(page)
                print(re_ip_address1)
                ip_address2 = re.compile("<td>(\d[3-5])</td>")  # capture the port digits instead of the whole <td> tag
                re_ip_address2 = ip_address2.findall(page)
                print(re_ip_address2)
                for address,port in zip(re_ip_address1,re_ip_address2):
                    address_port = address+':'+port
                    ip_port = address_port.replace(' ','')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)