class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its upper limit.
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies with this crawl function
                proxies = self.crawler.get_proxies(callback)
                print(proxies)
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the number of proxies is over the threshold."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started.')
        if PRIVATE_PROXY_ENABLE:
            proxies = PrivateProxy().get_proxies()
            for proxy in proxies:
                print('Add private proxy {}'.format(proxy))
                self.redis.add(proxy)
        else:
            if not self.is_over_threshold():
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    for proxy in proxies:
                        self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its upper limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # __CrawlFuncCount__ is the number of crawl_-prefixed methods
            # collected by the metaclass; __CrawlFunc__ lists their names.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter(object):
    """Proxy IP getter."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_limit(self):
        """
        Check whether the pool already holds the maximum number of proxies.
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """
        Run every crawl_-prefixed method collected by the metaclass, in order.
        :return:
        """
        print('Getter started, crawling free proxies')
        if not self.is_over_limit():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # Execute the proxy-fetching function
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy=proxy)

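# The __CrawlFunc__ / __CrawlFuncCount__ attributes used by the run() methods
# are produced by a metaclass that is not shown in these snippets. A minimal
# sketch, assuming ProxyMetaclass simply collects every attribute whose name
# starts with "crawl_":
class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        crawl_funcs = [key for key in attrs if key.startswith('crawl_')]
        attrs['__CrawlFunc__'] = crawl_funcs            # method names, in definition order
        attrs['__CrawlFuncCount__'] = len(crawl_funcs)  # number of crawl methods
        return type.__new__(mcs, name, bases, attrs)
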
class GetterProxy(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its upper limit.
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_index in range(Crawler.__CrawlFuncCount__):
                # Look up the crawl method
                callback = self.crawler.__CrawlFunc__[callback_index]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                # Store the proxies
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its upper limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                # print() output is normally buffered; flush stdout so the
                # progress lines appear immediately instead of waiting for
                # the buffer to fill.
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)

def handle_getter(self):
    """Crawl proxies continuously and store them."""
    crawler = Crawler()
    client = RedisClient()
    while True:
        for proxy in crawler.start_crawl():
            client.add(proxy)
        sleep(20)

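# handle_getter above is an endless loop, so in practice it would run in its
# own process alongside the tester/API parts of the pool. Illustrative sketch
# only: `Scheduler` is a hypothetical class that defines handle_getter() as a
# method; the 20-second pause comes from the snippet above.
from multiprocessing import Process

def start_getter_process():
    scheduler = Scheduler()  # hypothetical owner of handle_getter()
    getter_process = Process(target=scheduler.handle_getter)
    getter_process.start()
    return getter_process
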
class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def run(self):
        if self.redis.count() < POOL_UPPER_THRESHOLD:
            # Iterate over the crawl function names directly instead of
            # indexing into __CrawlFunc__ with __CrawlFuncCount__.
            for crawl_func in self.crawler.__CrawlFunc__:
                proxies = self.crawler.start_crawl_func(crawl_func)
                print(crawl_func, 'is crawling proxies')
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
            proxy_sum = self.redis.count()
            print('Current number of proxies:', proxy_sum)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def log(self):
        if not os.path.exists('log2'):
            os.mkdir('log2')
        log_file_name = 'log2/' + LOG_PATH
        log_file_1 = logging.FileHandler(log_file_name, 'a', encoding='utf-8')
        fmt = logging.Formatter(
            fmt="%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s")
        log_file_1.setFormatter(fmt)
        logger1 = logging.Logger('run_log', level=logging.DEBUG)
        logger1.addHandler(log_file_1)
        return logger1

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its upper limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """Store newly crawled proxies in Redis with their initial score."""
        print('Getter started')
        if not self.is_over_threshold():
            try:
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if not proxies:
                        self.log().error('Proxy crawl failed, crawl function: %s' % callback)
                        continue
                    for proxy in proxies:
                        self.redis.add(proxy)
            except Exception as e:
                self.log().exception(e)

class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        # Return True once the pool has reached its maximum size.
        if self.redis.get_count() >= MAX_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        if not self.is_over_threshold():
            for i in range(self.crawler.__CrawlCount__):
                proxies = self.crawler.get_proxies(self.crawler.__CrawlFunc__[i])
                for proxy in proxies:
                    self.redis.add(proxy)

class ValidityTester(object):
    """Check whether proxies are usable and save the good ones."""

    def __init__(self):
        self._raw_proxies = None

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._redis = RedisClient()

    async def check_single_proxy(self, proxy):
        """Check a single proxy."""
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        real_proxy = 'http://' + proxy
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(TEST_API, proxy=real_proxy,
                                           timeout=PROXY_TIMEOUT) as response:
                        print('Check proxy', proxy)
                        if response.status == 200:
                            self._redis.add(proxy)
                            print('Add to redis', proxy)
                except (ProxyConnectionError, TimeoutError):
                    print('Do not add proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError,
                ClientConnectorError, Exception) as s:
            print(s)

    def check_some_proxies(self):
        """
        Run an event loop that checks every proxy in _raw_proxies.
        Does nothing if _raw_proxies is empty or None.
        """
        if not self._raw_proxies:
            return
        try:
            print('Checking raw proxies')
            loop = asyncio.get_event_loop()
            tasks = [self.check_single_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except Exception:
            print('check_some_proxies error')

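# Minimal usage sketch for ValidityTester. The candidate list here is a
# placeholder; in a real pool it would come from a crawler or a separate
# "raw proxies" queue, which is not shown in this section.
tester = ValidityTester()
candidate_proxies = ['127.0.0.1:8080', '10.0.0.2:3128']  # placeholder values
tester.set_raw_proxies(candidate_proxies)
tester.check_some_proxies()  # runs the asyncio event loop over every candidate
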
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = CrawlerProxy()

    def is_over_threshold(self):
        if self.redis.count() > POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('***** Getter started *****')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback=callback)
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started!')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    # Check whether the proxy pool has reached its upper limit
    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for index in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[index]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter:
    def __init__(self):
        self._conn = RedisClient()
        self._crawler = Crawler()

    def is_over_threshold(self):
        if self._conn.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_index in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_index]
                proxies = self._crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self._conn.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its upper limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # Loop over the registered crawl functions by index.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                try:
                    # get_proxies iterates the crawler generator and returns
                    # the collected proxies as a list.
                    proxies = self.crawler.get_proxies(callback)
                except Exception:
                    print('\033[1;31;40mCrawler error...\033[0m')
                    print(f'Crawler {callback} raised an error and needs debugging')
                    # Skip this crawler; otherwise `proxies` would be undefined below.
                    continue
                # Flush stdout so the output appears continuously.
                sys.stdout.flush()
                # Add each proxy to the database.
                for proxy in proxies:
                    try:
                        self.redis.add(proxy)
                    except OSError as e:
                        print(f'\033[1;31;40mError occurred... {e}\033[0m')

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its upper limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    if self.first_test(proxy):
                        # The test passed, so queue the proxy
                        print('Adding proxy:', proxy)
                        self.redis.add(proxy)

    def first_test(self, proxy):
        """Pre-screen a proxy with one test request before storing it."""
        print('Screening proxy:', proxy)
        proxies = {
            'http': 'http://{}'.format(proxy),
        }
        try:
            r = requests.get(TEST_URL, proxies=proxies, timeout=4)
            if r.status_code == 200:
                return True
        except requests.RequestException:
            print('Test failed, discarding proxy', proxy)
        return False

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_LIMIT:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()  # flush buffered output so crawl progress shows immediately
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter(object):
    def __init__(self):
        """Initialise the storage client and the crawler."""
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_full(self):
        """Check whether the number of proxies has reached the maximum."""
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        logger.info('Getter started......')
        if not self.is_full():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Judge whether the threshold has been reached."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Start execution')
        # Only crawl while the pool is still below the threshold.
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its upper limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)

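# Every Getter variant above assumes a RedisClient that exposes at least
# add() and count(); that class is not included in this section. A minimal
# sketch, assuming a Redis sorted set: REDIS_KEY and INITIAL_SCORE are
# illustrative names, not taken from the snippets.
import redis

REDIS_KEY = 'proxies'   # assumed sorted-set key
INITIAL_SCORE = 10      # assumed starting score for a new proxy

class RedisClient(object):
    def __init__(self, host='localhost', port=6379, password=None):
        self.db = redis.StrictRedis(host=host, port=port, password=password,
                                    decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        """Store a proxy with the initial score unless it is already present."""
        if not self.db.zscore(REDIS_KEY, proxy):
            return self.db.zadd(REDIS_KEY, {proxy: score})

    def count(self):
        """Return the number of proxies currently in the pool."""
        return self.db.zcard(REDIS_KEY)
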
class Crawler(object, metaclass=ProxyMetaclass):
    # Built with ProxyMetaclass, which provides the __CrawlFunc__ and
    # __CrawlFuncCount__ attributes.
    def __init__(self):
        self.redis = RedisClient()

    def save_to_db(self, proxies):
        for proxy in proxies:
            sys.stdout.flush()
            print('Fetched proxy', proxy)
            self.redis.add(proxy)

    async def crawl_daili66(self):
        """daili66; many foreign IPs."""
        urls = ['http://www.66ip.cn/{}.html'.format(page) for page in range(1, 5)]
        print('Crawling')
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    ip_port = ':'.join([ip, port])
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366(self):
        """ip3366 cloud proxy, index pages."""
        urls = ['http://www.ip3366.net/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(page)
                for s in range(1, len(trs)):
                    find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile(r'<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https': 'http://' + ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366_(self):
        """ip3366 cloud proxy, free pages."""
        urls = ['http://www.ip3366.net/free/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_kuaidaili(self):
        """kuaidaili (HTTP proxies only)."""
        urls = ['http://www.kuaidaili.com/free/inha/{}/'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(r'<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                port = re.compile(r'<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(page)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xicidaili(self):
        """xicidaili."""
        urls = ['http://www.xicidaili.com/nn/{}'.format(page) for page in range(1, 4)]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            for page in html:
                find_trs = re.compile(r'<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(page)
                for tr in trs:
                    find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile(r'<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https': 'http://' + ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_iphai(self):
        """iphai."""
        urls = ['http://www.iphai.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html[0])
            for s in range(1, len(trs)):
                find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_data5u(self):
        """data5u."""
        urls = ['http://www.data5u.com/free/gngn/index.shtml']
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            ip_address = re.compile(
                r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html[0])
            for address, port in re_ip_address:
                result = address + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    # Recently modified
    async def crawl_goubanjia(self):
        """goubanjia (whole-network IPs)."""
        urls = ['http://www.goubanjia.com']
        html = await get_page(urls)
        if html:
            proxies = []
            doc = pq(html[0])
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                ip_port = td.text().replace('\n', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_89ip(self):
        """89ip."""
        urls = ['http://www.89ip.cn/index_{}.html'.format(page) for page in range(1, 4)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                ips = doc('tr td:nth-child(1)').items()
                ports = doc('tr td:nth-child(2)').items()
                for ip, port in zip(ips, ports):
                    result = ip.text() + ':' + port.text()
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip181(self):
        """ip181 (xdaili API endpoint)."""
        urls = ['http://www.ip181.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            json_ = eval(html[0])
            RESULT = json_.get('RESULT')
            for i in RESULT:
                ip = i.get('ip')
                port = i.get('port')
                result = ip + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_premproxy(self):
        """premproxy."""
        urls = ['https://premproxy.com/proxy-by-country/{}.htm'.format(country)
                for country in ('China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(r'<td data-label="IP:port ">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address_port in re_ip_address:
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xroxy(self):
        """xroxy; URL has changed, very slow to reach without a proxy."""
        urls = ['https://www.xroxy.com/proxy-country-{}'.format(country)
                for country in ('cn', 'tw')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address1 = re.compile(r'<td class="sorting_1">(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address1 = ip_address1.findall(page)
                print(re_ip_address1)
                ip_address2 = re.compile(r'<td>\d[3-5]</td>')
                re_ip_address2 = ip_address2.findall(page)
                print(re_ip_address2)
                for address, port in zip(re_ip_address1, re_ip_address2):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

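# All crawl_ methods await a shared get_page(urls, options=None) helper that
# is not shown in this section. A minimal sketch based on aiohttp, matching
# the call sites above (list of URLs in, list of HTML strings out); the
# timeout and error handling are assumptions, not the original implementation.
import asyncio
import aiohttp

async def get_page(urls, options=None):
    headers = options or {}
    pages = []
    async with aiohttp.ClientSession(headers=headers) as session:
        for url in urls:
            try:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
                    if resp.status == 200:
                        pages.append(await resp.text())
            except (aiohttp.ClientError, asyncio.TimeoutError):
                # Skip URLs that fail; the crawl methods tolerate missing pages.
                continue
    return pages
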
class Getter():
    crawler_list = [
        "crawl_ip3366", "crawl_kuaidaili", "crawl_ip3366_new", "crawl_iphai",
        "crawl_data5u"
    ]

    def __init__(self):
        self.spider_log = logging.getLogger(GETTERLOGGER)
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self, mode=None):
        """Check whether the given pool has reached its upper limit."""
        if mode is None:
            rediskey = REDIS_KEY
        else:
            rediskey = mode
        if self.redis.count(rediskey) >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        self.spider_log.info('Scheduled getter run started')
        httpflag = 0
        httpsflag = 0
        if not self.is_over_threshold(REDIS_HTTP):
            httpflag = 1
        if not self.is_over_threshold(REDIS_HTTPS):
            httpsflag = 1
        try:
            if httpflag == 1 or httpsflag == 1:
                self.spider_log.info('Getter running, http:' +
                                     str(self.redis.count(REDIS_HTTP)) + ';https:' +
                                     str(self.redis.count(REDIS_HTTPS)))
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Only run the whitelisted crawl functions
                    if callback not in Getter.crawler_list:
                        continue
                    self.spider_log.info('Fetching from: ' + callback)
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if httpflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTP)
                    if httpsflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTPS)
            else:
                self.spider_log.info('Getter skipped, http:' +
                                     str(self.redis.count(REDIS_HTTP)) + ';https:' +
                                     str(self.redis.count(REDIS_HTTPS)))
        except Exception as e:
            self.spider_log.error('Getter error: ' + str(e.args))
            self.spider_log.error('traceback:' + traceback.format_exc())

class Crawler(object, metaclass=ProxyMetaclass):
    # Built with ProxyMetaclass, which provides the __CrawlFunc__ and
    # __CrawlFuncCount__ attributes.
    def __init__(self):
        self.redis = RedisClient()

    def save_to_db(self, proxies):
        for proxy in proxies:
            sys.stdout.flush()
            print('Fetched proxy', proxy)
            if not re.match(r'http://\d+\.\d+\.\d+\.\d+:\d+', proxy['https']):
                print('Malformed proxy', proxy['https'], 'discarded')
            else:
                self.redis.add(str(proxy))

    async def crawl_daili66(self):
        """daili66; many foreign IPs."""
        urls = ['http://www.66ip.cn/{}.html'.format(page) for page in range(1, 5)]
        print('Crawling')
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    ip_port = ':'.join([ip, port])
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    # The ip3366 index-page crawler (crawl_ip3366) is disabled in this
    # variant; only the free-page crawler below is active.

    async def crawl_ip3366_(self):
        """ip3366 cloud proxy, free pages."""
        urls = ['http://www.ip3366.net/free/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

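# The crawl_ methods are coroutines, so a driver has to run them on an event
# loop; each crawl method stores its results itself via save_to_db.
# Illustrative only: this is not the get_proxies() used by the Getter classes.
import asyncio

def run_crawl(method_name='crawl_daili66'):
    crawler = Crawler()                        # the class defined above
    coroutine = getattr(crawler, method_name)()
    asyncio.get_event_loop().run_until_complete(coroutine)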