def __init__(self, site):
    """Initialize the URL manager for one site.

    Opens a pooled Redis connection and derives the two Redis set key
    names (pending URLs / already-crawled URLs), namespaced by *site*.
    """
    # One pooled Redis connection shared by all operations of this manager.
    self.conn = redisPool.getRedis()
    # Per-site key names for the "new" (to crawl) and "old" (crawled) URL sets.
    self.new_url = '{}_new_urls'.format(site)
    self.old_url = '{}_old_urls'.format(site)
def __init__(self):
    """Initialize the proxy spider: Redis connection, proxy-scheme
    template, and the Redis set names for harvested vs. validated IPs."""
    # Redis connection used for every set operation below.
    self.connR = getRedis()
    # Scheme template; a concrete "host:port" is appended once a working
    # proxy IP has been obtained.
    self.proxies = {
        'http': 'http://',
        'https': 'https://',
    }
    self.ips = 'ip_new'      # Redis set: freshly crawled, not yet verified IPs
    self.ip_able = 'ip_able' # Redis set: verified, usable IPs
def able_ip():
    '''
    Pop proxies from the 'ip_able' Redis set until one is verified to work.

    Verification: fetch https://ip.cn/ through the candidate proxy and check
    that the IP the page reports equals the proxy's own host — i.e. the
    proxy really forwarded the request.

    :return: the verified proxy IP (host only, no port) as a string
    '''
    connR = getRedis()
    url = 'https://ip.cn/'
    while True:
        proxy = connR.spop('ip_able')
        # NOTE(review): control() runs after spop; presumably it keeps
        # 'ip_able' topped up — confirm it protects against spop returning
        # None on an empty set, since split() below would fail on None.
        control()
        go_ip = proxy.split(':')[0]  # host part only; port is not needed
        proxy_test = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
        try:
            res = requests.get(url, headers=get_agent(), proxies=proxy_test,
                               timeout=2)
        except requests.RequestException:
            # Dead or slow proxy: discard it and try the next candidate
            # instead of letting the exception kill the validation loop.
            continue
        if res.status_code == 200:
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            script = soup.find('div', 'container-fluid').find_next('script').string
            # Dots escaped: the previous r'\d+.\d+.\d+.\d+' let '.' match
            # any character, so non-IP text could be captured.
            ip = re.findall(r'\d+\.\d+\.\d+\.\d+', script)[0]
            print('爬取到的ip值是:', ip, '当前代理ip是', go_ip)
            if ip == go_ip:
                print('########测试通过,当前ip是', go_ip)
                return ip
#encoding=utf-8
from django.test import TestCase

# Create your tests here.
from db import DataClass

# Ad-hoc manual test script: exercise the Mongo/Redis wrappers and dump
# the query result for one company.
db_kanzhun = DataClass.DataClass('kanzhunDB')
company = '荔枝微课'

from db.redisPool import getRedis
db = getRedis()

db_boss = DataClass.DataClass('bossDB', 'bossDB')

# Fetch the company's 'company' document, projecting only interviewDegree.
query = {'title': 'company'}
projection = {'interviewDegree': 1}
cursor = DataClass.DataClass('kanzhunDB').connM1[company].find(query, projection)
data_boss = list(cursor)

print('列表:&&&&&&&&&&&&&7', data_boss)
for i in data_boss:
    print(i)
#coding=utf-8
'''
*************************
file: AnalysisJobs  IPValidate
author: gongyi
date: 2019/7/19 11:41
****************************
change activity: 2019/7/19 11:41
'''
# Validate whether harvested proxy IPs are usable.
import telnetlib

from db.redisPool import getRedis
from job.spider.spiderHelper import get_agent

connR = getRedis()


def get_ip():
    '''
    Pop one candidate IP from the 'ip_new' Redis set.

    Side effect: when fewer than 10 candidates remain, starts the crawler
    to fetch more before popping.

    :return: one "host:port" string from 'ip_new' (None when the set is empty)
    '''
    count = connR.scard('ip_new')
    print('当前有{}个待测ip'.format(count))
    # Reuse the count fetched above instead of issuing a second scard()
    # round-trip to Redis (the previous code called scard twice).
    if count < 10:  # too few candidates left: asynchronously refill the pool
        print('待测ip数量不足,开始爬取待测ip')
        # NOTE(review): `pro` is not defined anywhere in this module's
        # visible code — presumably a crawler process/greenlet created
        # elsewhere; confirm it is in scope before this runs.
        pro.start()
    ip = connR.spop('ip_new')
    return ip