import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from lxml import etree

# RedisClient, Cookies_Pool and the REDIS_* / START_URL settings are expected to
# come from the project's own modules; their import paths are omitted here.


def get_task(funcc):
    """Pop one task URL from the Redis queue and hand it to `funcc`."""
    coll = RedisClient()
    task = None
    r_len = coll.queue_len
    if r_len == 0:
        print('you have already crawled all urls')
    else:
        try:
            task = coll.rpop()
            funcc(task)
        except Exception:
            # processing failed: push the task back so it is not lost
            if task is not None:
                coll.lpush(task)
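# Usage sketch (an assumption, not part of the original code): a consumer process
# could drain the queue by calling get_task in a loop with its own handler.
# `run_worker` and `handler` are hypothetical names used only for illustration.
def run_worker(handler):
    coll = RedisClient()
    while coll.queue_len:      # keep pulling until the Redis list is empty
        get_task(handler)
        time.sleep(0.5)        # small pause to stay under rate limits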
class MasterSpider:
    def __init__(self, start_url):
        # an instance-level visited set overflows memory once the data volume grows,
        # so deduplication is done per comment page instead
        # self.fi = set()
        self.headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
        # initialise the Redis connections
        self.start_url = start_url if start_url else START_URL
        self.db = RedisClient(REDIS_HOST, REDIS_PORT, REDIS_DB, REDIS_DOMAIN, REDIS_NAME)
        self.db1 = RedisClient(REDIS_HOST, REDIS_PORT, REDIS_DB, "weibo", "HomeUrl")
        self.db2 = RedisClient(REDIS_HOST, REDIS_PORT, REDIS_DB, "weibo", "lfid")

    def new_cookies(self):
        cookies = Cookies_Pool().parse_cookies()
        return cookies

    def parse(self, pass_url):
        # parse every user who commented under the current weibo post
        first_req = requests.get(pass_url + str(1), cookies=self.new_cookies()).content
        if 'not exist' in str(first_req):
            return None
        html = etree.HTML(first_req)
        # restore the comment page where the last run was interrupted
        try:
            with open('page_num.txt', 'r') as f:
                broken_page_num = int(f.readlines()[0]) + 1
        except Exception:
            broken_page_num = 1
        # total number of comment pages
        try:
            page_num = (html.xpath('//*[@id="pagelist"]/form/div/text()')[1].split('/')[1])[:-1]
        except Exception:
            # the page request failed, retry it
            return self.parse(pass_url=pass_url)
        for page in range(broken_page_num, int(page_num) + 1):
            print(page)
            if page % 5 == 0:
                # checkpoint the current position every five pages
                with open('page_num.txt', 'w') as f:
                    f.write(str(page))
            fi = set()
            cookies = self.new_cookies()
            try:
                req = requests.get(pass_url + str(page), cookies=cookies,
                                   headers={"User-Agent": UserAgent().random}).content
                html = etree.HTML(req)
                fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
                fans_name = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
            except Exception:
                # the request failed: keep retrying until the page comes back
                while True:
                    time.sleep(5)
                    try:
                        req = requests.get(pass_url + str(page),
                                           headers={"User-Agent": UserAgent().random},
                                           cookies=cookies).content
                        html = etree.HTML(req)
                        fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
                        fans_name = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
                        break
                    except Exception:
                        pass
            for i, j in enumerate(fans):
                # skip the "return" link at the bottom of the page
                if '5644764907' in j:
                    continue
                fans_url = 'https://weibo.cn/' + j.split('/u/')[1] + '/info'
                fans_weibo = 'https://weibo.cn' + j
                m_url = "https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{}".format(j.split('/u/')[1], j.split('/u/')[1])
                name = fans_name[i]
                if name not in fi:
                    fi.add(name)
                    self.db.lpush(fans_url)
                    self.db1.lpush(fans_weibo)
                    self.db2.lpush(m_url)
                    print('[+++][+++][+++]', name)
            # empirically the most efficient wait time against IP-based rate limiting
            time.sleep(0.35)
        # all comments of this post have been crawled: reset the checkpoint
        time.sleep(1)
        with open('page_num.txt', 'w') as f:
            f.write('0')

    def open_url(self):
        # get the number of pages of the weibo home timeline
        req = requests.get(url=self.start_url, cookies=self.new_cookies())
        html = etree.HTML(req.content)
        page_num = (html.xpath('//*[@id="pagelist"]/form/div/text()')[1].split('/')[1])[:-1]
        # only the first half of the home pages is crawled
        for page in range(1, int(int(page_num) / 2 + 1)):
            page_url = self.start_url.split('=')[0] + '=' + str(page)
            req = requests.get(url=page_url, cookies=self.new_cookies())
            soup = BeautifulSoup(req.content, 'lxml')
            # collect every post link on the current home page
            passages = soup.find_all(attrs={"class": "cc"})
            # walk through the comment section of each post
            for passage in passages:
                # record the current position so a crash can be resumed from here
                with open(r'current_url.txt', 'w') as f:
                    f.write(page_url + '\n' + passage.get('href'))
                pass_url = passage.get('href').replace('#cmtfrm', '&page=')
                self.parse(pass_url)
                time.sleep(120)
    def broken_start(self, start_url, passage_url):
        # get the number of pages of the weibo home timeline
        req = requests.get(url=self.start_url, cookies=self.new_cookies())
        html = etree.HTML(req.content)
        page_num = (html.xpath('//*[@id="pagelist"]/form/div/text()')[1].split('/')[1])[:-1]
        # resume from the home page where the last run was interrupted
        for page in range(int(start_url.split('=')[1]), int(int(page_num) / 2 + 1)):
            page_url = self.start_url.split('=')[0] + '=' + str(page)
            req = requests.get(url=page_url, cookies=self.new_cookies())
            soup = BeautifulSoup(req.content, 'lxml')
            # collect every post link on the current home page
            passages = soup.find_all(attrs={"class": "cc"})
            # locate the post that was being processed when the run stopped
            try:
                passage_list = [x.get('href') for x in passages]
                index = passage_list.index(passage_url.replace('\n', ''))
                print('index', str(index))
            except Exception as msg:
                print(msg)
                index = 0
            # walk through the comment section of each remaining post
            for passage in passages[index:]:
                with open(r'current_url.txt', 'w') as f:
                    f.write(page_url + "\n" + passage.get('href'))
                pass_url = passage.get('href').replace('#cmtfrm', '&page=')
                self.parse(pass_url)
                time.sleep(120)
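# Minimal driver sketch (an assumption, not part of the original source): start a
# fresh crawl from START_URL, or resume from the position saved in current_url.txt
# by open_url/broken_start above. File and variable names mirror what the class
# itself writes; nothing else is implied about the project's real entry point.
if __name__ == '__main__':
    spider = MasterSpider(START_URL)
    try:
        with open('current_url.txt', 'r') as f:
            saved_page_url, saved_passage_href = f.read().split('\n', 1)
        spider.broken_start(saved_page_url, saved_passage_href)
    except FileNotFoundError:
        spider.open_url()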