class Parser: def __init__(self): self.list = list() self.comments = [] # 评论结果 self.comments_userurl = [] # 从评论里面爬取到的用户url self.likes = [] # 赞爬取结果 self.queue = Queue() # 爬取队列对象 self.db = DB() def parse_numberOfTweet(self, html): # 获取每一页微博的url dic = {} soup = BeautifulSoup(html, 'lxml') base_url = soup.find('td').contents[0]['href'] url = re.search('/[\d*]{1,}', base_url).group() + "?page=" base_page = soup.find('span', attrs={"class": "tc"}).text page_amount = int( re.search('[\d]{1,}', base_page, re.M | re.I).group()) / 10 + 1 dic["url"] = "https://weibo.cn/u" + url dic["page_amount"] = page_amount self.queue.add_url_tweets(dic) # TODO:解析整个页面 def parse_fpage(self, html): # 获取评论页面url soup = BeautifulSoup(html, 'lxml') for i in soup.find_all('a', attrs={"class": "cc"}): if "原文评论" in i.text: pass else: dic = {} dic["url"] = re.search('https://[a-z]{1,}.cn/comment/.*[?]', i["href"], re.M | re.I).group() + "page=" num = int(re.search('[\d]+', i.text).group()) if num == 0: pass else: dic["page_amount"] = num / 10 + 1 print(dic) self.queue.add_url_comments(dic) # TODO:解析评论页面 def parse_comments(self, html): # 获取评论页面的内容 soup = BeautifulSoup(html, "lxml") self.comments = [] # 将评论list清空 self.comments_userurl = [] # 把用户urllist清空 like_dic = {} comments_html = soup.find_all('div', id=re.compile("C_")) host_name = soup.find("a", attrs={"class": ""}).text[2:-3] like_soup = soup.find('a', attrs={'href': re.compile('/atti')}) like_dic['url'] = "https://weibo.cn" + str( re.search("/attitu[\w]{1,}/[\w]{1,}\?", like_soup['href']).group()) + "page=" like_dic['page_amount'] = int( re.search('[\d]+', like_soup.text).group()) / 10 + 1 print(like_dic) self.queue.add_url_likes(like_dic) for content in comments_html: comment_dict = dict() comment_dict["comment_content"] = content.find(attrs={ "class": "ctt" }).text # 获取评论内容 comment_dict["comment_username"] = content.find( "a").text # 获取评论者的昵称 comment_user_url = content.find("a").attrs["href"] # 获取评论者的链接 self.queue.add_url_host("https://weibo.cn" + comment_user_url) # 取评论用户url做为爬取队列 print(comment_dict, comment_user_url) self.comments.append(comment_dict) self.db.insert(host_name, self.comments) # 插入数据库 # TODO:解析点赞页面 def parse_likes(self, html): # 获取点赞页面的内容 self.likes = [] likes = [] soup = BeautifulSoup(html, "lxml") host_name = soup.find("div", attrs={"class": ""}).text.split(":")[0] n = 0 for i in soup.find_all(attrs={"class": "ct"}): n += 1 if n == 1: continue else: likes.append(i.previous_sibling.previous_sibling.text) self.db.insert_likes(host_name, likes)