def run(self):
    """Crawl forum list pages and, for each thread, every page of its replies.

    Iterates list pages from ``self.start_page`` up to ``self.page`` (the
    total is re-read from each list page).  For every thread found, fetches
    the first detail page, then all remaining detail pages, persisting each
    via ``self.save_tie_reply``.  ``self.url`` is advanced to the next list
    page at the end of each iteration.
    """
    while self.start_page < self.page:
        # Fetch the current list page
        response = parse_url(self.url, self.headers)
        # Strip the bogus encoding declaration so the response parses cleanly
        response = re.sub('encoding="UTF-8.36kr"', '', response)
        # Parse the list page
        title_list, content_url_list, information_list, next_page_url_list, page_max = \
            self.parse_response(response)
        # Total number of list pages, refreshed on every list page
        self.page = int(page_max[0])
        for i in range(len(title_list)):
            # First detail page of this thread
            tie_response = self.get_tie_content(content_url_list[i], 0)
            tie_content, author, photo_url, information, reply_count, \
                tie_next_page_url_list, tie_page_max = self.parse_tie_content(tie_response)
            self.save_tie_reply(title_list[i], tie_content, author, photo_url,
                                information, reply_count)
            print('帖子第1页以爬完')
            p = 1
            pp = 2
            # Remaining detail pages; tie_page_max from the first page bounds the loop
            while p < tie_page_max:
                tie_response = self.get_tie_content(content_url_list[i], p)
                # BUGFIX: the original unpacked the last field into ``page_max``,
                # clobbering the list-page counter above; discard it instead.
                tie_content, author, photo_url, information, reply_count, \
                    tie_next_page_url_list, _ = self.parse_tie_content(tie_response)
                print('帖子第%d页以爬完' % pp)
                pp += 1
                p += 1
                self.save_tie_reply(title_list[i], tie_content, author, photo_url,
                                    information, reply_count)
        # Advance to the next list page
        self.url = 'http://jump.bdimg.com/mo/q----,sz@320_240-1-3---/m?kw=%E6%9F%AF%E5%8D%97&lp=7202'
        self.url += next_page_url_list[0]
        print('第%d页爬完' % self.start_page)
        self.start_page += 1
def run(self):
    """Endlessly fetch the hot-news feed, parse it, and persist each batch."""
    while True:
        # parse_url is a thin requests wrapper: url + headers -> response body
        raw = parse_url(self.url, self.headers)
        parsed = self.parse_response(raw)
        self.download(parsed)
        print('已下载15条热点')
def get_tie_content(self, content_url, i):
    """Fetch one page of a thread's detail view.

    ``i`` is the zero-based page index; replies are paginated 30 per page
    through the ``pn`` query parameter appended to ``self.pares_url``.
    """
    offset = int(i) * 30
    # Build the paginated detail-page URL
    page_url = self.pares_url + content_url + '&pn=' + str(offset)
    # Fetch, then neutralise the bogus encoding declaration before parsing
    raw = parse_url(page_url, self.headers)
    return re.sub('encoding="UTF-8.36kr"', ' ', raw)
def run(self):
    """Crawl all result pages: fetch, parse, download, then follow pagination."""
    # parse_url is a thin requests wrapper: url + headers -> response body
    body = parse_url(self.url, self.headers)
    # Extract the records and the total page count
    records, total_page = self.parse_content(body)
    self.download(records)
    print('第1页爬取完毕')
    page = 1
    # Walk the remaining pages, rebuilding self.url each time
    while page < int(total_page):
        self.get_url(page)
        body = parse_url(self.url, self.headers)
        records, total_page = self.parse_content(body)
        self.download(records)
        page += 1
        print('第%d页爬取完毕' % page)
def run(self):
    """Crawl every category, paging through each listing 18 items at a time."""
    classify_list, original_list = get_classify(self.classify_url, self.headers)
    print(classify_list)
    for classify in classify_list:
        print('开始爬取%s分类的数据' % classify)
        self.get_url(classify)
        print(self.url)
        response_str = parse_url(self.url, self.headers)
        response_dict, total = self.parse_content(response_str)
        self.download(response_dict)
        print("第1页爬完..")
        # Offset of the next page; the listing serves 18 items per page
        i = 18
        while i < total:
            self.get_url(classify, i)
            response_str = parse_url(self.url, self.headers)
            response_dict, total = self.parse_content(response_str)
            self.download(response_dict)
            i += 18
            # BUGFIX: page number via integer division; the original ``i / 18``
            # produced a float that only printed correctly because %d truncates.
            j = i // 18
            print("第%d页爬完" % j)
def get_classify(classify_url, headers):
    """Fetch the category index page and extract the TV-category slugs.

    Args:
        classify_url: URL of the category index page.
        headers: HTTP headers passed to ``parse_url``.

    Returns:
        (classify_list, original_list): ``original_list`` holds the raw slugs
        taken from the ``/tv/<slug>`` hrefs; ``classify_list`` holds the same
        slugs translated to the names the search API expects.
    """
    # Site slug -> API keyword; slugs not listed here pass through unchanged.
    aliases = {
        'british': 'english',
        'korean': 'korean_drama',
        'chinese': 'domestic',
        'tvshow': 'variety',
    }
    classify_list = []
    original_list = []
    response = parse_url(classify_url, headers)
    # Parse the HTML, e.g. /html/body/div[2]/div[1]/section[5]/div/ul/li[1]/a
    selector = etree.HTML(response)
    hrefs = selector.xpath('// ul [ @class = "type-list"] / li / a//@href')
    for href in hrefs:
        match = re.match(r'/tv/(\w+)', href)
        if match is None:
            # Defensive: skip links that are not /tv/<slug> instead of
            # crashing with AttributeError as the original did.
            continue
        classify = match.group(1)
        original_list.append(classify)
        classify_list.append(aliases.get(classify, classify))
    return classify_list, original_list
def run(self):
    """Fetch the listing once, parse it, and print every parsed record."""
    # parse_url is a thin requests wrapper: url + headers -> response body
    response = parse_url(self.url, self.headers)
    content_list = self.parse_response(response)
    # Iterate the records directly instead of indexing via range(len(...))
    for item in content_list:
        print(item)