def parse_item(self, response):
    """Parse one listing page; yield its items and the follow-up request.

    This is a generator. It yields, in order:

    * the result of ``handle_captcha(self, response)`` when the response
      status is not 200 (after dumping the status and body to stdout);
    * one ``self.gen_new_duanzi_item_by_selector(...)`` result for every
      ``<div class="row">`` element found in the page;
    * ``self.gen_next_request(next_url)`` when the page contains a
      ``previous-comment-page`` link.

    :param response: the downloaded page; ``status`` and ``body`` are read
        directly and the object is handed to ``Selector``.
    """
    if response.status != 200:
        banner = '#' * 100
        print(banner)
        print(str(response.status))
        print(banner)
        print(str(response.body))
        print(banner)
        # TODO: parsing still continues below after the captcha hand-off;
        # decide whether a non-200 page should stop here instead.
        yield handle_captcha(self, response)

    selector = Selector(response)

    for row in selector.xpath("//div[@class='row']"):
        # Re-parse the row's own markup as a standalone document before
        # handing it to the item builder (presumably so the builder's XPath
        # expressions only see this row -- confirm against the helper).
        row_selector = Selector(text=row.extract())
        # Build the item once; the previous code built it twice and threw
        # the first copy away in a list that was never read.
        yield self.gen_new_duanzi_item_by_selector(row_selector)

    next_link_xpath = (
        "/html/body/div[@id='wrapper']/div[@id='body']/div[@id='content']"
        "/div[@id='comments']/div[@class='comments'][1]"
        "/div[@class='cp-pagenavi']/a[@class='previous-comment-page']/@href"
    )
    # extract() returns a list of matches; joining yields "" when the link
    # is absent, which the truthiness check below treats as "no next page".
    next_url = "".join(selector.xpath(next_link_xpath).extract())
    if next_url:
        yield self.gen_next_request(next_url)

    print(50 * '*')
def check_and_save_morethan100zan_duanzi(floor, content):
    """Store ``content`` the first time its ``floor`` is seen.

    Nothing happens when either argument is ``None``. Otherwise ``floor`` is
    looked up as a field of the hash named by
    ``duanzi_more_than_100_zan_floor_hash``; if it is already present the
    call is a no-op. If it is absent, the field is set to ``True`` and
    ``content`` is left-pushed onto the list whose key is
    ``duanzi_more_than_100_zan_list_pre`` followed by the string returned by
    ``date_str_now_ymd()``.

    ``r`` is a module-level object exposing ``hexists``/``hset``/``lpush``
    (presumably a Redis client -- confirm where it is created).

    :param floor: identifier used as the hash field for de-duplication.
    :param content: value appended to the per-day list.
    :return: ``None`` in every case.
    """
    if floor is None or content is None:
        return

    day_stamp = date_str_now_ymd()

    already_recorded = r.hexists(duanzi_more_than_100_zan_floor_hash, floor)
    if already_recorded:
        return

    # Mark the floor as seen first, then queue the content under today's key.
    r.hset(duanzi_more_than_100_zan_floor_hash, floor, True)
    daily_list_key = duanzi_more_than_100_zan_list_pre + day_stamp
    r.lpush(daily_list_key, content)