Example #1
0
    def parse_item(self, response):
        """Parse one listing page: yield an item per row, then the next page.

        Args:
            response: The downloaded page handed to this callback. Its
                ``status`` and ``body`` are read here, and it is wrapped in
                a ``Selector`` for the XPath queries.

        Yields:
            The result of ``self.gen_new_duanzi_item_by_selector`` for every
            ``<div class="row">`` found on the page, followed by the result
            of ``self.gen_next_request`` when a ``previous-comment-page``
            link is present.
        """
        if response.status != 200:
            # Dump the unexpected response for debugging. Parsing still
            # continues below, exactly as it did before.
            # TODO: yield handle_captcha(self, response)
            banner = '#' * 100
            print(banner)
            print(str(response.status))
            print(banner)
            print(str(response.body))
            print(banner)

        selector = Selector(response)

        for row in selector.xpath("//div[@class='row']"):
            # Re-wrap the row's markup as a standalone document before
            # handing it to the item builder.
            row_selector = Selector(text=row.extract())
            # Build each item exactly once; the builder used to be invoked
            # twice per row, with one result stored in a list nobody read.
            yield self.gen_new_duanzi_item_by_selector(row_selector)

        # Joining the extracted hrefs yields '' when no link matched.
        next_url = "".join(selector.xpath(
            "/html/body/div[@id='wrapper']/div[@id='body']/div[@id='content']/div[@id='comments']/div[@class='comments'][1]/div[@class='cp-pagenavi']/a[@class='previous-comment-page']/@href"
        ).extract())

        if next_url:
            yield self.gen_next_request(next_url)

        # Visual separator marking the end of this page in the console output.
        print(50 * '*')
def check_and_save_morethan100zan_duanzi(floor, content):
    """Store ``content`` once per ``floor`` in the module-level ``r`` store.

    Does nothing when either argument is ``None``. Otherwise, if ``floor``
    is not yet a field of the ``duanzi_more_than_100_zan_floor_hash`` hash,
    the field is set to ``True`` and ``content`` is pushed onto the list
    whose key is ``duanzi_more_than_100_zan_list_pre`` followed by the
    string returned by ``date_str_now_ymd()``.

    Args:
        floor: Hash field used to detect entries that were already stored.
        content: Value pushed onto the per-day list.

    Returns:
        None.
    """
    if floor is None or content is None:
        return

    day_suffix = date_str_now_ymd()

    already_recorded = r.hexists(duanzi_more_than_100_zan_floor_hash, floor)
    if already_recorded:
        return

    # Mark the floor as seen first, then append the content to the day's list.
    r.hset(duanzi_more_than_100_zan_floor_hash, floor, True)
    daily_list_key = duanzi_more_than_100_zan_list_pre + day_suffix
    r.lpush(daily_list_key, content)