Example #1
    def parse(self, response):
        _root = response.xpath("//tbody/tr")
        for _tr in _root:
            # post creation date
            _type = _tr.xpath("th/em/a/text()").extract_first()
            title = _tr.xpath("th/a/text()").extract_first()
            cdate = _tr.xpath("td[@class='by']/em/span/text()").extract_first()
            if cdate is None:
                self.info("cdate is None")
                #self.debug(_tr.extract())
                continue
            post_date = timeutil.strdt_datetime(cdate, ft='%Y-%m-%d %H:%M')
            if post_date > self.last_date:
                try:
                    # re_first() returns None when the pattern does not match,
                    # so the None check below can actually fire
                    # (.re(...).pop() would raise IndexError instead)
                    uid = _tr.xpath("td[@class='by']/cite/a/@href").re_first(r"uid=(\d+)")
                    tid = _tr.xpath("th/a/@href").re_first(r"tid=(\d+)")
                    if uid is None or tid is None:
                        self.info("title:%s %s; uid or tid is None" % (_type, title))
                        #self.debug(_tr.extract())
                        continue
                    _thread_url = self.url_pattern % {'tid': tid, 'uid': uid}
                    url = response.urljoin(_thread_url)
                    #self.info("wait crawl url:%s from %s" % (url, response.url))
                    yield scrapy.Request(url, callback=self.parse_articles_follow_next_page,
                                         meta={'postdate': post_date})
                except Exception:
                    util.exc_info()
                    self.debug(_tr.extract())
        nextpage = response.xpath("//a[@class='nxt']/@href").extract_first()
        if nextpage is not None:
            url = response.urljoin(nextpage)
            self.info("list url -> %s" % url)
            yield scrapy.Request(url, callback=self.parse)
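
The `timeutil` helper used above is project code that isn't shown in this excerpt. A minimal stand-in, assuming `strdt_datetime` simply wraps `datetime.strptime` with the format string passed as `ft`, might look like this (only the name and the `ft` keyword are taken from the calls above; everything else is an assumption):

# Hypothetical sketch of the timeutil module assumed by parse();
# only the function name and the ft keyword come from the original calls.
from datetime import datetime

def strdt_datetime(value, ft='%Y-%m-%d %H:%M'):
    # strptime raises ValueError on malformed input; parse() only calls
    # this after checking that the extracted date string is not None
    return datetime.strptime(value, ft)
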
Example #2
    def parse_articles_follow_next_page(self, response):
        _item = crawldata()
        _item['url'] = response.url
        
        _title = response.xpath("//span[@id='thread_subject']/text()").extract_first()
        _item['title'] = _title
        _tag = response.xpath("//h1[@class='ts']/a/text()").extract_first()
        _item['tag'] = _tag
        try:
            _item['postdate'] = response.meta['postdate']
        except Exception:
            util.exc_info()
        _root = response.xpath("//div[@id='postlist']/div[starts-with(@id,'post_')]"
                               "/table/tr/td[@class='plc']/div[@class='pct']"
                               "/div[@class='pcb']/div[@class='t_fsz']")
        _message = []
        for _root_item in _root:
            _second_root = _root_item.xpath("table/tr/td/child::node()")
            for _second_item in _second_root:
                # text nodes cannot be queried further, so extract_first()
                # returns None for them; element nodes return their tag name
                _node_type = _second_item.xpath("name()").extract_first()
                if _node_type is None:
                    # plain text node: keep its content verbatim
                    _message.append(_second_item.extract())
                    _message.append("\n")
                elif _node_type == "ignore_js_op":
                    # attachment wrapper: the real image URL is in the @file attribute
                    _img_url = _second_item.xpath("div//img/@file").extract_first()
                    if _img_url is not None:
                        _message.append(response.urljoin(_img_url))
                        _message.append("\n")
 
            # extract imgs attached below the post body, on pages laid out like
            # http://hzbike.com/forum.php?mod=viewthread&tid=118823&page=1&authorid=22591
            _img_list = _root_item.xpath("div[@class='pattl']/ignore_js_op")
            for _img in _img_list:
                _img_url = _img.xpath(".//img/@file").extract_first()
                if _img_url is not None:
                    _img_desc = _img.xpath(".//p[@class='mbn xg2']/text()").extract_first()
                    if _img_desc is not None:
                        _message.append(_img_desc)
                        _message.append("\n")
                    _message.append(response.urljoin(_img_url))
                    _message.append("\n")
        _item['data'] = "".join(_message).encode("utf8")
        yield _item

        next_page = response.xpath("//div[@class='pgt']/div[@class='pg']/a[@class='nxt']/@href")
        if next_page:
            url = response.urljoin(next_page.extract_first())
            yield scrapy.Request(url, callback=self.parse_articles_follow_next_page)
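
Both callbacks populate a `crawldata` item whose definition isn't included in the excerpt. A plausible minimal version, assuming it is a plain `scrapy.Item` with exactly the fields assigned above, would be:

# Hypothetical sketch of the crawldata item assumed by both examples;
# only the field names are taken from the assignments in the code above.
import scrapy

class crawldata(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()
    postdate = scrapy.Field()
    data = scrapy.Field()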