Code Example #1
File: tieba3.py  Project: pol9111/baidu_tieba_web
    def item_parse(self, response):

        # Each thread on the Tieba list page is an <li class=" j_thread_list clearfix"> node.
        each_page = response.xpath('//li[@class=" j_thread_list clearfix"]')

        for tiezi in each_page:
            item = SRedisItem()
            item['id'] = tiezi.xpath('./div/div[2]/div[1]/div[1]/a/@href'
                                     ).extract_first(default='N/A')
            item['title'] = tiezi.xpath(
                './/div[@class="threadlist_lz clearfix"]/div/a/text()'
            ).extract_first(default='N/A')
            item['author'] = tiezi.xpath(
                './/span[@class="tb_icon_author "]/@title|.//span[@class="tb_icon_author no_icon_author"]/@title'
            ).re_first(r'主题作者:\s(.*)')
            item['create_time'] = tiezi.xpath(
                './/span[@class="pull-right is_show_create_time"]/text()'
            ).extract_first(default='N/A')
            item['reply_num'] = tiezi.xpath(
                './/span[@class="threadlist_rep_num center_text"]/text()'
            ).extract_first(default='N/A')
            item['last_reply'] = tiezi.xpath(
                './/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()'
            ).re_first(r'\r\n\s*(\d+:\d+|\d+-\d+)\s*')
            item['content'] = tiezi.xpath(
                './div/div[2]/div[2]/div[1]/div/text()').extract_first(
                    default='N/A').strip()
            # content = tiezi.xpath('./div/div[2]/div[2]/div[1]/div/text()').extract_first(default='N/A')
            # item['content'] = re.sub('[\n\t\r\s]', '', content)

            yield item
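Both examples populate an SRedisItem, whose definition is not shown in these snippets. Below is a minimal sketch of what such an item class could look like, assuming a standard scrapy.Item with one Field per key used above; the actual definition lives elsewhere in the project (typically items.py) and may differ, and the field comments are only illustrative.

import scrapy


class SRedisItem(scrapy.Item):
    # One Tieba thread scraped from a list page.
    id = scrapy.Field()           # thread link href, e.g. "/p/..."
    title = scrapy.Field()        # thread title text
    author = scrapy.Field()       # thread author name
    create_time = scrapy.Field()  # creation date shown in the list
    reply_num = scrapy.Field()    # reply count
    last_reply = scrapy.Field()   # time of the last reply
    content = scrapy.Field()      # thread preview text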
Code Example #2
File: tieba4.py  Project: pol9111/baidu_tieba_web
    def parse(self, response):

        # Same list-page extraction as item_parse in Code Example #1, plus pagination at the end.
        each_page = response.xpath('//li[@class=" j_thread_list clearfix"]')

        for tiezi in each_page:
            item = SRedisItem()
            item['id'] = tiezi.xpath('./div/div[2]/div[1]/div[1]/a/@href'
                                     ).extract_first(default='N/A')
            item['title'] = tiezi.xpath(
                './/div[@class="threadlist_lz clearfix"]/div/a/text()'
            ).extract_first(default='N/A')
            item['author'] = tiezi.xpath(
                './/span[@class="tb_icon_author "]/@title|.//span[@class="tb_icon_author no_icon_author"]/@title'
            ).re_first(r'主题作者:\s(.*)')
            item['create_time'] = tiezi.xpath(
                './/span[@class="pull-right is_show_create_time"]/text()'
            ).extract_first(default='N/A')
            item['reply_num'] = tiezi.xpath(
                './/span[@class="threadlist_rep_num center_text"]/text()'
            ).extract_first(default='N/A')
            item['last_reply'] = tiezi.xpath(
                './/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()'
            ).re_first(r'\r\n\s*(\d+:\d+|\d+-\d+)\s*')
            content = tiezi.xpath('./div/div[2]/div[2]/div[1]/div/text()'
                                  ).extract_first(default='N/A')
            # Strip all whitespace from the preview text.
            item['content'] = re.sub(r'[\n\t\r\s]', '', content)
            yield item
        # Read the current pn= offset from the URL and queue follow-up page offsets.
        page = int(re.findall(r'pn=(\d+)', response.url)[0])
        if page:
            for i in range(page, page + 10000):
                yield scrapy.Request(url=self.base_url + str(i),
                                     callback=self.parse)
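The parse method above refers to self.base_url, so it belongs to a spider class that is not shown. Below is a minimal, self-contained sketch of such a spider, assuming a plain scrapy.Spider and a base_url that ends with the pn= query parameter; the spider name, base_url value, and start URL are all assumptions (the item name SRedisItem suggests the real project may instead use a scrapy_redis RedisSpider fed start URLs from Redis).

import re

import scrapy


class TiebaSpider(scrapy.Spider):
    # Hypothetical wrapper; only the parse body is shown in Code Example #2 above.
    name = 'tieba'
    # Assumed: base_url ends with "pn=", so base_url + str(i) selects a page offset.
    base_url = 'https://tieba.baidu.com/f?kw=python&ie=utf-8&pn='
    start_urls = [base_url + '0']

    def parse(self, response):
        # ... body exactly as in Code Example #2 above ...
        pass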