Example #1
0
    def parse_content(self, response):
        """Build an article item from ``response`` and yield it if it passes the keyword filter.

        The title is taken from ``.biaoti h1 span font`` (falling back to the
        response URL when that selector matches nothing). The body is the
        ``.zuo_nr`` element with any ``biaoti`` heading element removed.

        The item is yielded at most once: always when ``self.keywords`` is
        empty, otherwise only when at least one keyword is a substring of
        ``str(item)``.

        Args:
            response: Page response exposing ``css()`` and ``url``.

        Yields:
            A populated ``items.HedespiderItem``.
        """
        title = response.css('.biaoti h1 span font::text').get()
        if title is None:
            title = response.url
        # Keep illegal characters out of the article title.
        title = tools.reshape_title(title)

        # NOTE(review): if '.zuo_nr' matches nothing, `content` is presumably
        # None and is handed to `bs` unchanged -- confirm whether such pages
        # should be skipped instead.
        content = response.css('.zuo_nr').get()
        soup = bs(content, 'lxml')
        # The heading may be missing (the title fallback above already allows
        # for that), so only detach it when it was actually found.
        heading = soup.find(class_='biaoti')
        if heading is not None:
            heading.extract()
        content = soup.prettify()
        # Strip font formatting and images.
        content = tools.reshape_content(content)

        path = tools.reshape_path(self.name)

        item = items.HedespiderItem()
        item['title'] = title
        item['content'] = content
        item['path'] = path
        item['userid'] = self.userid

        # No keywords configured: every item passes.
        if not self.keywords:
            yield item
            return

        # The item does not change while filtering, so render it once.
        item_text = str(item)
        if any(keyword in item_text for keyword in self.keywords):
            yield item
Example #2
0
    def parse_content(self, response):
        """Turn a page response into an item and yield it if it matches the keyword filter.

        The title comes from the ``.tit`` element text, or the response URL
        when that selector matches nothing; the body comes from ``.content``.
        The item is yielded at most once: unconditionally when
        ``self.keywords`` is empty, otherwise only if some keyword is a
        substring of ``str(item)``.

        Args:
            response: Page response exposing ``css()`` and ``url``.

        Yields:
            A populated ``items.HedespiderItem``.
        """
        raw_title = response.css('.tit::text').get()
        # Keep illegal characters out of the article title.
        clean_title = tools.reshape_title(
            response.url if raw_title is None else raw_title
        )

        # Strip font formatting and images.
        clean_body = tools.reshape_content(response.css('.content').get())

        target_path = tools.reshape_path(self.name)

        item = items.HedespiderItem()
        item['title'] = clean_title
        item['content'] = clean_body
        item['path'] = target_path
        item['userid'] = self.userid

        # An empty keyword list means nothing is filtered out.
        if len(self.keywords) == 0:
            yield item
            return

        # Emit once as soon as any keyword appears in the item's string form.
        if any(word in str(item) for word in self.keywords):
            yield item