Example #1
class QbSpiderSpider(scrapy.Spider):
    name = 'qb_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        duanzidivs = response.xpath("//div[@class='col1 old-style-col1']/div")
        for duanzidiv in duanzidivs:
            # Build a fresh item per post; a shared class-level item would be
            # overwritten by concurrent requests.
            item = QiushibaikeItem()
            item['author'] = duanzidiv.xpath(".//h2/text()").get().strip()
            # Truncated posts have a second <span> in the content div: follow the
            # link to the full post and let parse_all_content fill in the content.
            if duanzidiv.xpath(".//div[@class='content']/span[2]"):
                all_link = duanzidiv.xpath("./a/@href").get()
                content_url = "https://www.qiushibaike.com" + all_link
                yield scrapy.Request(url=content_url,
                                     callback=self.parse_all_content,
                                     meta={'item': item})
            else:
                content = "".join(
                    duanzidiv.xpath(".//div[@class='content']/span//text()").
                    getall()).strip()
                item['content'] = content
                yield item
        # Crawl pages 2 through 20; alternatively, follow the "next page" link
        # for as long as one exists.
        for i in range(2, 21):
            next_url = "https://www.qiushibaike.com/text/page/" + str(i) + "/"
            yield scrapy.Request(url=next_url, callback=self.parse)

    def parse_all_content(self, response):
        item = response.meta['item']
        all_content = response.xpath(
            "//div[@class='content']//text()").getall()
        item['content'] = ''.join(all_content).strip()
        yield item
Example #2
    def get_page(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        # print(soup.prettify())
        articles = soup.find_all('div', class_='article block untagged mb15')

        print('Now on page ' + str(response.meta['page']))
        for article in articles:
            item = QiushibaikeItem()
            article_id = article.get('id')[11:]
            name = article.find('div', class_='author clearfix').find('h2').get_text()
            content = article.find('div', class_='content').find('span').get_text()

            agreed_number = article.find('span', class_='stats-vote').find('i').get_text()

            item['page_number'] = str(response.meta['page'])
            item['id'] = article_id
            item['name'] = name
            item['content'] = content
            item['agreed_number'] = agreed_number

            # print(item)
            yield item
Example #3
    def parse(self, response):
        print('\nstart  {} ......\n'.format(response.url))
        content_left_div = response.xpath('//*[@id="content-left"]')
        content_list_div = content_left_div.xpath('./div')

        for content_div in content_list_div:
            try:
                item = QiushibaikeItem()
                try:
                    item['author'] = content_div.xpath(
                        './div/a[2]/h2/text()').get().strip()
                except AttributeError:
                    item['author'] = content_div.xpath(
                        './div/span[2]/h2/text()').get().strip()
                # item['content'] = "".join(content_div.xpath('./a/div[@class="content"]/span/text()').getall()).strip().replace('\n', '')
                item['content'] = "".join(
                    content_div.xpath(
                        './a[contains(@href, "article")]/div[@class="content"]/span/text()'
                    ).getall()).strip().replace('\n', '')
                item['_id'] = content_div.attrib['id']
                yield item
            except Exception as e:
                print(response.url)
                print("item error", e.args)

        next_page = response.xpath(
            '//*[@id="content-left"]/ul/li[last()]/a/@href').get()

        if next_page:
            next_page = 'https://www.qiushibaike.com' + next_page
            yield scrapy.Request(url=next_page, callback=self.parse)
Example #4
    def parse(self, response):
        info_list = response.xpath('//*[@id="content-left"]/div')
        for info in info_list:
            item = QiushibaikeItem()
            username = info.xpath('.//div[1]/a[2]/h2/text()').extract_first()
            if username:
                item['username'] = username.replace('\n', '')
                item['avatar'] = response.urljoin(
                    info.xpath('.//div[1]/a[1]/img/@src').extract_first())
                item['content'] = info.xpath(
                    './/a[1]/div/span[1]/text()').extract_first().replace(
                        ' ', '').replace('\n', '')
                contentImage = info.xpath(
                    './/div[2]/a/img/@src').extract_first()
                if contentImage:
                    item['contentImage'] = response.urljoin(contentImage)
                else:
                    item['contentImage'] = ''
                item['nextPage'] = response.urljoin(
                    response.xpath(
                        "//ul[@class='pagination']/li[last()]/a/@href").
                    extract_first())
                yield item

        nextPage = response.xpath(
            "//ul[@class='pagination']/li[last()]/a/@href").extract_first()
        if nextPage:
            nextUrl = response.urljoin(nextPage)
            yield scrapy.Request(nextUrl, self.parse)
Example #5
 def parse(self, response):
     # print(response)
     div_lists = response.xpath('//*[@id="content"]/div/div[2]/div')
     # print(div_lists)
     all_data = []
     for div in div_lists:
         # Take the first element of the list and convert it to a string
         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
         # Or select the first element directly and convert it to a string
         author = div.xpath(
             './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
         ).extract_first()
         # Returns a list of strings
         page_text = div.xpath('./a[1]/div/span/text()').extract()
         # Use join to turn the list into a single string
         page_text = ''.join(page_text)
         # print(author, page_text)
         # Terminal-based storage:
         #     dic = {
         #         'author': author,
         #         'page_text': page_text
         #     }
         #     all_data.append(dic)
         # return all_data
         # Pipeline-based storage
         item = QiushibaikeItem()
         item['author'] = author
         item['page_text'] = page_text
         # Submit the item to the pipeline
         yield item
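The commented-out block above contrasts storing results in a local list with pipeline-based storage. "Submitting the item to the pipeline" simply means that Scrapy hands every yielded item to the pipelines enabled in settings.py. Below is a minimal sketch of such a pipeline; the file name duanzi.json and the project path in the settings comment are assumptions, not part of the original project.

# pipelines.py -- minimal JSON-lines pipeline sketch (output file name is assumed)
import json

class QiushibaikePipeline:
    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        self.fp = open('duanzi.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Called for every item the spider yields.
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes: close the file.
        self.fp.close()

# Enable it in settings.py (module path assumed):
# ITEM_PIPELINES = {'qiushibaike.pipelines.QiushibaikePipeline': 300}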
Example #6
 def parse_item(self, response):
     for sel in response.xpath(
             '//div[@class="article block untagged mb15"]'):
         item = QiushibaikeItem()
         item['author'] = sel.xpath('.//h2/text()')[0].extract()
         item['duanzi'] = sel.xpath(
             'div[@class="content"]/text()').extract()
         yield item
Example #7
    def parse_item(self, response):

        for sel in response.xpath('//div[@class="article block untagged mb15"]'):
            item = QiushibaikeItem()
            item['author'] = sel.xpath('.//h2/text()')[0].extract()
            item['duanzi'] = sel.xpath('div[@class="content"]/text()').extract()
            yield item
Example #8
 def parse_item(self, response):
     i = QiushibaikeItem()
     i['content'] = response.xpath(
         '//div[@class="content"]/text()').extract()
     i['link'] = response.xpath('//link[@rel="canonical"]/@href').extract()
     print(i['content'])
     print(i['link'])
     print('')
     return i
Example #9
 def parse(self, response):
     content_left_node = response.xpath(
         "//div[@id='content-left']")  # locate the node region holding the posts
     div_node_list = content_left_node.xpath("./div")
     crawl_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     for div_node in div_node_list:
         item = QiushibaikeItem()
         loader = QItemLoader(item, selector=div_node)
         loader.add_xpath(
             'name',
             ".//div[@class='author clearfix']/a[contains(@onclick,'web-list-author-text')]/h2/text()",
         )
         loader.add_xpath('info', ".//div[@class='content']/span[1]//text()")
         loader.add_value('crawl_date', crawl_date)
         data = loader.load_item()
         print(data)
         yield data
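QItemLoader itself is not shown in this listing. For load_item() to return plain strings rather than lists of fragments, it would typically be an ItemLoader subclass with stripping and take-first processors. The sketch below is a guess under that assumption; the processor choices are not the original code.

# A guessed definition of QItemLoader; the real project may differ.
from itemloaders.processors import Join, MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class QItemLoader(ItemLoader):
    # Strip whitespace from every extracted value...
    default_input_processor = MapCompose(str.strip)
    # ...and keep only the first value for single-valued fields such as 'name'.
    default_output_processor = TakeFirst()
    # The post body is split across several text nodes; join them into one string.
    info_out = Join('')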
Example #10
    def parse(self, response):
        content_left_div = response.xpath('//*[@id="content"]/div/div[2]')
        content_list_div = content_left_div.xpath('./div')
        for content_div in content_list_div:
            item = QiushibaikeItem()
            item['author'] = content_div.xpath('./div[1]/a[2]/h2/text()').get()
            item['content'] = content_div.xpath('./a[1]/div/span/text()').getall()
            item['_id'] = content_div.attrib['id']
            yield item

        next_page = response.xpath(
            '//*[@id="content"]/div/div[2]/ul/li[8]/a').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
Example #11
    def parse(self, response):

        content_left_div = response.xpath('//*[@class="col1 old-style-col1"]')
        content_list_div = content_left_div.xpath('./div')

        for content_div in content_list_div:
            item = QiushibaikeItem()
            item['author'] = content_div.xpath('./div/a[2]/h2/text()').get()
            item['content'] = content_div.xpath('./a/div/span/text()').getall()
            item['_id'] = content_div.attrib['id']
            yield item
        try:
            next_page = response.xpath(
                '//*[@class="col1 old-style-col1"]/ul/li[last()]/a'
            ).attrib['href']
            yield response.follow(next_page, callback=self.parse)
        except KeyError:
            print("key error")
Example #12
 def parse(self, response):
     for qiushi in response.xpath('//div[@class="content-block clearfix"]/div[@id="content-left"]/div'):
         item = QiushibaikeItem()
         author = [x.strip() for x in qiushi.xpath('.//div[@class="author clearfix"]/a[2]/h2/text()').extract()]
         content = [x.strip() for x in qiushi.xpath('.//a/div[@class="content"]/span/text()').extract()]
         votes = [x.strip() for x in qiushi.xpath('.//div[@class="stats"]/span[1]//text()').getall()]
         comments = [x.strip() for x in qiushi.xpath('.//div[@class="stats"]/span[2]/a//text()').getall()]
         # Anonymous posts have no author link, so fall back to a fixed name.
         item['author'] = ''.join(author) if author else "匿名用户"
         item['content'] = ''.join(content)
         item['fullName'] = ''.join(votes)
         item['sum_comment'] = ''.join(comments)
         yield item
     next_page = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()
     if next_page:
         next_page = 'https://www.qiushibaike.com' + next_page[0]
         yield Request(next_page, headers=self.headers, callback=self.parse, dont_filter=True)
Example #13
    def parse(self, response):
        for item in response.xpath(
                '//div[@id="content-left"]/div[@class="article block untagged mb15"]'
        ):
            qiubai = QiushibaikeItem()

            icon = item.xpath(
                './div[@class="author clearfix"]/a[1]/img/@src').extract()
            if icon:
                icon = icon[0]
                qiubai['userIcon'] = icon
            userName = item.xpath(
                './div[@class="author clearfix"]/a[2]/h2/text()').extract()
            if userName:
                userName = userName[0]
                qiubai['userName'] = userName

            content = item.xpath(
                './a[@class="contentHerf"]/div[@class="content"]/span/descendant::text()'
            ).extract()
            if content:
                # Join the text fragments instead of concatenating in a loop
                # (also avoids shadowing the built-in str).
                qiubai['content'] = ''.join(content)

            like = item.xpath(
                './div[@class="stats"]/span[@class="stats-vote"]/i/text()'
            ).extract()
            if like:
                like = like[0]
                qiubai['like'] = like

            comment = item.xpath(
                './div[@class="stats"]/span[@class="stats-comments"]/a/i/text()'
            ).extract()
            if comment:
                comment = comment[0]
                qiubai['comment'] = comment

            yield qiubai
Example #14
    def parse(self, response):
        # Parse the author name and the joke content.
        div_list = response.xpath('//*[@id="content"]/div/div[2]/div')
        for div in div_list:
            # xpath returns a list whose elements are always Selector objects;
            # extract() pulls out the string held in each Selector's data
            # attribute and returns the strings as a list.
            # author = div.xpath('.//h2/text()').extract()[0]
            author = div.xpath('.//h2/text()').extract_first().strip()
            # Calling extract() on the list extracts every Selector's data
            # string; join the resulting list into a single string.
            content = ''.join(div.xpath('./a/div/span/text()').extract()).strip()
            # print(author, content)

            item = QiushibaikeItem()
            item['author'] = author
            item['content'] = content

            # Submit the item to the pipeline.
            yield item
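The comments in this example describe how xpath() returns Selector objects and how extract()/extract_first() turn them into strings. A tiny standalone illustration of that behavior, using made-up markup purely for the demonstration:

from scrapy import Selector

# Invented HTML, just to show the Selector/extract relationship.
sel = Selector(text='<div><h2> user_1 </h2><span>line one</span><span>line two</span></div>')

spans = sel.xpath('//span/text()')       # SelectorList of Selector objects
print(spans.extract())                   # ['line one', 'line two'] -- every data string
print(spans.extract_first())             # 'line one' -- just the first one
print(''.join(spans.extract()))          # 'line oneline two' -- joined into one string
print(sel.xpath('//h2/text()').extract_first().strip())  # 'user_1'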
Example #15
    def parse(self, response):
        content_left_div = response.xpath("//*[@id='content']/div/div[2]")
        content_list_div = content_left_div.xpath('./div')

        for content_div in content_list_div:
            item = QiushibaikeItem()
            item['author'] = content_div.xpath('./div/a[2]/h2/text()').get()
            item['content'] = content_div.xpath('./a/div/span/text()').getall()
            item['_id'] = content_div.attrib['id']
            yield item

        next_page = response.xpath(
            '/html/body/div[1]/div/div[2]/ul/li[8]/a').attrib['href']

        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)


# items.py: defines the fields of the data we want to store.
# middlewares.py: middleware; hooks into the crawl so you can act on requests and responses as they pass through.
# pipelines.py: defines how scraped items are persisted, e.g. where you would set up a MySQL or MongoDB connection.
# settings.py: holds the project's configuration, such as default request headers.
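For reference, the QiushibaikeItem used throughout these examples lives in items.py. The exact field set varies from example to example (author/content/_id here, name/page_number elsewhere), so the sketch below only covers the fields used in Example #15 and is an assumption, not the original file:

# items.py -- fields inferred from the spider above, not the original source
import scrapy

class QiushibaikeItem(scrapy.Item):
    author = scrapy.Field()   # poster's nickname
    content = scrapy.Field()  # list of text lines making up the post
    _id = scrapy.Field()      # the <div> id attribute, used as a unique key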
Example #16
 def parse(self, response):
     # selectorList
     divs = response.xpath('//div[@class="col1 old-style-col1"]/div')
     for div in divs:
         # selector
         author = div.xpath('.//h2/text()').get().strip()
         content = div.xpath('.//div[@class="content"]//text()').getall()
         content = ",".join(content).strip()
         # duanzi = {
         #     'author': author,
         #     'content': content
         # }
         # yield duanzi
         item = QiushibaikeItem(author=author, content=content)
         yield item
     next_url = response.xpath(
         "//ul[@class='pagination']/li[last()]/a/@href").get()
     if not next_url:
         return
     else:
         yield scrapy.Request(self.base_domain + next_url,
                              callback=self.parse)
Example #17
    def parse(self, response):
        # page = response.url.split("/")[-2]
        # filename = 'qiushi-%s.html' % page
        # with open(filename, 'wb') as f:
        #     f.write(response.body)
        # self.log('Saved file %s' % filename)

        content_left_div = response.xpath('//*[@class="col1 old-style-col1"]')
        content_list_div = content_left_div.xpath('./div')

        for content_div in content_list_div:
            item = QiushibaikeItem()
            item['author'] = content_div.xpath('./div/a[2]/h2/text()').get()
            item['content'] = content_div.xpath('./a/div/span/text()').getall()
            item['_id'] = content_div.attrib['id']

            yield item

        next_page = response.xpath(
            '//*[@class="col1 old-style-col1"]/ul/li[last()]/a').attrib['href']

        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
Example #18
    def parse(self, response):
        # Query inside each article block with relative XPath; an absolute
        # query would re-match the whole document on every iteration.
        for info in response.xpath(
                '//div[contains(@class, "article block untagged mb15")]'):
            item = QiushibaikeItem()
            content = ''.join(info.xpath('./a/div/span/text()').extract())
            author = ''.join(info.xpath('./div/a[2]/h2/text()').extract())
            item['content'] = content.replace('\n', '').strip()
            item['author'] = author.replace('\n', '').strip()
            item['fancy'] = info.xpath('./div/span/i/text()').extract()
            item['comment'] = info.xpath('./div/span/a/i/text()').extract()
            yield item

        next_page = response.xpath(
            '//span[@class="current"]/../following-sibling::li[1]/a/@href'
        ).extract_first()
        if next_page is not None:
            # next_page = 'http:' + next_page
            yield response.follow(next_page,
                                  headers=self.headers,
                                  callback=self.parse)
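The comment at the top of Example #18 warns that an absolute XPath inside the loop would re-query the whole page instead of the current article block. A small self-contained demonstration of the difference, with markup invented for the demo:

from scrapy import Selector

# Two article blocks; invented markup just to contrast relative and absolute queries.
sel = Selector(text="""
<div class="article"><span>first post</span></div>
<div class="article"><span>second post</span></div>
""")

for block in sel.xpath('//div[@class="article"]'):
    # Relative to the current block: one result per iteration.
    print(block.xpath('./span/text()').get())
    # Absolute: matches every <span> in the document on every iteration.
    print(block.xpath('//span/text()').getall())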