Example #1
 def parse(self, response):
     item = DaomubijiItem()
     item['title'] = response.xpath('//h1[@class="focusbox-title"]/text()').extract_first()  # extract the novel's title
     items = response.xpath('//article[@class="excerpt excerpt-c3"]')
     for href in items:
         detail_href = href.xpath('./a/@href').extract_first()  # extract the chapter's body link
         yield scrapy.Request(url=detail_href, meta={'item': item}, callback=self.get_content)
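Every example on this page fills in a DaomubijiItem, but the items.py that defines it is not shown. A minimal sketch, assuming nothing beyond the field names that Examples #1 and #2 actually assign:

import scrapy

class DaomubijiItem(scrapy.Item):
    # Fields referenced by the spiders in Examples #1 and #2; a real
    # project declares one Field per key the spider assigns.
    title = scrapy.Field()
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterUrl = scrapy.Field()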
Example #2
 def parse(self, response):
     selector = Selector(response)
     mulus = selector.xpath(
         "//div[@class='mulu']/div[@class='mulu-title']/center/h2/text()")
     boxs = selector.xpath("//div[@class='mulu']/div[@class='box']")
     for i in range(len(mulus)):
         mulu = mulus[i]  # one table-of-contents heading
         box = boxs[i]  # the box of chapter links under that heading
         texts = box.xpath(".//ul/li/a/text()").extract()  # extract the link texts
         urls = box.xpath(".//ul/li/a/@href").extract()  # extract the link URLs
         for j in range(len(urls)):
             item = DaomubijiItem()
             item['bookName'] = mulu.extract()
             try:
                 item['bookTitle'] = texts[j].split(' ')[0]
                 if len(texts[j].split(' ')) == 3:
                     item['chapterNum'] = texts[j].split(' ')[1]
                     item['chapterName'] = texts[j].split(' ')[2]
                 item['chapterUrl'] = urls[j]
                 request = scrapy.Request(urls[j], callback=self.parseBody)
                 request.meta['item'] = item
                 yield request
             except Exception as e:
                 print('===', texts[j])
                 print('exception', e)
                 continue
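Example #2 forwards the half-filled item to a parseBody callback through request.meta; that callback is not included on this page. A minimal sketch of what it might look like, where the content XPath and the text field name are assumptions:

 def parseBody(self, response):
     # Recover the item attached to the request in parse().
     item = response.meta['item']
     # Hypothetical selector for the chapter's body paragraphs.
     paragraphs = response.xpath("//div[@class='content']//p/text()").extract()
     item['text'] = '\n'.join(paragraphs)
     yield item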
Example #3
    def parse_item(self, response):
        # Save the raw page for inspection.
        with open('first.html', 'wb') as f:
            f.write(response.body)

        hxp = Selector(response)
        urls = hxp.xpath(
            "//div[@class='excerpts-wrapper']/div[@class='excerpts']/article/a/@href"
        ).extract()
        titles = hxp.xpath(
            "//div[@class='excerpts-wrapper']/div[@class='excerpts']/article/a/text()"
        ).extract()
        print('==========================')
        for title in titles:
            print(title.split(' ')[1])
        for i in range(len(urls)):
            print('i = ' + str(i))
            arr = titles[i].split(' ')
            # A fresh item per iteration, so one chapter's data
            # cannot overwrite another's.
            item = DaomubijiItem()
            item['url'] = urls[i]
            item['chapter'] = arr[0]
            item['chapter_num'] = arr[1]
            if len(arr) >= 3:
                item['section'] = arr[2]
            item['name'] = '盗墓笔记'

            # time.sleep(1)
            yield item

        print('==========================')
Example #4
 def parse(self, response):
     article_lists = response.xpath(
         "//article[@class = 'excerpt excerpt-c3']")
     for article in article_lists:
         item = DaomubijiItem()
         chapter_name = article.xpath(".//a/text()").extract_first()
         chapter_urls = article.xpath(".//a/@href").extract_first()
         item['chapter_name'] = chapter_name
         item['chapter_urls'] = chapter_urls
         yield scrapy.Request(url=chapter_urls,
                              meta={'item': item},
                              callback=self.chapter_detail)
Example #5
 def parse(self, response):
     book_list = response.xpath("//table")
     for book_item in book_list:
         book_name = book_item.xpath(".//center/h2/text()").extract()[0]
         capture_list = book_item.xpath(".//tr/td/a")
         for capture_item in capture_list:
              item = DaomubijiItem()
              capture_name = capture_item.xpath("text()").extract()[0]
              capture_url = capture_item.xpath("@href").extract()[0]
              item["book_name"] = book_name
              item["capture_name"] = capture_name
              item["capture_url"] = capture_url
              yield item
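Example #5 yields items straight from parse without scheduling further requests, so it is easy to run standalone. A runner sketch, assuming the method lives on a hypothetical DaomubijiSpider class and a Scrapy version new enough (2.1+) for the FEEDS setting:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'FEEDS': {'chapters.json': {'format': 'json'}},  # export yielded items as JSON
})
process.crawl(DaomubijiSpider)  # hypothetical spider class containing the parse above
process.start()  # blocks until the crawl finishes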
Example #6
 def parse_chapter(self, response):
     item = DaomubijiItem()
     book_name = response.css('.content .item-3 a::text').extract_first()
     book_info = self.book_info_dict.get(book_name, '')
     chapter_name = response.css(
         '.content .article-title::text').extract_first()
     datetime = response.css('.content .item-1::text').extract_first()
     # Concatenate the chapter's paragraph HTML into one string.
     full_content = ''.join(
         response.css('.content .article-content p').extract())
     item['book_name'] = book_name
     item['book_info'] = book_info
     item['chapter_name'] = chapter_name
     item['datetime'] = datetime
     item['content'] = full_content
     yield item
Example #7
    def parse_b(self, response):
        item = DaomubijiItem()
        book_name = response.css('.item.item-3 a::text').extract_first()
        piece = response.css('.article-content p::text').extract()
        content = "\n".join(piece)
        chapter_name = response.css('.article-header h1::text').extract_first()
        item['book_name'] = book_name
        item['content'] = content
        item['chapter_name'] = chapter_name
        yield item

        next_url = response.css(
            '.article-nav-next a::attr(href)').extract_first()
        if next_url:
            yield Request(url=next_url, callback=self.parse_b)
        else:
            print('end')
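Example #7 walks the chapters as a linked list, re-queuing parse_b on each page's "next" link until none is left. Something has to issue the first request; a sketch of a plausible entry point, with a placeholder start URL:

    def start_requests(self):
        # Placeholder URL; the real first-chapter address is not shown on this page.
        yield Request(url='http://example.com/first-chapter.html', callback=self.parse_b)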
Example #8
    def parse_title(self, response):  # extract info from the sub-page
        # Save raw data to Archive
        url_character = response.request.url.split("/")[-1]
        chapter_file = "files/archives/chapter_index/daomuji_chapter_index_{ct}_{dt}.html".format(
            ct=url_character, dt=get_now(tab=False))
        with open(chapter_file, "wb") as f:
            f.write(response.body)

        selector = Selector(response)

        book_order_name = selector.xpath('//h1/text()').extract()[0]
        pos = book_order_name.find(u':')
        book_order = book_order_name[:pos]  # the book's number
        book_name = book_order_name[pos + 1:]  # the book's title

        chapter_list = selector.xpath(
            '//article[@class="excerpt excerpt-c3"]//text()').extract()
        chapter_link = selector.xpath(
            '//article[@class="excerpt excerpt-c3"]/a/@href').extract()
        chapter_link_flag = 0  # index into chapter_link
        for each in chapter_list:
            pos_first = each.find(' ')
            pos_last = each.rfind(' ')
            # chapter_first = ''
            chapter_mid = ''
            # chapter_last = ''
            if pos_first != pos_last:
                chapter_first = each[:pos_first]
                chapter_mid = each[(pos_first + 1):pos_last]
                chapter_last = each[pos_last + 1:]
            else:
                chapter_first = each[:pos_first]
                chapter_last = each[pos_last + 1:]

            # store the extracted info
            item = DaomubijiItem()
            item['bookOrder'] = book_order
            item['bookName'] = book_name
            item['chapterFirst'] = chapter_first
            item['chapterMid'] = chapter_mid
            item['chapterLast'] = chapter_last
            yield Request(chapter_link[chapter_link_flag],
                          callback=self.parse_content,
                          meta={'item': item})
            chapter_link_flag += 1
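Example #8 timestamps its archive filenames with a get_now helper that is not defined on this page. A plausible reconstruction, assuming tab=False selects a compact, filename-safe format:

from datetime import datetime

def get_now(tab=True):
    # Hypothetical helper matching the call in Example #8: a readable
    # timestamp by default, a compact one for use in filenames.
    fmt = '%Y-%m-%d %H:%M:%S' if tab else '%Y%m%d%H%M%S'
    return datetime.now().strftime(fmt)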
Example #9
    def parse(self, response):
        sel = scrapy.selector.Selector(response)  # build the selector from the response
        tables = sel.xpath('//table')
        items = []
        for each in tables:
            links = each.xpath('tr/td/a/@href').extract()  # no tbody: browsers insert it, the raw HTML does not
            contents = each.xpath('tr/td/a/text()').extract()
            for i in range(len(links)):  # iterate by index to pair each link with its title
                item = DaomubijiItem()
                item['link'] = links[i]
                try:
                    item['book'] = contents[i].split(' ')[0]
                    item['chapter'] = contents[i].split(' ')[1]
                except Exception as e:
                    continue

                try:
                    item['bookname'] = contents[i].split(' ')[2]
                except Exception as e:
                    item['bookname'] = contents[i].split(' ')[1][-3:]
                items.append(item)
        return items
Example #10
    def parse_title(self, response):  # extract info from the sub-page
        selector = Selector(response)

        book_order_name = selector.xpath('//h1/text()').extract()[0]
        pos = book_order_name.find(u':')
        book_order = book_order_name[:pos]  # the book's number
        book_name = book_order_name[pos + 1:]  # the book's title

        chapter_list = selector.xpath(
            '//article[@class="excerpt excerpt-c3"]//text()').extract()
        chapter_link = selector.xpath(
            '//article[@class="excerpt excerpt-c3"]/a/@href').extract()
        chapter_link_flag = 0  # index into chapter_link
        for each in chapter_list:
            pos_first = each.find(' ')
            pos_last = each.rfind(' ')
            chapter_first = ''
            chapter_mid = ''
            chapter_last = ''
            if pos_first != pos_last:
                chapter_first = each[:pos_first]
                chapter_mid = each[(pos_first + 1):pos_last]
                chapter_last = each[pos_last + 1:]
            else:
                chapter_first = each[:pos_first]
                chapter_last = each[pos_last + 1:]

            # store the extracted info
            item = DaomubijiItem()
            item['bookOrder'] = book_order
            item['bookName'] = book_name
            item['chapterFirst'] = chapter_first
            item['chapterMid'] = chapter_mid
            item['chapterLast'] = chapter_last
            yield Request(chapter_link[chapter_link_flag],
                          callback=self.parse_content,
                          meta={'item': item})
            chapter_link_flag += 1
Example #11
    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath('//table')
        items = []
        for each in table:
            content = each.xpath('tr/td/a/text()').extract()
            url = each.xpath('tr/td/a/@href').extract()
            for i in range(len(url)):
                # A fresh item per iteration, so a later chapter's data
                # does not overwrite an earlier one's.
                item = DaomubijiItem()
                item['link'] = url[i]
                # try/except catches malformed rows; on error the except body runs.
                try:
                    item['book'] = content[i].split(' ')[0]
                    item['chapter'] = content[i].split(' ')[1]
                except Exception as e:
                    continue

                try:
                    item['bookname'] = content[i].split(' ')[2]
                except Exception as e:
                    item['bookname'] = content[i].split(' ')[1][-3:]
                items.append(item)
        return items