Example #1
    def parse(self, response):

        # Extract the data
        # Each book's info lives in an <article class="product_pod">
        books = response.css('article.product_pod')
        for book in books:
            # Create a fresh item per book; a single shared item would be
            # overwritten on every iteration
            item = BookItem()

            # Book title
            # name = book.css('h3 a::text').extract_first()
            name = book.xpath('./h3/a/@title').extract_first()

            # Book price
            # price = book.css('p.price_color::text').extract_first()
            price = book.xpath('./div[@class="product_price"]/p/text()').extract_first()

            # Book cover image URL
            imgUrl = book.xpath('./div[@class="image_container"]/a/img/@src').extract_first()
            item['name'] = name
            item['price'] = price
            item['imgUrl'] = response.urljoin(imgUrl)  # resolve the relative src
            yield item

        # Extract the next-page link
        next_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
        # next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
        if next_url:
            # Build the absolute next-page URL
            next_url = response.urljoin(next_url)
            # Schedule a new Request for it
            yield scrapy.Request(next_url, callback=self.parse)
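
All of these snippets assume a BookItem declared in the project's items.py. The fields differ from example to example; a minimal sketch matching Example #1's fields (the class layout is the standard scrapy.Item pattern, not taken from the source) would be:

# items.py -- minimal sketch for Example #1
import scrapy

class BookItem(scrapy.Item):
    name = scrapy.Field()    # book title
    price = scrapy.Field()   # display price string, e.g. "£51.77"
    imgUrl = scrapy.Field()  # absolute cover-image URL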
Example #2
    def parse_chapter(self, response):
        item = BookItem()
        item['c_title'] = response.css('h1::text').extract_first()
        item['content'] = response.xpath(
            '//*[@id="content"]/p/text()').extract()

        yield item
Example #3
    def parse(self, response):
        try:
            node_list = response.xpath("//div[@class='book-info']")

            for node in node_list:
                item = BookItem()
                item['book_name'] = node.xpath("./h3/a/text()").extract()[0]
                item['book_type'] = node.xpath(
                    "./p[1]/span[1]/text()").extract()[0]
                item['book_stat'] = node.xpath(
                    "./p[1]/span[2]/text()").extract()[0]
                item['book_author'] = node.xpath(
                    "./p[1]/span[3]/text()").extract()[0]
                yield item

            if self.pageNum != 16:
                self.pageNum += 1
                yield scrapy.Request(self.baseURL + str(self.pageNum),
                                     callback=self.parse)
        except Exception as e:
            print(e)
Example #4
 def detil(self, response):
     image_url = response.xpath(
         "//div[@class='detail']/a/img/@src").extract_first()
     book_name = response.xpath(
         "//div[@class='detail']/div/h1/text()").extract_first()
     bool_jianjie = response.xpath(
         "//div[@class='detail']/div/div/div/text()").extract_first()
     bool_jianjie = bool_jianjie.strip()
     read_url1 = response.xpath(
         "//a[@class='reader']/@href").extract_first()
     book_type = response.xpath(
         "//a[@class='c009900']/text()").extract_first()
     booker = response.xpath(
         "//dl[@class='bookso']/dd/text()").extract_first()
     try:
         str_list = read_url1.split('/')
         book_id = str_list[-1]
     except AttributeError:
         # read_url1 is None when the page has no reader link
         book_id = 403
     r = BookItem(image_url=image_url,
                  book_name=book_name,
                  bool_jianjie=bool_jianjie,
                  book_id=book_id,
                  book_type=book_type,
                  booker=booker)
     yield r
     text = scrapy.Request(url=read_url1,
                           callback=self.read1,
                           meta={'book_id': book_id})
     yield text
Example #5
 def parse(self, response):
     # Top-level category groups
     div_list = response.xpath(
         "//div[@class='con flq_body']/div[4]")  # too much data overall; only the 4th div (the fiction categories) is scraped
     for div in div_list:
         item = BookItem()
         item["b_cate"] = div.xpath("./dl/dt//text()").extract()
         item["b_cate"] = [
             i.strip() for i in item["b_cate"] if len(i.strip()) > 0
         ]
         # Mid-level category groups
         dl_list = div.xpath("./div//dl[@class='inner_dl']")
         for dl in dl_list:
             item["m_cate"] = dl.xpath("./dt//text()").extract()
             item["m_cate"] = [
                 i.strip() for i in item["m_cate"] if len(i.strip()) > 0
             ][0]
             # Leaf category groups
             a_list = dl.xpath("./dd/a")
             for a in a_list:
                 item["s_href"] = a.xpath("./@href").extract_first()
                 item["s_cate"] = a.xpath("./text()").extract_first()
                 if item["s_href"] is not None:
                     # deepcopy: the same item instance is mutated on every
                     # iteration, so each request needs its own snapshot
                     yield scrapy.Request(item["s_href"],
                                          callback=self.parse_book_list,
                                          meta={"item": deepcopy(item)})
Example #6
    def parse_item(self, response):
        print('bar')
        item = BookItem()

        name_info = response.xpath('//div[@class="name_info"]')
        item['title'] = name_info.xpath('./h1/@title').extract()[0]
        item['intro'] = name_info.xpath('./h2/span/@title').extract()[0]

        messbox_info = response.xpath('//div[@class="messbox_info"]')
        item['author'] = messbox_info.xpath(
            './/span[@id="author"]/a/text()').extract()[0]
        item['publisher'] = messbox_info.xpath(
            './/span[@ddt-area="003"]/a/text()').extract()[0]
        item['star'] = messbox_info.xpath(
            './/span[@class="star"]/@style').extract()[0]

        item['image'] = response.xpath(
            '//div[@class="pic_info"]//img/@src').extract()[0]
        item['price'] = response.xpath(
            '//div[@class="price_pc"]//p/text()').extract()[0]

        pro_content = response.xpath('//div[@class="pro_content"]')
        item['ISBN'] = pro_content.xpath(
            './ul[@class="key clearfix"]/li/text()').extract()[9]
        item['tag0'] = pro_content.xpath(
            './/span[@class="lie"]/a/text()').extract()[0]
        item['tag1'] = pro_content.xpath(
            './/span[@class="lie"]/a/text()').extract()[1]
        item['tag2'] = pro_content.xpath(
            './/span[@class="lie"]/a/text()').extract()[2]
        item['link'] = response.url
        item['web'] = 'Dangdang'

        # yield directly; appending to self.items and returning the whole list
        # would re-emit every previously scraped item on each call
        yield item
Example #7
 def parse(self, response):
     books = response.xpath('//ul[@class="bang_list"]/li')
     # follow every book's detail link; the item itself is built in parse_item
     for url in books.xpath('.//div[@class="pic"]/a/@href').extract():
         print('foo {}'.format(url))
         yield scrapy.Request(url=url, callback=self.parse_item)
Example #8
    def parse(self, response):

        '''
        start_requests has already fetched the page; this method defines how
        to extract what we want from it, using XPath, regular expressions, or
        CSS selectors. The Scrapy flow it illustrates is:
        1. define the links;
        2. fetch (download) the pages behind those links;
        3. define rules and extract the data.
        '''
        meta = response.meta

        filename = response.xpath('//div[@class="bookname"]/h1/text()').extract_first()
        if filename is not None:
            filename = filename.replace("正文", "").strip()  # strip the "正文" ("main text") prefix
        else:
            return

        next_flag = meta.get('next', 0)
        url = meta.get('url', '')

        # Save the data
        item = BookItem()
        item['url'] = url
        item['title'] = filename
        contents = response.xpath('//div[@id="content"]/text()').extract()
        content = ''
        for it in contents:
            if it == '\r\n':
                continue
            content += it + '\r\n'
        item['content'] = content
        yield item

        # Do not look for the next page automatically; stop here
        if next_flag == 0:
            return

        taga = response.xpath('//div[@class="bottem1"]/a')  # candidate next-page links

        next_page = ''
        for a in taga:
            ts = a.xpath('text()').extract_first()
            if ts == '下一章':  # the link whose text is "Next chapter"
                next_page = a.xpath('@href').extract_first()
                break
        logging.info(next_page)
        if next_page.strip() != '':  # is there a next page?
            next_page = response.urljoin(next_page)
            # requesting too fast makes the site return an error page
            yield scrapy.Request(next_page, meta={"url": next_page, "next": 1}, callback=self.parse)
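
The three-step flow the docstring describes, as a self-contained sketch (spider name and selectors are illustrative; the URL reuses the books.toscrape.com site from Example #1):

import scrapy

class MinimalSpider(scrapy.Spider):
    name = 'minimal'
    # 1. define the links
    start_urls = ['http://books.toscrape.com/']

    # 2. Scrapy downloads each link and hands the response to parse()
    def parse(self, response):
        # 3. define rules and extract the data
        for title in response.css('article.product_pod h3 a::attr(title)').getall():
            yield {'title': title}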
Example #9
 def parse_page1(self, response):
     '''
     Keep in mind that item behaves like a dict.
     '''
     item = BookItem()
     request = scrapy.Request("http://www.example.com/some_page.html",
                              callback=self.parse_page2)
     request.meta['item'] = item
     return request
     # For example, when scraping products on Taobao, the first-level crawl
     # picks up the title and price, ...
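
This is the first half of the standard Scrapy pattern for carrying an item across requests. The matching second callback is not shown in the example; a sketch (the other_url field is illustrative) would read the item back out of response.meta:

 def parse_page2(self, response):
     # retrieve the item stashed by parse_page1 and finish populating it
     item = response.meta['item']
     item['other_url'] = response.url
     return item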
Example #10
    def parse(self, response):

        for article in response.xpath("//article[@class = 'product_pod']"):
            loader = ItemLoader(item=BookItem(), selector=article)
            img_link = article.xpath(
                ".//div[@class = 'image_container']/a/img/@src").get()
            absolute_url = response.urljoin(img_link)
            loader.add_value('image_urls', absolute_url)
            loader.add_xpath('book_name', './/h3/a/@title')

            yield loader.load_item()
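
Loading image_urls only has an effect if the images pipeline is switched on, and the pipeline expects the item to expose image_urls and images fields. A minimal settings.py sketch (standard Scrapy settings; the storage path is a placeholder):

# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = './images'  # where downloaded covers are written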
Example #11
 def parse(self, response):
     title_xpath = "//div[@class='bookname']/h1/text()"
     body_xpath = "//div[@id='content']/text()"
     next_page_xpath = "//div[@class='bottem1']/a[3]/@href"
     # the site serves GBK, so decode manually and build one Selector to reuse
     sel = Selector(text=response.body.decode('gbk'))
     item = BookItem()
     item["title"] = sel.xpath(title_xpath).extract()[0]
     item["body"] = '\n'.join(sel.xpath(body_xpath).extract()[1:])
     yield item
     url = sel.xpath(next_page_xpath).extract()[0]
     yield scrapy.Request("https://www.23txt.com" + url)
Example #12
    def parse_content(self, response, **kwargs):
        item = BookItem()
        soup = BeautifulSoup(response.text, "lxml")
        same_name = response.cb_kwargs.get('book_name')
        item['book_name'] = same_name
        item['chapter_title'] = soup.find('font', attrs={'size': '4'}).text
        item['chapter_content'] = soup.find('td', attrs={'width': '820'}).text
        yield item

        next_page = soup.find('td', attrs={'width': '28%'}).find('a')
        if next_page:
            yield response.follow(next_page.get('href'), callback=self.parse_content, cb_kwargs={"book_name": same_name})
Example #13
 def parse(self, response):
     item = BookItem()
     sel = Selector(response)
     imgs = sel.xpath('//*[@class="doulist-item"]')
     item['url'] = []
     item['name'] = []
     for img in imgs:
         site = img.xpath('div/div[2]/div[2]/a/img/@src').extract_first()
         img_name = img.xpath('div/div[2]/div[3]/a/text()').extract_first()
         img_name = img_name.split()[0]
         item['url'].append(site)
         item['name'].append(img_name)
     # yield once, after both lists are complete; yielding inside the loop
     # would emit the same partially filled item once per entry
     yield item
Example #14
File: jd.py Project: ywx1992/spider
    def parse(self, response):
        dt_list = response.xpath("//div[@class='mc']/dl/dt")  # top-level category list
        for dt in dt_list:
            item = BookItem()
            item["b_cate"] = dt.xpath("./a/text()").extract_first()
            em_list = dt.xpath("./following-sibling::dd[1]/em")  # sub-category list
            for em in em_list:
                item["s_cate"] = em.xpath("./a/text()").extract_first()
                item["s_href"] = em.xpath("./a/@href").extract_first()

                if item["s_href"] is not None:
                    item["s_href"] = "https:" + item["s_href"]
                    yield scrapy.Request(item["s_href"],
                                         callback=self.parse_book_list,
                                         meta={"item": deepcopy(item)})
Example #15
 def get_chapterurl(self, response):
     item = BookItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     # parse the page once and reuse the soup instead of re-parsing three times
     soup = BeautifulSoup(response.text, 'lxml')
     category = soup.find('table').find('a').get_text()
     author = soup.find('table').find_all('td')[1].get_text()
     bash_url = soup.find('p', class_='btnlinks').find('a', class_='read')['href']
     name_id = str(bash_url)[-6:-1].replace('/', '')
     item['category'] = str(category).replace('/', '')
     item['author'] = str(author).replace('/', '')
     item['name_id'] = name_id
     return item
Example #16
 def parse(self, response):
     nodes = response.xpath('//div[@class="article"]//tr[@class="item"]')
     for node in nodes:
         # one fresh item per row; a single shared item would be overwritten
         item = BookItem()
         name = node.xpath('td[2]/div[1]/a/text()').extract_first().strip()
         summary = node.xpath('td[2]/p[2]/span/text()').extract_first()
         item['name'] = name
         item['summary'] = summary
         yield item
     next_urls = response.xpath(
         '//div[@class="paginator"]//span[@class="next"]/a/@href'
     ).extract_first()
     if next_urls:
         yield Request(url=parse.urljoin(response.url, next_urls),
                       callback=self.parse)
Example #17
 def parse_content(self, response):
     '''
     Scrape a chapter's content.
     '''
     arr = BookItem()
     arr['list_name'] = response.meta['list_name']
     arr['list_url'] = response.meta['list_url']
     arr['num'] = response.meta['num']
     arr['desc'] = response.meta['desc']
     arr['file_name'] = response.meta['file_name']
     title = response.xpath('//h2/text()').extract_first()
     content = response.xpath('//div[@id="box"]//p[@class="Text"]/text()').extract()
     arr['content'] = "\n".join(content)
     arr['title'] = title
     yield arr
Example #18
 def parse(self, response):
     if 'cid' in response.url:
         urls = response.xpath('//a[@class="a_7 fl"]/@href').extract()
         for url in urls:
             yield scrapy.Request(url)
     elif 'view' in response.url:
         item = BookItem()
         item['name'] = response.xpath(
             '//h2[@class="h_10"]/text()').extract()[0].replace('\t', '')
         item['price'] = response.xpath(
             '//h2[@class="h_10"]/span/text()').extract()[0]
         item['isbn'] = response.xpath(
             '//div[@class="div_47 fix"]/span/text()').extract()[1].replace(
                 'ISBN:', '')
         item['url'] = response.url
         yield item
Example #19
 def parse(self, response):
     li_list = response.xpath('//ul[@class="ulwrap"]/li')
     for li in li_list:
         item = BookItem()
         item['b_cate'] = li.xpath('./div[1]/a/text()').extract_first()  # top-level category name
         a_list = li.xpath('./div[2]/a')
         for a in a_list:
             item['s_href'] = a.xpath("./@href").extract_first()  # sub-category link
             item["s_cate"] = a.xpath('./text()').extract_first()  # sub-category name
             if item["s_href"] is not None:
                 item["s_href"] = "http://snbook.suning.com" + item["s_href"]  # make the link absolute
                 yield scrapy.Request(
                     url=item['s_href'],
                     callback=self.parse_book_list,
                     meta=deepcopy(item)
                 )
Example #20
    def parse_book(self, response):
        book = BookItem()

        item = response.css('div.product_main')
        book['name'] = item.css('div.product_main h1::text').extract_first()
        book['price'] = item.css('p.price_color::text').extract_first()
        book['review_rating'] = item.css(
            'p.star-rating::attr(class)').re_first('star-rating ([A-Za-z]+)')
        book['upc'] = response.css(
            'table.table-striped tr:nth-child(1) td::text').extract_first()
        book['stock'] = response.css(
            'table.table-striped tr:nth-child(6) td::text').re_first(
                r'In stock \((\d+) available\)')  # raw string so \( and \d stay regex escapes
        book['review_num'] = response.css(
            'table.table-striped tr:nth-child(7) td::text').extract_first()

        yield book
Example #21
    def parse_list(self, response):
        item = BookItem()

        content_title = response.css('div.cont_title')
        item['title'] = content_title.css('h1::text').extract_first()
        item['author'] = content_title.css(
            'div.hslice>p.entry-title>a::text').extract_first()
        item['thumb'] = content_title.css(
            'div.hslice>div.thumb>img::attr(src)').extract_first()

        details = response.css('div#book_detail')
        details.remove(details[0])  # drop the first block (the "latest chapters" list)
        chapters = details.css('ol>li')

        for c_list in chapters:
            c_link = c_list.css('a::attr(href)').extract_first()
            # response.follow() only builds the Request; it must be yielded
            yield response.follow(c_link, callback=self.parse_chapter)
Example #22
    def parse(self, response):
        print("-" * 10)
        # print("hello, world")
        item = BookItem()
        item['title'] = response.xpath("//a[@class='pic']/@title").extract()
        item['price'] = response.xpath("//span[@class='search_now_price']/text()").extract()
        item['pic'] = response.xpath("//a[@class='pic']/img/@data-original").extract()
        item['author'] = response.xpath("//a[@name='itemlist-author']/text()").extract()
        item['publish'] = response.xpath("//a[@name='P_cbs']/@title").extract()
        item['time'] = response.xpath("//p[@class='search_book_author']/span[2]/text()").extract()
        yield item

        # queue the first five search-result pages (the dupe filter drops repeats)
        for i in range(1, 6):
            url = "http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index=" + str(i)
            yield Request(url, callback=self.parse)
Example #23
    def parse_content(self, response):
        base = 'https://www.99csw.com'
        item = BookItem()
        soup = BeautifulSoup(response.text, "lxml")

        same_book_name = response.meta.get('book_name')
        item['book_name'] = same_book_name
        item['chapter_title'] = soup.find('h2').text
        item['chapter_content'] = soup.find('div', attrs={
            "id": "content"
        }).text
        yield item

        next_page = soup.find('a', attrs={'id': 'next'})
        if next_page:
            next_url = base + next_page.get('href')
            yield scrapy.Request(next_url,
                                 callback=self.parse_content,
                                 meta={'book_name': same_book_name})
Example #24
 def parse(self, response):
     item = BookItem()
     # Extract the product name, price, link, and review count via XPath
     item["name"] = response.xpath("//a[@class='pic']/@title").extract()
     item["price"] = response.xpath(
         "//span[@class='price_n']/text()").extract()
     item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     item["comnum"] = response.xpath(
         "//a[@name='itemlist-review']/text()").extract()
     # Return the item once everything is extracted
     yield item
     # The key step: loop over the first 20 result pages automatically
     for i in range(1, 21):
         # Build each page's URL from the pattern worked out above
         url = "http://search.dangdang.com/?key=python&act=input&show=big&page_index=" + str(
             i) + "#J_tab"
         # Yield a Request with the URL and callback to keep crawling
         yield Request(url, callback=self.parse)
Example #25
 def parse_book_info(self, response):
     soup = BeautifulSoup(response.text, 'lxml')
     item = BookItem()
     item['book_name'] = self.get_text(soup.select_one('h2.book-name'))
     item['author'] = soup.select_one('p.book-author').text.strip().split(
         '(')[0].strip().replace('\n', '').replace(' ', '').strip()
     item['translator'] = self.get_list_one_text(
         re.findall(re.compile('<a.*>(.*?)\\s+</a>\\s+\(译者\)'),
                    response.text))
     item['editor'] = self.get_list_one_text(
         re.findall(re.compile('维护人:\\S+<a.*>(.*?)</a>'), response.text))
     price_text = re.findall(re.compile('price\\S>(.*?)</span>'),
                             response.text)
     item['price'] = None if (
         len(price_text) < 2
         or price_text[1].find('¥') < 0) else price_text[1]
     item['isbn'] = self.get_list_one_text(
         re.findall(re.compile('书\\s+号\\S+>(.*?)\\s+</li>'), response.text))
     item['publish_status'] = self.get_list_one_text(
         re.findall(re.compile('出版状态\\S+>(.*?)\\s+</li>'), response.text))
     item['publish_date'] = self.get_list_one_text(
         re.findall(re.compile('出版日期\\S+>(.*?)\\s+</li>'), response.text))
     item['origin_book_name'] = self.get_list_one_text(
         re.findall(re.compile('原书名\\S+n>\\s+(.*?)\\s+</li>'),
                    response.text))
     item['origin_book_price'] = None
     item['pages'] = self.get_list_one_text(
         re.findall(re.compile('页\\s+数\\S+>(.*?)\\s+</li>'), response.text))
     item['format'] = self.get_list_one_text(
         re.findall(re.compile('开\\s+本\\S+>(.*?)\\s+</li>'), response.text))
     item['introduction'] = self.get_text(soup.select_one('div#abstract'))
     item['origin_book_isbn'] = self.get_list_one_text(
         re.findall(re.compile('原书号\\S+>(.*?)\\s+</li>'), response.text))
     item['avatar'] = soup.select_one('div.book-detail-img  > a > img').get(
         'src')
     item['tags'] = self.get_tags_text(
         soup.select('div.block-tag > div.block-body > ul > li > a'))
     item['book_url'] = response.request.url
     item['website'] = '博文视点'
      # special case: treat an empty ISBN string as missing
      if str(item['isbn']).strip() == '':
          item['isbn'] = None
     yield item
Example #26
    def parse_detail(self, response):

        ipQuery = response.meta["ipQuery"]

        html = response.text
        item = BookItem()
        item["name"] = response.xpath(
            '//h1[@id="itemDisplayName"]/text()').extract_first().strip()

        # The price is rendered client-side by JS; two options:
        # 1. render with Splash
        # 2. reverse-engineer the JS price endpoint (done below)

        luaUrl = "https:" + re.findall(r'"luaUrl":"(.*?)"', html)[0]
        passPartNumber = re.findall(r'"passPartNumber":"(.*?)"', html)[0]
        partNumber = re.findall(r'"partNumber":"(.*?)"', html)[0]
        vendorCode = re.findall(r'"vendorCode":"(.*?)"', html)[0]
        provinceCode = ipQuery["provinceCommerceId"]
        lesCityId = ipQuery["cityLESId"]
        lesDistrictId = ipQuery["districtLESId"]
        a = lesCityId + lesDistrictId + "01"
        category1 = re.findall(r'"category1":"(.*?)"', html)[0]
        mdmCityId = ipQuery["cityMDMId"]
        cityId = ipQuery["cityCommerceId"]
        districtId = ipQuery["districtCommerceId"]
        cmmdtyType = re.findall(r'"cmmdtyType":"(.*?)"', html)[0]
        custLevel = ""
        mountType = re.findall(r'"mountType":"(.*?)"', html)[0]

        if mountType != "03":
            b = ""
        else:
            b = mountType

        catenIds = re.findall(r'"catenIds":"(.*?)"', html)[0]
        weight = re.findall(r'"weight":"(.*?)"', html)[0]
        e = ""

        # the price endpoint packs every parameter into one long
        # underscore-separated path segment
        price_url = (
            luaUrl + "/nspcsale_0_" + "_".join([
                passPartNumber, partNumber, vendorCode, provinceCode,
                lesCityId, a, category1, mdmCityId, cityId, districtId,
                cmmdtyType, custLevel, b, catenIds, weight
            ]) + "___" + e + ".html"
        )

        yield scrapy.Request(url=price_url,
                             callback=self.parse_price,
                             meta={"item": item})
Example #27
 def parse(self, response):
     item = BookItem()
     content = response.css('div.sons:nth-child(2) > div:nth-child(1)')
     for con in content:
         cen = con.css('head > meta:nth-child(5)::attr(content)').extract()
         name = con.css(
             'div.sons:nth-child(2) > div:nth-child(1) > h1:nth-child(2)::text'
         ).extract()
         time = con.css(
             'div.sons:nth-child(2) > div:nth-child(1) > p:nth-child(3) > a:nth-child(1)::text'
         ).extract()
         zuozhe = con.css(  # zuozhe (作者) = author
             'div.sons:nth-child(2) > div:nth-child(1) > p:nth-child(3) > a:nth-child(3)::text'
         ).extract()
         item['cen'] = cen
         item['name'] = name
         item['time'] = time
         item['zuozhe'] = zuozhe
         yield item
Example #28
 def parse_price(self, response):
     item = BookItem()
     item['title'] = response.xpath(
         '//div[@class="name_info"]/h1/@title').extract()
     item['comment_num'] = response.xpath(
         '//a[@id="comm_num_down"]/text()').extract()
     item['link'] = response.url
     item['price'] = response.xpath(
         '//p[@id="dd-price"]/text()').extract()[1]
     item['img_url'] = response.xpath(
         '//img[@id="modalBigImg"]/@src').extract()
     item['cate_1'] = response.xpath(
         '//div[@class="breadcrumb"]/a[@class="domain"]/b/text()').extract()
     item['cate_2'] = response.xpath(
         '//div[@class="breadcrumb"]/a/text()').extract()[0]
     item['cate_3'] = response.xpath(
         '//div[@class="breadcrumb"]/a/text()').extract()[1]
     yield item
Example #29
 def parse(self, response):
     # raw strings so \[, \{, \d, etc. are regex escapes, not string escapes
     menu_pattern = re.compile(r"menu:(\[{.*}\]).*?submenu:", re.S)
     result = menu_pattern.findall(response.text)
     pattern = re.compile(r"NAME: '(.*?)',URL: '(.*?)',id: '\d+',children:")
     cata_list = pattern.findall(result[0])
     for cata in cata_list[1:]:
         item = BookItem()
         item['cata_title'] = cata[0]
         item['cata_url'] = cata[1]
         if not item['cata_url'].startswith('//list'):
             num_pattern = re.compile(r".*/(\d+)-(\d+)-(\d+).html")
             result = num_pattern.findall(item['cata_url'])[0]
             item['cata_url'] = '//list.jd.com/list.html?cat={},{}&tid={}'.format(
                 result[0], result[1], result[2])
         item['cata_url'] = 'https:' + item['cata_url']
         # print(item)
         yield scrapy.Request(item['cata_url'],
                              callback=self.parse_list,
                              meta={'item': deepcopy(item)})
Example #30
    def parse_content(self, response):
        base = 'https://www.99csw.com'
        item = BookItem()
        soup = BeautifulSoup(response.text, "lxml")

        same_book_name = response.meta.get('book_name')
        item['book_name'] = same_book_name
        item['chapter_title'] = soup.find('h2').text
        item['chapter_content'] = soup.find('div', attrs={"id": "content"}).text
        yield item

        next_page = soup.find('a', attrs={'id': 'next'})
        if next_page:
            next_url = base + next_page.get('href')
            js = "window.scrollTo(0,document.body.scrollHeight)"
            # SeleniumRequest's first positional parameter is wait_time, not
            # url, so the URL must be passed by keyword
            yield SeleniumRequest(url=next_url,
                                  script=js,
                                  # handler=self._handle_js,
                                  callback=self.parse_content,
                                  meta={'book_name': same_book_name})
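
SeleniumRequest comes from the scrapy-selenium package, which must be wired into the project settings before it does anything. A minimal sketch (the driver path is a placeholder):

# settings.py -- scrapy-selenium wiring
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/usr/local/bin/chromedriver'  # placeholder path
SELENIUM_DRIVER_ARGUMENTS = ['--headless']
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}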