def parse1(self, response):
    hxs = Selector(response)
    items = []
    # Chapter link URLs
    urls = hxs.xpath('//dd/a[1]/@href').extract()
    # Chapter names
    dir_names = hxs.xpath('//dd/a[1]/text()').extract()
    # Save each chapter link together with its name
    for index in range(len(urls)):
        item = ComicItem()
        item['link_url'] = self.server_link + urls[index]
        item['dir_name'] = dir_names[index]
        items.append(item)
    # Send a Request for each chapter link, passing the item along via meta;
    # the [-13:-1] slice requests only the 12 chapters before the newest one
    for item in items[-13:-1]:
        yield scrapy.Request(url=item['link_url'], meta={'item': item}, callback=self.parse2)
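All of these snippets assume a ComicItem with link_url and dir_name fields, plus the usual Scrapy imports. A minimal sketch of those assumed definitions (the field names are taken from the usage above; the rest is the standard scrapy.Item pattern, not code from the original spider):

import scrapy
from scrapy.selector import Selector


class ComicItem(scrapy.Item):
    # Field names inferred from the parse1 snippets
    link_url = scrapy.Field()   # absolute URL of one chapter page
    dir_name = scrapy.Field()   # chapter title, later used as a directory name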
    def parse1(self, response):
        hxs = Selector(response)
        items = []
        # Chapter link URLs
        urls = hxs.xpath('//dd/a[1]/@href').extract()
        # Chapter names
        dir_names = hxs.xpath('//dd/a[1]/text()').extract()
        # Save each chapter link together with its name
        for index in range(len(urls)):
            item = ComicItem()
            item['link_url'] = self.server_link + urls[index]
            item['dir_name'] = dir_names[index]
            items.append(item)

        # Send a Request for each chapter link, passing the item along via meta
        for item in items:
            yield scrapy.Request(url=item['link_url'],
                                 meta={'item': item},
                                 callback=self.parse2)
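None of the examples show parse2 itself; all they guarantee is that the chapter item arrives via response.meta. A minimal sketch of such a callback, assuming a purely hypothetical image XPath (the real selector depends on the target site's markup, and the image_urls field would need a matching scrapy.Field() on ComicItem):

    def parse2(self, response):
        # Recover the item passed along by parse1
        item = response.meta['item']
        # Hypothetical XPath: substitute the selector that matches the real page
        item['image_urls'] = response.xpath('//img[@id="comic"]/@src').extract()
        yield item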
Example #3
    def parse1(self, response):
        hxs = Selector(response)
        items = []
        # Chapter link URLs
        urls = hxs.xpath('//dd/a[1]/@href').extract()
        # Chapter names
        dir_names = hxs.xpath('//dd/a[1]/text()').extract()
        # Save each chapter link together with its name
        # (spaces in names are replaced to make safe directory names)
        for index in range(len(urls)):
            item = ComicItem()
            item['link_url'] = self.server_link + urls[index]
            item['dir_name'] = dir_names[index].replace(" ", "_")
            items.append(item)
        # Compare against the chapter count recorded on the last run, then
        # record the new total so only newly published episodes are fetched
        print('Current file amount: ' + str(len(items)))
        print('Index of last update: ' + lastlen)
        with open(BASE_DIR + '/logg.txt', 'w') as w:
            w.write(str(len(items)))

        # Send a Request only for chapters added since the last run,
        # passing the item along via meta
        for item in items[int(lastlen):]:
            print(item['dir_name'])
            yield scrapy.Request(url=item['link_url'], meta={'item': item}, callback=self.parse2)
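The third example also depends on two module-level names that are never shown: BASE_DIR and lastlen, the chapter count written to logg.txt by the previous run. A plausible initialization, hedged, that is consistent with how both names are used above (lastlen stays a string, matching the concatenation and the int(lastlen) call):

import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# lastlen: chapter count recorded by the previous run, kept as a string
try:
    with open(BASE_DIR + '/logg.txt') as f:
        lastlen = f.read().strip()
except FileNotFoundError:
    lastlen = '0'   # first run: download every chapter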