def get_link(self, response):
    item = ManhuaItem()
    result = response.xpath('//script/text()').extract()
    # The page embeds the image tag inside a script block; pull the relative
    # path out of it (dots escaped so '.jpg' and '.htm' match literally).
    href = re.search(r"<IMG SRC=[\s\S]*?(newkuku[\s\S]*?\.jpg)'>", result[0])
    jpg_url = 'http://n5.1whour.com/' + href.group(1)
    floder_name = response.meta['floder_name']
    # The page number is the numeric part of the requested URL, e.g. '.../3.htm'.
    page = re.search(r'/(\d+)\.htm', response.meta['url']).group(1)
    item['url'] = jpg_url
    item['floder_name'] = floder_name
    item['filename'] = floder_name + '/' + page + '.jpg'
    yield item
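
# Standalone sanity check for the regex above (not part of the spider); the
# sample script text and image path below are invented stand-ins:
import re

sample = """document.write("<IMG SRC='"+m201304d+"newkuku/2023/01/0001.jpg'>")"""
m = re.search(r"<IMG SRC=[\s\S]*?(newkuku[\s\S]*?\.jpg)'>", sample)
assert m and m.group(1) == 'newkuku/2023/01/0001.jpg'
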
def parse(self, response):
    item = ManhuaItem()
    item['img_urls'] = response.xpath(
        '/html/body/div[@class="main"]/div[1]/div[1]/div[2]/div[4]/a/img/@src'
    ).extract()
    yield item
    # Follow the "next" link (the 10th anchor in the pager). Check the raw
    # href first: concatenating the prefix onto None would raise a TypeError,
    # and the joined string would otherwise always be truthy.
    new_url_first = response.xpath(
        '/html/body/div[2]/div[1]/div[1]/div[3]/a[10]/@href'
    ).extract_first()
    if new_url_first:
        new_url = 'http://www.xieeqiao.com/manhua/' + new_url_first
        yield scrapy.Request(new_url, callback=self.parse)
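
# Sketch: response.urljoin (a thin wrapper over urllib.parse.urljoin) is a more
# robust way to build the next-page URL than hard-coding the site prefix; the
# example URLs here are made up:
from urllib.parse import urljoin

assert (urljoin('http://www.xieeqiao.com/manhua/12345/1.html', '2.html')
        == 'http://www.xieeqiao.com/manhua/12345/2.html')
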
def parse_tu(self, response):
    neirong = response.xpath('/html/body/script[2]/text()').extract()[0]
    # The script packs every page's image URL into one '","'-joined string.
    pattern = re.compile(r'http:.*jpg-smh\.middle')
    urljpg = pattern.findall(neirong)[0]
    jpg = re.split(r'","', urljpg)
    for url in jpg:
        item = ManhuaItem()  # one fresh item per image
        item['path'] = os.path.join('ok', url.split('/')[-2])
        item['filename'] = url.split('/')[-1].split('-')[0]
        item['image_url'] = url
        yield item
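
# Standalone sketch of the split/naming logic above on invented data: the site
# script joins all page URLs with '","', and each filename's leading digits
# give the page number:
import re

packed = 'http://img.example/vol1/001-p.jpg","http://img.example/vol1/002-p.jpg-smh.middle'
pages = re.split(r'","', packed)
assert pages[1].split('/')[-2] == 'vol1'
assert pages[1].split('/')[-1].split('-')[0] == '002'
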
def mai(self, response):
    URL = 'https://manhua.dmzj.com'
    for i in response.css('div.cartoon_online_border ul li').extract():
        text = ManhuaItem()
        i = pq(i)
        text['name'] = i('a').attr('title')
        text['chapter'] = i('a').text()
        # Each chapter is assumed to span at most 12 pages, addressed by a
        # '#@page=N' fragment.
        text['link'] = [
            URL + i('a').attr('href') + '#@page=' + str(t)
            for t in range(1, 13)
        ]
        yield text
        for uri in text['link']:
            yield scrapy.Request(url=uri, callback=self.download)
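
# Minimal pyquery demo of the attr/text calls used above; the <li> markup is a
# made-up stand-in for the dmzj chapter list:
from pyquery import PyQuery as pq

li = pq('<li><a title="Some Comic" href="/view/1.shtml">Chapter 1</a></li>')
assert li('a').attr('title') == 'Some Comic'
assert li('a').attr('href') == '/view/1.shtml'
assert li('a').text() == 'Chapter 1'
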
def parse1(self, response):
    urls = response.xpath('//dd/a[1]/@href').extract()
    dir_names = response.xpath('//dd/a[1]/text()').extract()
    for index in range(len(urls)):
        # Directory names look like '<title> <N>话'; pull out the chapter
        # number N and keep only chapters inside the configured window,
        # unless everything was requested.
        order = float(dir_names[index].split(' ')[1][:-1])
        if self.is_all or self.start_order_down <= order <= self.start_order_up:
            item = ManhuaItem()
            item['link_url'] = self.server_link + urls[index]
            item['dir_name'] = dir_names[index]
            yield scrapy.Request(url=item['link_url'], meta={'item': item},
                                 callback=self.parse2)
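
# Standalone check of the chapter-number parsing above; the directory name is
# invented, but follows the '<title> <N>话' shape the spider expects:
name = 'SomeComic 1052话'
order = float(name.split(' ')[1][:-1])
assert order == 1052.0
assert 1000 <= order <= 1100  # the start_order_down / start_order_up window
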
def download(self, response):
    try:
        text = ManhuaItem()
        text['name'] = response.css(
            'h1.hotrmtexth1 a::attr(title)').extract_first()
        text['chapter'] = response.css('span.redhot1::text').extract_first()
        text['link'] = 'https:' + response.xpath(
            '//*[@id="center_box"]/img/@src').extract_first()
        self.logger.debug(str(text))
        yield text
    except TypeError:
        # extract_first() returned None for a selector, so the string
        # concatenation above failed; skip this page.
        self.logger.warning('failed to parse %s', response.url)
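
# Standalone illustration of the failure mode the try/except above guards
# against: extract_first() returns None on a missed selector, and prefixing
# None with 'https:' raises TypeError:
missed_selector_result = None
try:
    'https:' + missed_selector_result
except TypeError:
    pass  # this is the path the except clause above takes
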
def get_chapterurls(self, response):
    result = response.xpath('//td[@align="center"]/text()').extract()
    text = result[2]
    # The cell reads '共N页' ("N pages in total").
    result = re.search(r'共(\d+)页', text)
    name = result.group(0)
    maxnum = result.group(1)
    floder_name = response.meta['floder_name'] + ' ' + name
    # Page 1 is the response we already have, so extract its image directly...
    item = ManhuaItem()
    result2 = response.xpath('//script/text()').extract()
    href = re.search(r"<IMG SRC=[\s\S]*?(newkuku[\s\S]*?\.jpg)'>", result2[0])
    item['url'] = 'http://n5.1whour.com/' + href.group(1)
    item['floder_name'] = floder_name
    item['filename'] = floder_name + '/1.jpg'
    yield item
    # ...and request pages 2..N; page 1 was already handled above.
    for i in range(2, int(maxnum) + 1):
        photo_url = response.meta['new_url'].replace('1.htm', str(i) + '.htm')
        yield scrapy.Request(photo_url, headers=self.headers,
                             callback=self.get_link,
                             meta={'floder_name': floder_name, 'url': photo_url},
                             dont_filter=True)
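
# Quick check of the page-count regex above ('共N页' means "N pages in total");
# the cell text is an invented sample:
import re

m = re.search(r'共(\d+)页', '第1页 共25页')
assert m.group(0) == '共25页' and int(m.group(1)) == 25
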
def parse(self, response):
    imgs = response.xpath('//*[@id="comicContain"]/li')
    pattern = 'https://manhua.qpic.cn/manhua_detail'
    imgtitle = response.css('title::text').extract_first().split('-')[0]
    count = 1
    for i in imgs:
        imgurl = i.xpath('img/@src').extract_first() or ''
        # Keep only real comic frames hosted on the detail CDN.
        if imgurl.startswith(pattern):
            item = ManhuaItem()  # fresh item per image, not one shared instance
            item['title'] = imgtitle.encode('utf-8')
            item['url'] = imgurl.encode('utf-8')
            item['imgname'] = 'img' + str(count)
            count += 1
            yield item
    next_page = response.xpath('//*[@id="mainControlNext"]/@href').extract_first()
    if next_page is not None:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
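
# Why a fresh ManhuaItem is created inside the loop above: yielding one shared,
# mutated object hands downstream code several references to the same thing, so
# later assignments clobber earlier yields. A minimal demonstration with a dict:
shared, results = {}, []
for n in range(3):
    shared['imgname'] = 'img' + str(n + 1)
    results.append(shared)              # three references to one object
assert results[0]['imgname'] == 'img3'  # the first "item" was overwritten
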
def parse_detailed(self, response):
    item = ManhuaItem()
    jshtml = response.xpath('//div[@class="jshtml"]')
    # Book title
    item['name'] = jshtml.xpath(
        './ul/li[1]/text()').extract_first().split(':')[-1]
    # Cover image
    item['img'] = response.xpath(
        '//div[@id="offlinebtn-container"]/img/@data-url').extract_first()
    # Status
    item['state'] = jshtml.xpath(
        './ul/li[2]/text()').extract_first().split(':')[-1]
    # Author
    item['author'] = jshtml.xpath(
        './ul/li[3]/text()').extract_first().split(':')[-1]
    # Genre
    item['_type'] = jshtml.xpath(
        './ul/li[4]/text()').extract_first().split(':')[-1]
    # Synopsis
    item['title'] = jshtml.xpath(
        './div/div[@class="wz clearfix t1"]/div/text()').extract_first()
    # Last updated
    item['update'] = jshtml.xpath('./ul/li[5]/text()').extract_first()
    # Chapters: map chapter title -> absolute URL, oldest chapter first.
    data_chapter = {}
    for chapter in response.xpath('//ul[@id="topic1"]//li')[::-1]:
        chapter_name = chapter.xpath('./a/@title').extract_first()
        href = chapter.xpath('./a/@href').extract_first()
        chapter_url = furl(response.url).remove(path=True).join(href).url
        data_chapter[chapter_name] = chapter_url
    # Serialize as real JSON so titles containing quotes stay valid
    # (requires a module-level `import json`).
    item['chapter'] = json.dumps(data_chapter, ensure_ascii=False)
    yield item
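
# Hedged sketch of the ManhuaItem this last spider assumes: the field names are
# read off the assignments above, but the class body itself is a guess at the
# project's items.py, not the real definition:
import scrapy

class ManhuaItem(scrapy.Item):
    name = scrapy.Field()
    img = scrapy.Field()
    state = scrapy.Field()
    author = scrapy.Field()
    _type = scrapy.Field()
    title = scrapy.Field()
    update = scrapy.Field()
    chapter = scrapy.Field()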