# Shared imports for all of the spider callbacks below.
import re
import time

import scrapy
from scrapy.selector import Selector

from ..items import CartoonItem  # adjust to the project's actual items module


def parse(self, response):
    sel = Selector(response)
    # The comment thread id is embedded in the page's inline JS.
    threadId = re.search(r'thread_id\s*:\s*(\d*)', response.text).group(1)
    comicId = response.url.split("/")[-1].split(".")[0]
    item = CartoonItem()
    item['name'] = "".join(sel.css('h1.fl::text').extract()).strip()
    item['url'] = response.url
    item['hitNum'] = "".join(sel.css('div.line1>i::text').extract()).strip()
    # Counts such as "3.5万" carry a ten-thousand suffix; normalize to an int.
    searchObj = re.search(u'(.*)万', item['hitNum'])
    if searchObj:
        item['hitNum'] = int(float(searchObj.group(1)) * 10000)
    else:
        item['hitNum'] = int(item['hitNum'])
    item['collectionNum'] = int("".join(
        sel.css('a.btn_stored span i::text').extract()).strip())
    item['likeNum'] = int("".join(
        sel.css('i#comic_month_ticket_num::text').extract()).strip())
    item['caiNum'] = -1  # the site exposes no dislike count
    item['webName'] = "有妖气"
    item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
    # Comment statistics come from a separate AJAX endpoint; hand the
    # half-filled item to the next callback via request.meta.
    commentApiUrl = ("http://www.u17.com/comment/ajax.php?mod=thread"
                     "&act=get_comment_php_v4&sort=create_time"
                     "&thread_id=" + threadId + "&page=1&comic_id=" + comicId)
    request = scrapy.Request(commentApiUrl, callback=self.moreparse)
    request.meta['item'] = item
    return request
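# The "N万" normalization above recurs in several of the spiders below; a
# small helper could factor it out. A minimal sketch -- the name
# parse_cn_count is mine, not part of the original project:
def parse_cn_count(text):
    """Convert a count string such as '3.5万' or '1234' to an int."""
    match = re.search(u'(.*)万', text)
    if match:
        return int(float(match.group(1)) * 10000)
    return int(text)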
def parse(self, response):
    sel = Selector(response)
    item = CartoonItem()
    item['name'] = "".join(
        sel.css('h2.works-intro-title strong::text').extract()).strip()
    item['url'] = response.url
    # The hit count is rendered with thousands separators, e.g. "1,234,567".
    item['hitNum'] = "".join(
        sel.css('p.works-intro-digi>span:nth-of-type(2)>em::text').extract()
    ).replace(',', '')
    item['collectionNum'] = int("".join(
        sel.css('em#coll_count::text').extract()).replace(',', ''))
    # item['commentNum'] = "".join(sel.css('em.commen-ft-ts::text').extract()).strip()
    item['likeNum'] = int("".join(
        sel.css('strong#redcount::text').extract()).strip())
    item['caiNum'] = int("".join(
        sel.css('ul.works-vote-list>li:nth-of-type(2)>strong::text')
        .extract()).strip())
    item['webName'] = "腾讯漫画"
    # The comic id is the seventh path segment of the detail-page URL.
    kid = response.url.split('/')[6]
    commentUrl = ("http://ac.qq.com/Community/topicList?targetId=" + kid
                  + "&page=1")
    request = scrapy.Request(commentUrl, callback=self.moreparse)
    request.meta['item'] = item
    return request
def parse(self, response):
    sel = Selector(response)
    # The comment API requires the page's CSRF token and the book id.
    csrfToken = sel.css("input#j-csrf::attr(value)").extract()[0].strip()
    bookId = response.url.split("/")[-1]
    item = CartoonItem()
    item['name'] = "".join(sel.css('h1.m-source-title::text').extract()).strip()
    item['url'] = response.url
    item['hitNum'] = "".join(
        sel.css('div.g-cols--float>div.g-col:nth-of-type(1)'
                '>div.metadata:nth-of-type(2)::text').re(u'人气:(.*)')).strip()
    # Counts such as "3.5万" carry a ten-thousand suffix; normalize to an int.
    searchObj = re.search(u'(.*)万', item['hitNum'])
    if searchObj:
        item['hitNum'] = int(float(searchObj.group(1)) * 10000)
    else:
        item['hitNum'] = int(item['hitNum'])
    item['collectionNum'] = -1  # not exposed on the page
    item['likeNum'] = -1
    item['caiNum'] = -1
    item['webName'] = "网易漫画"
    item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
    commentApiUrl = ("http://manhua.163.com/comment/" + bookId
                     + "/comments?csrfToken=" + csrfToken
                     + "&bookId=" + bookId + "&page=1")
    request = scrapy.Request(commentApiUrl, callback=self.moreparse)
    request.meta['item'] = item
    return request
def parse(self, response):
    sel = Selector(response)
    item = CartoonItem()
    item['name'] = "".join(
        sel.css('ul.synopsises_font>li:nth-of-type(2)'
                '>span:nth-of-type(1)::text').extract()).strip()
    item['url'] = response.url
    item['hitNum'] = "".join(
        sel.css('ul.synopsises_font>li:nth-of-type(2)'
                '>span:nth-last-of-type(1)::text').extract()).strip()
    # Counts such as "3.5万" carry a ten-thousand suffix; normalize to an int.
    searchObj = re.search(u'(.*)万', item['hitNum'])
    if searchObj:
        item['hitNum'] = int(float(searchObj.group(1)) * 10000)
    else:
        item['hitNum'] = int(item['hitNum'])
    item['collectionNum'] = int("".join(
        sel.css('a#Mark2Pocket small::text').extract()).strip())
    # The total comment count is embedded in the "全部N" ("all N") link text.
    item['commentNum'] = int(sel.css(
        'div.wrap_left div.content_left2>span:nth-of-type(1)>span>a::text'
    ).re(r'全部(\d*)')[0])
    item['likeNum'] = int("".join(
        sel.css('a#DoLike small::text').extract()).strip())
    item['caiNum'] = -1
    item['webName'] = "sf互动传媒"
    item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
    # Everything needed is on this page, so the item is returned directly.
    return item
def parse(self, response):
    # Chapter link URLs.
    urls = response.xpath(
        ".//tr//dl[@id='comiclistn']/dd/a[2]/@href").extract()
    # Chapter names.
    dir_names = response.xpath(
        ".//tr//dl[@id='comiclistn']/dd/a[1]/text()").extract()
    # Save each chapter's link and name, then request the chapter page,
    # passing the item along via meta.
    for index in range(len(urls)):
        item = CartoonItem()
        item['link_url'] = urls[index]
        item['dir_name'] = dir_names[index]
        yield scrapy.Request(url=item['link_url'], meta={'item': item},
                             callback=self.parsecartoon)
def parse1(self, response):
    hxs = Selector(response)
    items = []
    urls = hxs.xpath('//dd/a[1]/@href').extract()  # chapter link URLs
    dir_names = hxs.xpath('//dd/a[1]/text()').extract()  # chapter names
    # Save each chapter's link and name.
    for index in range(len(urls)):
        item = CartoonItem()
        item['link_url'] = self.server_link + urls[index]
        item['dir_name'] = dir_names[index]
        items.append(item)
    # Request each chapter page (only a recent slice here), passing the
    # item along via meta.
    for item in items[-13:-1]:
        yield scrapy.Request(url=item['link_url'], meta={'item': item},
                             callback=self.parse2)
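# Neither parsecartoon nor parse2 appears in this collection. The usual
# shape of such a chapter callback is sketched below; the item field
# image_urls and the XPath are placeholders of mine, not the project's:
def parse2(self, response):
    item = response.meta['item']  # recover the item passed via meta
    # Placeholder selector; the real page-image extraction is site-specific.
    item['image_urls'] = response.xpath('//img/@src').extract()
    yield item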
def parse(self, response):
    item = CartoonItem()
    item['url'] = response.url
    # The comic name and object id are embedded in the page's inline JS.
    item['name'] = re.search(r"comic_name\s*=\s*'(.*)'",
                             response.text).group(1)
    typeId = re.search(r'obj_id\s*=\s*"(\d*)', response.text).group(1)
    # Vote and hit figures come from a separate AJAX endpoint.
    infoApiUrl = "http://i.dmzj.com/ajax/ding?callback=json&typeid=" + typeId
    request = scrapy.Request(infoApiUrl, callback=self.moreparse)
    request.meta['typeId'] = typeId
    request.meta['item'] = item
    return request
def parse(self, response):
    sel = Selector(response)
    item = CartoonItem()
    # The title sits after the ">>" separator in the breadcrumb line.
    item['name'] = "".join(sel.css('div.weizhi::text').re(u'>>(.*)')).strip()
    item['url'] = response.url
    item['likeNum'] = -1
    item['caiNum'] = -1
    item['webName'] = "捧秀漫画"
    # The comic id (kid) is the fifth path segment of the detail-page URL.
    kid = response.url.split('/')
    commentApiUrl = ("http://www.pengxiu.com/comment.do"
                     "?doing=comment_web_ajaxlook2&kind=book&kid=" + kid[4])
    request = scrapy.Request(commentApiUrl, callback=self.moreparse)
    request.meta['item'] = item
    request.meta['kid'] = kid[4]
    return request
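# Every spider above defers comment statistics to a moreparse callback
# that is not included in this collection. A minimal sketch of the
# pattern, assuming a JSON payload with a hypothetical 'total' field:
import json


def moreparse(self, response):
    item = response.meta['item']  # item handed over by parse()
    data = json.loads(response.text)  # comment API payload
    item['commentNum'] = data.get('total', -1)  # 'total' is a hypothetical key
    return item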