Example #1
 def parse(self, response):
     sel = Selector(response)
     # use response.text (str), not response.body (bytes), so the str pattern matches
     threadId = re.search(r'thread_id\s*:\s*(\d*)', response.text).group(1)
     comicId = response.url.split("/")[-1].split(".")[0]
     item = CartoonItem()
     item['name'] = "".join(sel.css('h1.fl::text').extract()).strip()
     item['url'] = response.url
     item['hitNum'] = "".join(
         sel.css('div.line1>i::text').extract()).strip()
     searchObj = re.search(u'(.*)万', item['hitNum'])
     if searchObj:
         item['hitNum'] = int(float(searchObj.group(1)) * 10000)
     else:
         item['hitNum'] = int(item['hitNum'])
     item['collectionNum'] = int("".join(
         sel.css('a.btn_stored span i::text').extract()).strip())
     item['likeNum'] = int("".join(
         sel.css('i#comic_month_ticket_num::text').extract()).strip())
     item['caiNum'] = -1
     item['webName'] = "有妖气"
     item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
     commentApiUrl = "http://www.u17.com/comment/ajax.php?mod=thread&act=get_comment_php_v4&sort=create_time&thread_id=" + threadId + "&page=1&comic_id=" + comicId
     request = scrapy.Request(commentApiUrl, callback=self.moreparse)
     request.meta['item'] = item
     return request
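Every example on this page fills a CartoonItem, whose definition is never shown. A minimal sketch, with the field set inferred purely from how the items are used in these snippets (the real project's items.py may differ):

    import scrapy

    class CartoonItem(scrapy.Item):
        # Fields inferred from usage in the examples; assumed, not confirmed.
        name = scrapy.Field()           # comic title
        url = scrapy.Field()            # detail-page URL
        hitNum = scrapy.Field()         # view count
        collectionNum = scrapy.Field()  # bookmark count
        commentNum = scrapy.Field()     # comment count
        likeNum = scrapy.Field()        # like count
        caiNum = scrapy.Field()         # dislike count (-1 where unavailable)
        webName = scrapy.Field()        # source site name
        crawlTime = scrapy.Field()      # crawl timestamp
        link_url = scrapy.Field()       # chapter link (Examples #5 and #6)
        dir_name = scrapy.Field()       # chapter name (Examples #5 and #6)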
Example #2
    def parse(self, response):
        '''
        cmd = 'phantomjs constructDom.js "%s"' % response.url
        stdout,stderr = subprocess.Popen(cmd,shell= True,stdout = subprocess.PIPE,stderr = subprocess.PIPE).communicate()
        f = file('code.txt', 'w+')
        f.writelines(stdout)
        #print (stdout)
        sel = Selector(text=stdout)
        '''
        sel = Selector(response)
        item = CartoonItem()
        item['name'] = "".join(
            sel.css('h2.works-intro-title strong::text').extract()).strip()
        item['url'] = response.url
        item['hitNum'] = "".join(
            sel.css('p.works-intro-digi>span:nth-of-type(2)>em::text').extract(
            )).replace(',', '')

        item['collectionNum'] = int("".join(
            sel.css('em#coll_count::text').extract()).replace(',', ''))

        #item['commentNum'] = "".join(sel.css('em.commen-ft-ts::text').extract()).strip()
        item['likeNum'] = int("".join(
            sel.css('strong#redcount::text').extract()).strip())
        item['caiNum'] = int("".join(
            sel.css('ul.works-vote-list>li:nth-of-type(2)>strong::text').
            extract()).strip())
        item['webName'] = "腾讯漫画"
        kid = response.url.split('/')[6]
        commentUrl = "http://ac.qq.com/Community/topicList?targetId=" + kid + "&page=1"
        request = scrapy.Request(commentUrl, callback=self.moreparse)
        request.meta['item'] = item
        return request
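Examples #1 and #2 both hand the partially filled item to a moreparse callback through request.meta instead of returning it directly. That callback is not shown on this page; assuming the comment API answers with JSON that carries a total count (the 'total' key below is hypothetical), it would follow this shape:

    import json

    def moreparse(self, response):
        # Recover the item attached to the request in parse().
        item = response.meta['item']
        data = json.loads(response.text)
        # 'total' is a hypothetical key; the real field depends on each site's API.
        item['commentNum'] = int(data.get('total', -1))
        return item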
Example #3
 def parse(self, response):
     '''
     cmd = 'phantomjs constructDom.js "%s"' % response.url
     stdout,stderr = subprocess.Popen(cmd,shell= True,stdout = subprocess.PIPE,stderr = subprocess.PIPE).communicate()
     f = file('code.txt', 'w+')
     f.writelines(stdout)
     #print (stdout)
     sel = Selector(text=stdout)
     '''
     sel = Selector(response)
     csrfToken = sel.css("input#j-csrf::attr(value)").extract()[0].strip()
     name = "".join(sel.css('h1.m-source-title::text').extract()).strip()
     bookId = response.url.split("/")[-1]
     item = CartoonItem()
     item['name'] = "".join(sel.css('h1.m-source-title::text').extract()).strip()
     item['url'] = response.url
     item['hitNum'] = "".join(sel.css('div.g-cols--float>div.g-col:nth-of-type(1)>div.metadata:nth-of-type(2)::text').re(u'人气\:(.*)')).strip()
     searchObj = re.search(u'(.*)万', item['hitNum'])
     if searchObj:
         item['hitNum'] = int(float(searchObj.group(1)) * 10000)
     else:
         item['hitNum'] = int(item['hitNum'])
     item['collectionNum'] = -1
     item['likeNum'] = -1
     item['caiNum'] = -1
     item['webName'] = "网易漫画"
     item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
     commentApiUrl = "http://manhua.163.com/comment/" + bookId + "/comments?csrfToken=" + csrfToken + "&bookId=" + bookId + "&page=1"
     request = scrapy.Request(commentApiUrl, callback=self.moreparse)
     request.meta['item'] = item
     return request
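Examples #1, #3, and #4 normalize view counts written with the 万 (ten-thousand) suffix using the same four lines. The shared pattern, factored into a helper exactly as it appears in the spiders:

    import re

    def parse_hit_num(text):
        # Convert a count such as '3.5万' (wan = 10,000) to an int;
        # fall back to plain int() when there is no suffix.
        searchObj = re.search(u'(.*)万', text)
        if searchObj:
            return int(float(searchObj.group(1)) * 10000)
        return int(text)

Each spider's block would then reduce to item['hitNum'] = parse_hit_num(item['hitNum']).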
Example #4
 def parse(self, response):
     #cmd = 'phantomjs constructDom.js "%s"' % response.url
     #stdout,stderr = subprocess.Popen(cmd,shell= True,stdout = subprocess.PIPE,stderr = subprocess.PIPE).communicate()
     #f = file('code.txt', 'w+')
     #f.writelines(stdout)
     #print (stdout)
     #sel = Selector(text=stdout)
     sel = Selector(response)
     item = CartoonItem()
     item['name'] = "".join(sel.css('ul.synopsises_font>li:nth-of-type(2)>span:nth-of-type(1)::text').extract()).strip()
     
     item['url'] = response.url
     item['hitNum'] = "".join(sel.css('ul.synopsises_font>li:nth-of-type(2)>span:nth-last-of-type(1)::text').extract()).strip()
     searchObj = re.search(u'(.*)万', item['hitNum'])
     if searchObj:
         item['hitNum'] = int(float(searchObj.group(1)) * 10000)
     else:
         item['hitNum'] = int(item['hitNum'])
     item['collectionNum'] = int("".join(sel.css('a#Mark2Pocket small::text').extract()).strip())
     item['commentNum'] = int(sel.css('div.wrap_left div.content_left2>span:nth-of-type(1)>span>a::text').re(u'全部(\d*)')[0])
     item['likeNum'] = int("".join(sel.css('a#DoLike small::text').extract()).strip())
     item['caiNum'] = -1
     item['webName'] = "sf互动传媒"
     item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
     
     return item
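Every spider stamps crawlTime with time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())). The inner calls are redundant: when the time tuple is omitted, strftime formats the current local time anyway, so this shorter line is equivalent:

    import time

    # Same result as time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    crawl_time = time.strftime('%Y-%m-%d %H:%M:%S')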
Example #5
    def parse(self, response):
        # list of chapter link URLs
        urls = response.xpath(".//tr//dl[@id='comiclistn']/dd/a[2]/@href").extract()
        # list of chapter names
        dir_names = response.xpath(".//tr//dl[@id='comiclistn']/dd/a[1]/text()").extract()

        # save each chapter's link and name, then send a request, passing the item along
        for index in range(len(urls)):
            item = CartoonItem()
            item['link_url'] = urls[index]
            item['dir_name'] = dir_names[index]
            yield scrapy.Request(url=item['link_url'], meta={'item': item}, callback=self.parsecartoon)
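Indexing with range(len(urls)) works, but zip pairs the two extracted lists directly; this variant is behavior-equivalent:

    for link_url, dir_name in zip(urls, dir_names):
        item = CartoonItem()
        item['link_url'] = link_url
        item['dir_name'] = dir_name
        yield scrapy.Request(url=link_url, meta={'item': item},
                             callback=self.parsecartoon)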
Example #6
 def parse1(self, response):
     hxs = Selector(response)
     items = []
     urls = hxs.xpath('//dd/a[1]/@href').extract()  # chapter link URLs
     dir_names = hxs.xpath('//dd/a[1]/text()').extract()  # chapter names
     for index in range(len(urls)):  # save chapter links and names
         item = CartoonItem()
         item['link_url'] = self.server_link + urls[index]
         item['dir_name'] = dir_names[index]
         items.append(item)
     # send a Request for each chapter's link, passing the item along
     for item in items[-13:-1]:  # only a recent slice of chapters; see the note below
         yield scrapy.Request(url=item['link_url'],
                              meta={'item': item},
                              callback=self.parse2)
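The items[-13:-1] slice deliberately limits the crawl to a recent window: twelve chapters, starting thirteen from the end and excluding the very last one. A quick check of the slice semantics:

    chapters = list(range(20))
    print(chapters[-13:-1])
    # [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] -- 12 items, 19 excluded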
Example #7
 def parse(self, response):
     '''
     cmd = 'phantomjs constructDom.js "%s"' % response.url
     stdout,stderr = subprocess.Popen(cmd,shell= True,stdout = subprocess.PIPE,stderr = subprocess.PIPE).communicate()
     #f = file('code.txt', 'w+')
     #f.writelines(stdout)
     #print (stdout)
     sel = Selector(text=stdout)
     '''
     item = CartoonItem()
     item['url'] = response.url
     # use response.text (str), not response.body (bytes), so the patterns match
     item['name'] = re.search(r'comic_name\s*=\s*\'(.*)\'', response.text).group(1)
     typeId = re.search(r'obj_id\s*=\s*"(\d*)', response.text).group(1)
     infoApiUrl = "http://i.dmzj.com/ajax/ding?callback=json&typeid=" + typeId
     request = scrapy.Request(infoApiUrl, callback=self.moreparse)
     request.meta['typeId'] = typeId
     request.meta['item'] = item
     return request
Example #8
 def parse(self, response):
     '''
     cmd = 'phantomjs constructDom.js "%s"' % response.url
     stdout,stderr = subprocess.Popen(cmd,shell= True,stdout = subprocess.PIPE,stderr = subprocess.PIPE).communicate()
     f = file('code.txt', 'w+')
     f.writelines(stdout)
     #print (stdout)
     sel = Selector(text=stdout)
     '''
     sel = Selector(response)
     item = CartoonItem()
     item['name'] = "".join(
         sel.css('div.weizhi::text').re(u'>>(.*)')).strip()
     item['url'] = response.url
     item['likeNum'] = -1
     item['caiNum'] = -1
     item['webName'] = "捧秀漫画"
     kid = response.url.split('/')[4]
     commentApiUrl = "http://www.pengxiu.com/comment.do?doing=comment_web_ajaxlook2&kind=book&kid=" + kid
     request = scrapy.Request(commentApiUrl, callback=self.moreparse)
     request.meta['item'] = item
     request.meta['kid'] = kid
     return request
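Each example above is the parse() entry point of a Scrapy spider class. A skeleton showing where parse() and its moreparse() callback would sit (the imports reflect what the snippets use; the class name, spider name, and project layout are assumptions):

    import re
    import time

    import scrapy
    from scrapy.selector import Selector

    from ..items import CartoonItem  # assumed project layout


    class ComicSpider(scrapy.Spider):  # hypothetical class and spider name
        name = "comic"
        start_urls = []  # comic detail-page URLs go here

        def parse(self, response):
            ...  # one of the parse() bodies above

        def moreparse(self, response):
            item = response.meta['item']
            ...  # fill the comment count from the API response
            return item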