Beispiel #1
0
 def parse_grade(self, response):
     """Build a ``ReYingMovie`` item from a movie detail page (source "格瓦拉").

     The name, film id, source tag and timestamps are always filled in.
     Release date, score, vote count and the five star-bucket percentages
     are scraped from the page; if any of those lookups fails, all of them
     are set to the placeholder ``"暂无"`` instead.

     Args:
         response: Downloaded detail page. The film id is the first run of
             digits found in ``response.url``.

     Returns:
         The populated ``ReYingMovie`` item.

     Raises:
         IndexError: If ``response.url`` contains no digits.
     """
     placeholder = "暂无"
     star_fields = ("five", "four", "three", "two", "one")

     selector = Selector(response)
     item = ReYingMovie()
     item["name"] = selector.xpath('//h1/text()').extract_first()
     item["createdtime"] = str(datetime.now())
     item["comefrom"] = "格瓦拉"
     item["filmid"] = re.findall(r'\d+', response.url)[0]
     item["crawldate"] = str(datetime.today())
     try:
         # [5:] drops the first five characters of the text node
         # (presumably a fixed label prefix -- confirm against the page).
         item["movieDate"] = selector.xpath(
             '//div[@id="ui_movieInfo_open"]/div/ul/li[@class="first"]/text()'
         ).extract_first()[5:]
         item["Grade"] = selector.xpath(
             '//span[@class="point"]/text()').extract_first()
         item["gradePeople"] = selector.xpath(
             '//span[@class="txt"]/em/text()').extract_first()[2:-1]
         rating = selector.xpath('//span[@class="pect"]/text()').extract()
         # The first five "pect" texts fill five..one in order; [1:-1]
         # strips one wrapping character from each side.
         for index, field in enumerate(star_fields):
             item[field] = rating[index][1:-1]
     except (IndexError, TypeError):
         # IndexError: fewer than five rating entries.
         # TypeError: extract_first() returned None and was then sliced.
         # The score is stored under "Grade", the same key the success
         # path fills.
         for field in ("movieDate", "Grade", "gradePeople") + star_fields:
             item[field] = placeholder
     return item
Beispiel #2
0
    def parse_futuregrade(self, response):
        """Yield one ``ReYingMovie`` item per entry of the JSON ``content`` list.

        The response body is decoded as JSON. For every entry the ``name``,
        ``film_id``, ``release_time`` and ``want_number`` values are copied
        into a new item, which is tagged with the source ``"微博"`` and the
        current date/time strings.

        Args:
            response: Response whose ``text`` is a JSON document with a
                ``content`` list.

        Yields:
            A populated ``ReYingMovie`` item for each list entry.
        """
        payload = json.loads(response.text)

        for film in payload["content"]:
            movie = ReYingMovie()
            # Values are listed in the order the fields are stored on the item.
            field_values = (
                ("name", film["name"]),
                ("comefrom", "微博"),
                ("filmid", film["film_id"]),
                ("crawldate", str(datetime.today())),
                ("createdtime", str(datetime.now())),
                ("movieDate", film["release_time"]),
                ("want", film["want_number"]),
            )
            for field, value in field_values:
                movie[field] = value
            yield movie
Beispiel #3
0
    def parse(self, response):
        """Schedule a detail-page request for every movie in the listing.

        The response body embeds a JSON object inside a JavaScript
        assignment of the form ``var result_<digits> = {...};var``. Movie
        ids are read from the ``hotplayRatingList`` and
        ``upcomingTicketList`` arrays under ``value`` and substituted into
        ``self.detail_url``.

        Args:
            response: Listing response containing the JavaScript payload.

        Yields:
            scrapy.Request: One request per movie id, with
            ``self.parse_grade`` as callback.

        Raises:
            IndexError: If the JavaScript assignment is not found in the body.
        """
        # Raw string: "\d" inside a plain literal is an invalid escape sequence.
        jsonstr = re.findall(r'var result_\d+ = (.*);var', response.text)[0]
        # Decode the payload once and reuse it for both lists.
        value = json.loads(jsonstr)['value']

        for list_key in ('hotplayRatingList', 'upcomingTicketList'):
            for movie in value[list_key]:
                url = self.detail_url.format(str(movie['Id']))
                yield scrapy.Request(url, callback=self.parse_grade)
Beispiel #4
0
 def parse_grade(self, response):
     """Yield one ``ReYingMovie`` item per entry of the JSON ``content`` list.

     Each entry contributes its ``trendinfo`` name, ``release_date``,
     ``want_number``, ``film_id`` and the ``markinfo`` score figures. The
     ``good_rate`` and ``bad_rate`` values are rendered as whole-number
     percentages. Items are tagged with the source ``"微博"`` and the
     current date/time strings.

     Args:
         response: Response whose ``text`` is a JSON document with a
             ``content`` list.

     Yields:
         A populated ``ReYingMovie`` item for each list entry.
     """
     payload = json.loads(response.text)
     for film in payload["content"]:
         movie = ReYingMovie()
         movie["name"] = film["trendinfo"]["name"]
         movie["createdtime"] = str(datetime.now())
         movie["movieDate"] = film["release_date"]
         movie["want"] = film["want_number"]
         movie["comefrom"] = "微博"
         movie["filmid"] = film["film_id"]
         movie["crawldate"] = str(datetime.today())

         marks = film["markinfo"]
         movie["Grade"] = marks["score"]
         movie["gradePeople"] = marks["score_count"]
         # '.0%' multiplies by 100 and appends '%' with no decimals.
         movie["good"] = "{:.0%}".format(marks["good_rate"])
         movie["bad"] = "{:.0%}".format(marks["bad_rate"])
         yield movie
Beispiel #5
0
 def parse_grade(self, response):
     """Yield up to ten ``ReYingMovie`` items from a listing response (source "糯米").

     The body contains a JSON object whose ``html`` member is an HTML
     fragment. Titles, scores, release-date texts and ids are read from
     that fragment and paired by position.

     A fragment with fewer than ten entries yields the entries it has
     instead of raising ``IndexError`` part-way through.

     Args:
         response: Response whose ``text`` contains the JSON object.

     Yields:
         A populated ``ReYingMovie`` item per listed movie, at most ten.

     Raises:
         IndexError: If no ``{...}`` object is found in the body, or an
             id attribute contains no digits.
     """
     max_items = 10

     get_data = re.findall(r'({.*})', response.text)[0]
     data = json.loads(get_data)["html"]
     page_source = etree.HTML(data)
     titles = page_source.xpath('//a[@class="movie-pic"]/img/@alt')
     grades = page_source.xpath('//span[@class="num nuomi-red"]/text()')
     # NOTE(review): the leading '///' looks like a typo for '//'; it is
     # kept unchanged here -- confirm before altering the expression.
     moviedates = page_source.xpath('///ul[@class="info"]/li[3]/text()')
     movieids = page_source.xpath('//a[@class="movie-pic"]/@data-data')

     # zip stops at the shortest column, so a short page cannot index past
     # the end of any list; the slice keeps the ten-item cap.
     rows = list(zip(titles, grades, moviedates, movieids))[:max_items]
     for title, grade, moviedate, movieid in rows:
         item = ReYingMovie()
         item["name"] = title.strip()
         item["comefrom"] = "糯米"
         # [5:] drops the first five characters of the stripped text
         # (presumably a label prefix -- confirm against the page).
         item["movieDate"] = moviedate.strip()[5:]
         item["Grade"] = grade.strip()
         item["createdtime"] = str(datetime.now())
         item["filmid"] = re.findall(r"(\d+)", movieid)[0]
         item["crawldate"] = str(datetime.today())
         yield item
Beispiel #6
0
 def parse_grade(self, response):
     """Build a ``ReYingMovie`` item from a movie page (source "豆瓣").

     Name, share id, release date, score and vote count are stored as
     extracted (``None`` when the node is absent). The ``rating_per``
     texts fill the ``five`` .. ``one`` fields in order; when fewer than
     five are present only the leading fields are set. The "want" field
     is set only when its link text exists.

     Args:
         response: Downloaded movie page.

     Returns:
         The populated ``ReYingMovie`` item.
     """
     selector = Selector(response)
     item = ReYingMovie()
     item["name"] = selector.xpath(
         '//span[@property="v:itemreviewed"]/text()').extract_first()
     item["createdtime"] = str(datetime.now())
     item["comefrom"] = "豆瓣"
     item["filmid"] = selector.xpath(
         '//span[@class="rec"]/a/@share-id').extract_first()
     item["crawldate"] = str(datetime.today())
     item["movieDate"] = selector.xpath(
         '//span[@property="v:initialReleaseDate"]/text()'
     ).extract_first()
     item["Grade"] = selector.xpath(
         '//strong[@property]/text()').extract_first()
     item["gradePeople"] = selector.xpath(
         '//span[@property="v:votes"]/text()').extract_first()

     rating = selector.xpath(
         '//span[@class="rating_per"]/text()').extract()
     # zip stops at the shorter sequence: an empty or partial rating list
     # fills only the buckets it has, with no IndexError to recover from.
     for field, share in zip(("five", "four", "three", "two", "one"), rating):
         item[field] = share

     want_text = selector.xpath(
         '//div[@class="subject-others-interests-ft"]/a[2]/text()'
     ).extract_first()
     # Slicing None would raise TypeError; leave the field unset when the
     # link is missing. [:-3] drops the last three characters (presumably
     # a text suffix after the number -- confirm against the page).
     if want_text is not None:
         item["want"] = want_text[:-3]
     return item
Beispiel #7
0
 def parse_grade(self, response):
     """Build a ``ReYingMovie`` item from a rating response (source "时光").

     The first ``{...}`` span in the body is decoded as JSON. The title
     comes from ``value.movieTitle`` and every other figure from
     ``value.movieRating``. A negative ``RatingFinal`` is stored as
     ``None``.

     Args:
         response: Response whose ``text`` contains the JSON object.

     Returns:
         The populated ``ReYingMovie`` item.

     Raises:
         IndexError: If no ``{...}`` span is found in the body.
     """
     raw_json = re.findall(r'({.*})', response.text)[0]
     payload = json.loads(raw_json)

     movie = ReYingMovie()
     movie["createdtime"] = str(datetime.now())
     movie["comefrom"] = "时光"
     movie['name'] = payload['value']['movieTitle']

     rating = payload['value']['movieRating']
     final_score = rating["RatingFinal"]
     # A negative final rating is recorded as "no grade".
     movie["Grade"] = None if final_score < 0 else final_score
     movie["gradePeople"] = rating["Usercount"]
     movie["want"] = rating["AttitudeCount"]
     movie["music"] = rating["ROtherFinal"]
     movie["frames"] = rating["RPictureFinal"]
     movie["story"] = rating["RStoryFinal"]
     movie["director"] = rating["RDirectorFinal"]
     movie["filmid"] = rating['MovieId']
     movie["crawldate"] = str(datetime.today())
     return movie
Beispiel #8
0
    def parse_grade(self, response):
        """Build a ``ReYingMovie`` item from a movie detail page (source "猫眼").

        The ``.woff`` font referenced in the page's ``<style>`` text is
        downloaded, base64-encoded and passed to ``extract_fonts``; the
        resulting mapping is handed to ``self.decode_value`` together with
        each obfuscated number taken from the page.

        When the vote-count node is present, the score, vote count and box
        office (decoded number plus the text of the following span) are
        recorded. Otherwise the number in the score position is stored in
        the ``want`` field.

        Args:
            response: Downloaded detail page. The film id is the first run
                of digits found in ``response.url``.

        Returns:
            The populated ``ReYingMovie`` item.

        Raises:
            IndexError: If no ``.woff`` URL is found in the style text or
                ``response.url`` contains no digits.
            requests.RequestException: If the font download fails or
                times out.
        """
        item = ReYingMovie()
        woff = response.xpath("//style/text()").extract_first()
        woffurl = 'http://' + re.findall(r'url\(\'//(.*?.woff)', woff)[0]
        # Bounded wait: this is a blocking download inside a crawler
        # callback, so a stalled font server must not hang it forever.
        woffdata = requests.get(woffurl, timeout=10).content
        fontMapping = extract_fonts(base64.b64encode(woffdata))

        item["name"] = response.xpath('//h3[@class="name"]/text()').extract_first()
        item["createdtime"] = str(datetime.now())
        # [:-4] drops the last four characters of the text (presumably a
        # trailing label after the date -- confirm against the page).
        item["movieDate"] = response.xpath(
            '//li[@class="ellipsis"][3]/text()').extract_first()[:-4]
        item["comefrom"] = "猫眼"
        item["filmid"] = re.findall(r'\d+', response.url)[0]
        item["crawldate"] = str(datetime.today())

        # The same node holds the score when votes exist and the "want"
        # number otherwise, so it is read once for both branches.
        headline = response.xpath(
            '//div[@class="movie-index-content score normal-score"]/span/span/text()'
        ).extract_first()
        people = response.xpath(
            '//div[@class="movie-index-content score normal-score"]/div/span/span/text()'
        ).extract_first()

        if people is not None:
            item["Grade"] = self.decode_value(fontMapping, headline)
            item["gradePeople"] = self.decode_value(fontMapping, people)
            piaofang = response.xpath(
                '//div[@class="movie-index"][2]/div/span/text()').extract_first()
            unit = response.xpath(
                '//div[@class="movie-index"][2]/div/span[2]/text()').extract_first()
            item['piaofang'] = self.decode_value(fontMapping, piaofang) + unit
            return item

        item["want"] = self.decode_value(fontMapping, headline)
        return item
0
    def parse_futuregrade(self, response):
        """Build a ``ReYingMovie`` item for a movie page (source "格瓦拉").

        The release-date text is looked up at three locations in turn and
        the first one that exists is used. Both the release date and the
        focus count fall back to the placeholder ``"暂无"`` when their
        node is missing, instead of failing on a ``None`` slice.

        Args:
            response: Downloaded movie page. The film id is the first run
                of digits found in ``response.url``.

        Returns:
            The populated ``ReYingMovie`` item.

        Raises:
            IndexError: If ``response.url`` contains no digits.
        """
        placeholder = "暂无"
        # Candidate locations of the release-date text, in priority order.
        date_xpaths = (
            '//div[@id="ui_movieInfo_open"]/div/ul/li[@class="first"]/text()',
            '//div[@class="toggleInfo clear"]/span[2]/text()',
            '//div[@class="toggleInfo clear"]/span[3]/text()',
        )

        item = ReYingMovie()
        item["name"] = response.xpath('//h1/text()').extract_first()
        item["createdtime"] = str(datetime.now())
        item["comefrom"] = "格瓦拉"
        item["filmid"] = re.findall(r'\d+', response.url)[0]
        item["crawldate"] = str(datetime.today())

        movieDate = None
        for date_xpath in date_xpaths:
            movieDate = response.xpath(date_xpath).extract_first()
            if movieDate is not None:
                break
        # [5:] drops the first five characters of the text (presumably a
        # label prefix -- confirm against the page).
        item["movieDate"] = (
            movieDate[5:] if movieDate is not None else placeholder)

        want_text = response.xpath(
            '//span[@class="focusCount"]/text()').extract_first()
        # [:-2] drops the last two characters (presumably a suffix after
        # the number -- confirm against the page).
        item["want"] = (
            want_text[:-2] if want_text is not None else placeholder)

        return item