コード例 #1
0
 def parse(self, response):
     """Parse one Douban Top-250 list page.

     Yields a ``MovieItem`` per ``div.item`` entry, then follows the
     "next page" link (if present) back into this callback.
     """
     # The original built an unused ``items`` list; every item is
     # yielded directly, so the accumulator is dropped.
     for info in response.xpath('//div[@class="item"]'):
         item = MovieItem()
         item['rank'] = info.xpath(
             'div[@class="pic"]/em/text()').extract_first()
         item['title'] = info.xpath(
             'div[@class="info"]/div[@class="hd"]/a/span/text()'
         ).extract_first()
         item['star'] = info.xpath(
             'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract_first()
         item['rate'] = info.xpath(
             'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()'
         ).extract_first()
         item['quote'] = info.xpath(
             'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
         ).extract_first()
         yield item
     # Pagination: the "next" href is relative, so join it to the
     # current URL before requesting it.
     next_page = response.xpath('//span[@class="next"]/a/@href')
     if next_page:
         url = response.urljoin(next_page[0].extract())
         yield scrapy.Request(url, self.parse)
コード例 #2
0
    def parse(self, response):
        """Parse a Top-250 list page into MovieItems and follow pagination.

        Missing name/description nodes no longer raise ``AttributeError``:
        ``extract_first`` falls back to an empty string before stripping.
        """
        movies = response.xpath(
            '//*[@id="content"]//div[@class="article"]//ol[@class="grid_view"]/li'
        )
        for movie in movies:
            movie_item = MovieItem()
            movie_item["name"] = movie.xpath(
                './/div[@class="hd"]/a/span[@class="title"]/text()'
            ).extract_first(default='').strip()
            # Strip surrounding quotes and whitespace from the blurb.
            movie_item["description"] = movie.xpath(
                './/div[@class="bd"]/p/text()').extract_first(
                    default='').strip(u'"').strip()
            # print() works on Python 2 and 3; the bare ``print x``
            # statement form is a SyntaxError on Python 3.
            print(movie_item["description"])
            yield movie_item

        # Go to next page.
        next_link = response.xpath(
            '//*[@id="content"]//div[@class="article"]//span[@class="next"]/link/@href'
        ).extract()
        if next_link:
            yield scrapy.Request(TopmoviesSpider.start_urls[0] + next_link[0],
                                 callback=self.parse,
                                 dont_filter=True)
コード例 #3
0
 def parse(self, response):
     """Parse one page of Douban's tag-search JSON API.

     When the API returns no more data, the tag pair parsed from the URL
     is recorded in Redis as exhausted; otherwise items are yielded and
     the next ``start=`` page is requested.
     """
     s = response.url
     movie_jsons = json.loads(response.text)
     if 'data' not in movie_jsons or len(movie_jsons['data']) == 0:
         ks = re.findall('tags=(.*),(.*)&start', s)[0]
         self.rm.rdc.sadd(RedisKeyEnum.over_movie_tag_set.value, ks)
         return
     movie_list = movie_jsons['data']
     for movie in movie_list[0:1]:
         movie_item = MovieItem()
         # Use equality, not ``is``: identity comparison against a
         # string literal is implementation-dependent and was a latent
         # bug here.
         if 'rate' not in movie or movie['rate'] == '':
             movie_item['score'] = -1
         else:
             movie_item['score'] = movie['rate']
         movie_item['url'] = movie['url']
         movie_item['title'] = movie['title']
         movie_item['directors'] = ','.join(movie['directors'])
         movie_item['actors'] = ','.join(movie['casts'])
         movie_item['cover_url'] = movie['cover']
         movie_item['douban_movie_id'] = movie['id']
         yield movie_item
     # Bump the ``start=`` counter in place to request the next page.
     url = response.url
     page_no = int(re.findall(r'start=(\d+)', url)[0])
     url = '{}{}'.format(url[:url.index('start=') + 6], page_no + 1)
     yield scrapy.Request(url=url, callback=self.parse)
コード例 #4
0
ファイル: doubanSpider.py プロジェクト: LuVsLu/douban-movie
    def parse_json(self, response):
        """Parse a JSON listing response and yield one ``MovieItem`` per
        subject, then schedule a request for each subject's comments page
        (handled by ``parse_item``).
        """
        # Decode the JSON string carried in the response body.
        json_str = response.body
        jsonDict = json.loads(json_str)
        # Stop iterating if no JSON data came back.
        if jsonDict is None:
            return

        for subject in jsonDict["subjects"]:
            item = MovieItem()
            item['title'] = subject['title']
            item['url'] = subject['url']
            item['rate'] = subject['rate']
            item['cover_x'] = subject['cover_x']
            item['is_beetle_subject'] = subject['is_beetle_subject']
            item['playable'] = subject['playable']
            item['cover'] = subject['cover']
            item['id'] = subject['id']
            item['cover_y'] = subject['cover_y']
            item['is_new'] = subject['is_new']
            yield item

            # "status=P" appears to select published comments — confirm
            # against the Douban comments endpoint.
            url = item['url'] + "comments?status=P"
            yield scrapy.Request(
                url,
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                callback=self.parse_item)
コード例 #5
0
 def parse_movie_item(self, response):
     """Extract name, summary and score from a movie detail page.

     :returns: a populated ``MovieItem``.
     """
     item = MovieItem()
     item['url'] = response.url
     # "content" must be a quoted literal: the original ``@id=content``
     # compared @id against a child *element* named "content" and so
     # never matched anything.
     item['name'] = response.xpath(
         '//div[@id="content"]/h1/span/text()').extract_first()
     item['summary'] = response.xpath(
         '//span[@property="v:summary"]/text()').extract_first()
     item['score'] = response.xpath(
         '//strong[contains(@class, "rating_num")]/text()').extract_first()
     return item
コード例 #6
0
 def parse(self, response):
     """Yield a ``MovieItem`` per list entry, then crawl all paging links."""
     li_list = response.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
     for li in li_list:
         item = MovieItem()
         item['title'] = li.xpath('div/div[2]/div[1]/a/span[1]/text()').extract_first()
         item['score'] = li.xpath('div/div[2]/div[2]/div/span[2]/text()').extract_first()
         item['motto'] = li.xpath('div/div[2]/div[2]/p[2]/span/text()').extract_first()
         yield item
     # Raw string: '\?' in a plain string is an invalid escape sequence
     # (DeprecationWarning, SyntaxError in future Python versions).
     href_list = response.css('a[href]::attr("href")').re(r'\?start=.*')
     for href in href_list:
         url = response.urljoin(href)
         yield scrapy.Request(url=url, callback=self.parse)
コード例 #7
0
ファイル: movie.py プロジェクト: slg9892/scrapy-douban
    def parse_item(self, response):
        """Extract title, description, score and poster URLs from a movie
        detail page and yield the resulting ``MovieItem``.
        """
        item = MovieItem()
        item['title'] = response.xpath(
            "//div[@id='content']/h1/span[1]/text()").extract()[0]
        item['url'] = response.url
        # ``extract_first`` with an explicit fallback replaces the bare
        # ``except:`` blocks, which silently swallowed *every* error.
        desc = response.xpath(
            "//div[@id='link-report']/span/text()").extract_first()
        item['desc'] = desc.strip() if desc else ''
        item['score'] = response.xpath(
            "//strong[@class='ll rating_num']/text()").extract_first(default=0)
        item['image_urls'] = response.xpath(
            "//div[@id='mainpic']/a[@class='nbgnbg']/img/@src").extract()

        # print() form is required on Python 3; ``print x, y`` is a
        # SyntaxError there.
        print(item['title'], item['score'], item['url'], item['desc'])
        yield item
コード例 #8
0
 def parse(self, response):
     """Build and yield a ``MovieItem`` for every list entry on the page."""
     # XPaths for the plain-text fields, keyed by item field name.
     field_paths = {
         'title': 'div[@class="info"]/div[@class="hd"]/a/span/text()',
         'star': 'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()',
         'rate': 'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()',
         'quote': 'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()',
     }
     for entry in response.xpath('//div[@class="item"]'):
         movie = MovieItem()
         # Rank is numeric on the page; store it as an int.
         movie['rank'] = int(
             entry.xpath('div[@class="pic"]/em/text()').extract_first())
         for field, path in field_paths.items():
             movie[field] = entry.xpath(path).extract_first()
         movie['detail'] = {}
         yield movie
コード例 #9
0
    def parse(self, response):
        """Parse a movie detail page into a ``MovieItem``.

        Extracts the movie id from the URL, name/year from the header,
        the rating block, and whitespace-cleaned info text.
        """
        item = MovieItem()
        movie_url = response.url
        # One compiled pattern, reused below. Raw string keeps it as
        # \d+(\.\d+)? — the original's doubled backslashes inside an
        # r'' literal made the optional group match a literal backslash,
        # so decimals could never match.
        number_pattern = re.compile(r'\d+(\.\d+)?')
        item['movie_id'] = number_pattern.search(movie_url).group()
        item['movie_name'] = response.xpath(
            '//div[@id = "content"]/h1/span/text()').extract_first()
        # The release year renders as "(2021)"; grab the text between
        # the parentheses. ``[^)]`` (not ``[^}]``) stops at the first
        # closing parenthesis.
        year = response.xpath(
            '//div[@id = "content"]/h1/span/text()').extract()[1]
        year_pattern = re.compile(r'(?<=\()[^)]*(?=\))')
        item['movie_year'] = year_pattern.search(year).group()
        # Raw info text contains spaces and newlines; scrub them out.
        movie_info = response.xpath('//div[@id = "info"]//text()').extract()
        item['movie_info'] = ''.join(movie_info).replace(' ',
                                                         '').replace('\n', '')
        item['rating_num'] = response.xpath(
            '//strong[@class="ll rating_num"]/text()').extract_first()
        # The star rating is encoded in the class attribute, e.g.
        # "ll bigstar bigstar40" means 4 stars; pull out the digits.
        rating_star = response.xpath(
            '//div[@class = "rating_right "]/div/@class').extract_first()
        if number_pattern.search(rating_star):
            rating = number_pattern.search(rating_star).group()
            # Float because half stars (e.g. 3.5) are possible.
            item['rating'] = float(rating) / 10
        else:
            item['rating'] = None
        item['rating_sum'] = response.xpath(
            '//div[@class = "rating_sum"]//span/text()').extract_first()
        # Same cleanup as movie_info: drop spaces and newlines.
        rating_info = response.xpath(
            '//div[@class = "ratings-on-weight"]//text()').extract()
        item['rating_info'] = ''.join(rating_info).replace(' ', '').replace(
            '\n', '')

        yield item
コード例 #10
0
    def parse(self, response):
        """Yield a ``MovieItem`` per list entry, then crawl paging links."""
        li_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
        for li in li_list:

            item = MovieItem()
            # Movie title, e.g. "The Shawshank Redemption".
            item['movie'] = li.xpath(
                'div/div[2]/div[1]/a/span[1]/text()').extract_first()
            # Rating, e.g. 9.6.
            item['rate'] = li.xpath(
                'div/div[2]/div[2]/div/span[2]/text()').extract_first()
            # One-line quote, e.g. "Hope sets you free".
            item['motto'] = li.xpath(
                'div/div[2]/div[2]/p[2]/span/text()').extract_first()
            yield item
        # Select every href carrying a "?start=" paging query. Raw
        # string: '\?' in a plain string is an invalid escape sequence.
        href_list = response.css('a[href]::attr("href")').re(r'\?start=.*')
        for href in href_list:
            # Complete the relative link against the current URL.
            url = response.urljoin(href)
            # Re-enter this callback with the new page URL.
            yield scrapy.Request(url=url, callback=self.parse)
コード例 #11
0
ファイル: moives.py プロジェクト: xxiaocheng/douban-spider
    def parse_movie(self, response):
        """Parse a movie-subject API response into a ``MovieItem``.

        Also schedules a celebrity request (``parse_celebrity``) for every
        director and cast member.
        """
        try:
            json_response = json.loads(response.body_as_unicode())
        except ValueError:
            logging.error("Can't parse this response to json ,url:%s !" %
                          response.url)
            # Must bail out: the original fell through and hit a
            # NameError on the unbound ``json_response`` below.
            return
        item = MovieItem()
        attributes = [
            'id', 'title', 'subtype', 'wish_count', 'do_count',
            'collect_count', 'year', 'images', 'seasons_count',
            'episodes_count', 'countries', 'genres', 'current_season',
            'original_title', 'summary', 'comments_count', 'ratings_count',
            'aka', 'rating'
        ]
        for attribute in attributes:
            item[attribute] = json_response.get(attribute, None)
        # Flatten the nested rating object to its average value.
        item['rating'] = json_response['rating']['average']

        # Collect director ids and crawl each director's subject page.
        directors = []
        for t in json_response['directors']:
            directors.append(t['id'])
            celebrity = Subject(t['id'], category='CELEBRITY')
            yield scrapy.Request(url=celebrity.get_subject(),
                                 callback=self.parse_celebrity)
        item['directors'] = directors

        # Collect cast ids and crawl each cast member's subject page.
        casts = []
        for tt in json_response['casts']:
            casts.append(tt['id'])
            celebrity = Subject(tt['id'], category='CELEBRITY')
            yield scrapy.Request(url=celebrity.get_subject(),
                                 callback=self.parse_celebrity)
        item['casts'] = casts
        yield item
コード例 #12
0
 def parse_item(self, response):
     """Scrape the full detail/introduction page of one movie.

     Builds a ``MovieItem`` with credits, release data, ratings and
     related recommendations, then yields it.
     :param response: detail-page response.
     """
     url = response.url
     self.logger.info('Crawl {}'.format(url))
     item = MovieItem()
     item['url'] = url
     item['no'] = response.xpath(
         '//span[@class="top250-no"]/text()').extract_first()
     item['name'] = response.xpath('//h1/span[1]/text()').extract_first()
     intro_div = response.xpath('//div[@class="subject clearfix"]')
     item['main_picture'] = intro_div.xpath(
         './/div[@id="mainpic"]/a/@href').extract_first()
     info_div = intro_div.xpath('.//div[@id="info"]')
     # Directors: {name: absolute profile URL}.
     director_url = info_div.xpath('./span[1]//a/@href').extract()
     director_name = replace_dot(
         info_div.xpath('./span[1]//a/text()').extract())
     director_url = [response.urljoin(url) for url in director_url]
     item['director'] = dict(zip(director_name, director_url))
     # Screenwriters: same shape as directors.
     scriptwriter_url = info_div.xpath('./span[2]//a/@href').extract()
     scriptwriter_name = replace_dot(
         info_div.xpath('./span[2]//a/text()').extract())
     scriptwriter_url = [response.urljoin(url) for url in scriptwriter_url]
     item['scriptwriter'] = dict(zip(scriptwriter_name, scriptwriter_url))
     # Main cast; links with @title are skipped (presumably non-actor
     # links inside the span — confirm against the page markup).
     actor_url = info_div.xpath('./span[3]//a[not(@title)]/@href').extract()
     actor_name = replace_dot(
         info_div.xpath('./span[3]//a[not(@title)]/text()').extract())
     actor_url = [response.urljoin(url) for url in actor_url]
     item['actor'] = dict(zip(actor_name, actor_url))
     # Genres, joined with "/".
     item['plot'] = '/'.join(
         info_div.xpath('.//span[@property="v:genre"]/text()').extract())
     # Country/region of production, language, and (optionally) other
     # names — taken positionally from the bare text nodes of #info.
     texts = info_div.xpath('./text()').extract()
     texts = [t.strip() for t in texts if t.strip() not in ('', ' ', '/')]
     item['made_in'] = texts[0]
     item['language'] = texts[1]
     if len(texts) == 3:
         item['another_names'] = texts[2]
     # Release date(s), joined with "/".
     item['release_date'] = '/'.join(
         info_div.xpath('.//span[@property="v:initialReleaseDate"]/@content'
                        ).extract())
     # Runtime.
     item['runtime'] = info_div.xpath(
         './/span[@property="v:runtime"]/text()').extract_first()
     # IMDb link (the last anchor in #info).
     item['imdb'] = response.urljoin(
         info_div.xpath('.//a[last()]/@href').extract_first())
     # Douban rating: average, vote count and per-star weights.
     average = response.xpath(
         '//strong[@property="v:average"]/text()').extract_first()
     rating_people = {
         response.xpath('//span[@property="v:votes"]/text()').extract_first(
         ):
         response.urljoin('collections')
     }
     star_titles = response.xpath(
         '//div[@class="ratings-on-weight"]//span[@title]/text()').extract(
         )
     star_titles = [t.strip() for t in star_titles if t is not None]
     star_weights = response.xpath(
         '//div[@class="ratings-on-weight"]//span[@class="rating_per"]/text()'
     ).extract()
     item['rating_avg'] = {
         "average": average,
         "rating_people": rating_people,
         "star_weight": dict(zip(star_titles, star_weights))
     }
     # Plot summary: visible part plus the hidden "all" span.
     summary = ''.join(
         response.xpath(
             '//div[@id="link-report"]//span[@property="v:summary"]').xpath(
                 'string(.)').extract()).strip()
     all_info = ''.join(
         response.xpath(
             '//div[@id="link-report"]//span[@class="all hidden"]').xpath(
                 'string(.)').extract()).strip()
     item['related_info'] = summary + ' ' + all_info
     # "People who like this movie also like": {name: URL}.
     recomm_names = replace_dot(
         response.xpath(
             '//div[@id="recommendations"]//dd/a/text()').extract())
     recomm_urls = response.xpath(
         '//div[@id="recommendations"]//dd/a/@href').extract()
     item['recommendations'] = dict(zip(recomm_names, recomm_urls))
     yield item
コード例 #13
0
ファイル: movie.py プロジェクト: hhzh/douban
    def parse(self, response):
        """Parse a movie detail page into a ``MovieItem``, then follow the
        "recommendations" links back into this callback.
        """
        # The numeric subject id is embedded in the mobile-agent meta URL.
        # Renamed from ``id`` to avoid shadowing the builtin.
        mobile_agent = response.xpath(
            '//meta[@name="mobile-agent"]/@content').extract_first("")
        title = response.xpath(
            '//*[@id="content"]/h1/span[1]/text()').extract_first("")
        aka = response.xpath('//div[@id="info"]/text()[5]').extract_first("")
        rating = response.xpath(
            '//strong[contains(@class,"rating_num")]/text()').extract_first("")
        ratings_count = response.xpath(
            '//a[@class="rating_people"]/span/text()').extract_first("")
        wish_count = response.xpath(
            '//div[@class="subject-others-interests-ft"]/a[contains(@href,"/wishes")]/text()'
        ).extract_first("")
        collect_count = response.xpath(
            '//div[@class="subject-others-interests-ft"]/a[contains(@href,"/collections")]/text()'
        ).extract_first("")
        images = response.xpath(
            '//div[@id="mainpic"]/a/img/@src').extract_first("")
        directors = response.xpath(
            '//div[@id="info"]/span/span[@class="attrs"]/a[@rel="v:directedBy"]/text()'
        ).extract()
        casts = response.xpath(
            '//span[@class="actor"]/span[@class="attrs"]/span/a[@rel="v:starring"]/text()'
        ).extract()
        writers = response.xpath(
            '//div[@id="info"]/span[2]/span[@class="attrs"]/a/text()').extract(
            )
        pubdates = response.xpath(
            '//div[@id="info"]/span[@property="v:initialReleaseDate"]/text()'
        ).extract_first("")
        year = response.xpath(
            '//*[@id="content"]/h1/span[2]/text()').extract_first("")
        genres = response.xpath(
            '//div[@id="info"]/span[@property="v:genre"]/text()').extract()
        durations = response.xpath(
            '//div[@id="info"]/span[@property="v:runtime"]/text()'
        ).extract_first("")
        summary = response.xpath(
            '//*[@id="link-report"]/span[@property="v:summary"]/text()'
        ).extract()
        comments_count = response.xpath(
            '//*[@id="comments-section"]/div[@class="mod-hd"]/h2/span[@class="pl"]/a/@href'
        ).extract_first("")
        reviews_count = response.xpath(
            '//*[@id="content"]/div/div/section/header/h2/span/a[@href="reviews"]/text()'
        ).extract_first("")
        trailer_urls = response.xpath(
            '//*[@id="related-pic"]/ul/li[1]/a[@clas="related-pic-video"]/@href'
        ).extract_first("")
        photos = response.xpath(
            '//*[@id="related-pic"]/ul/li[2]/a/img[@alt="图片"]/@src').extract()
        popular_reviews = response.xpath(
            '//*[@id="hot-comments"]/div[1]/div/p/text()').extract_first("")

        # Raw string: '\d' in a plain string is an invalid escape
        # sequence (DeprecationWarning, error in future versions).
        id_match_re = re.match(r'.*com/movie/subject/(\d+)/.*', mobile_agent)
        id_content = 0
        if id_match_re:
            id_content = int(id_match_re.group(1))
        item['id'] = id_content
        item['title'] = title
        item['aka'] = aka
        item['rating'] = rating
        item['ratings_count'] = ratings_count
        item['wish_count'] = wish_count
        item['collect_count'] = collect_count
        item['images'] = images
        item['directors'] = '/'.join(directors)
        item['casts'] = '/'.join(casts)
        item['writers'] = '/'.join(writers)
        item['pubdates'] = pubdates
        item['year'] = year
        item['genres'] = '/'.join(genres)
        item['durations'] = durations
        item['summary'] = '/'.join(summary)
        item['comments_count'] = comments_count
        item['reviews_count'] = reviews_count
        item['trailer_urls'] = trailer_urls
        item['photos'] = '/'.join(photos)
        item['popular_reviews'] = popular_reviews

        yield item

        # Crawl every recommended movie with this same callback.
        next_urls = response.xpath(
            '//div[@id="recommendations"]/div[@class="recommendations-bd"]/dl/dt/a/@href'
        ).extract()
        for next_url in next_urls:
            print('next_url:%s' % next_url)
            yield Request(url=next_url, callback=self.parse)