Example #1
 def parse_detail(self, response):
     type_name = None
     for k,v in self.movie_category.items():
         if str(v) == re.search('type=(.*?)&', unquote(response.url)).group(1):
             type_name = k
     datas = json.loads(response.text)
     for data in datas:
         title = data.get('title')
         rank = data.get('rank')
         url = data.get('url')
         actors = data.get('actors')
         cover_url = data.get('cover_url')
         regions = data.get('regions')
         release_date = data.get('release_date')
         score = data.get('score')
         types = data.get('types')
         vote_count = data.get('vote_count')
         douban_item = DoubanMovieItem()
         for field in douban_item.fields:
             try:
                 # each item field is expected to match a local variable of the same name
                 douban_item[field] = eval(field)
             except NameError:
                 print('Field is Not Defined', field)
         yield douban_item
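
Note: every example on this page assumes a DoubanMovieItem declared in the project's items.py, which none of the snippets show. A minimal sketch that would satisfy Example #1 (the field names are taken from the assignments above; real projects may declare more or different fields) might look like this:

    import scrapy

    class DoubanMovieItem(scrapy.Item):
        # one scrapy.Field() per attribute the spider assigns
        type_name = scrapy.Field()
        title = scrapy.Field()
        rank = scrapy.Field()
        url = scrapy.Field()
        actors = scrapy.Field()
        cover_url = scrapy.Field()
        regions = scrapy.Field()
        release_date = scrapy.Field()
        score = scrapy.Field()
        types = scrapy.Field()
        vote_count = scrapy.Field()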
Example #2
 def parse(self, response):
     item = DoubanMovieItem()
     # each product <li> under the #desktop-4 container carries a data-sgproduct attribute
     movies = response.xpath('//*[@id="desktop-4"]//li')
     print('found %d products' % len(movies))
     for movie in movies:
         print(movie.xpath('.//p/text()').extract_first())
         # names = movie.xpath('.//p/text()').extract()[0]
         # item['name'] = names
         # item['score'] = movie['score']
         # item['url'] = movie['url']
         # yield item
Example #3
    def parse_item(self, response):
        sel = Selector(response)
        item = DoubanMovieItem()
        title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        director = sel.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        actor = sel.xpath('//*[@id="info"]/span[3]/span[2]/a/text()').extract()
        #release_time = sel.xpath('//*[@id="info"]/span[11]/text()').extract()
        #time = sel.xpath('//*[@id="info"]/span[13]/text()').extract()
        star = sel.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()

        item['title'] = title
        item['director'] = director
        item['actor'] = actor
        #item['release_time'] = release_time
        #item['time'] = time
        item['star'] = star

        yield item

        print(title)
        print(director)
        print(actor)
        # print(release_time)
        #print(time)
        print(star)
Example #4
    def parse(self, response):
        time.sleep(random.randint(2, 5))
        global START
        item = DoubanMovieItem()
        # https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=20
        # while True:
        pat_title = r'title":"(.*?)"'
        pat_rate = r'rate":"(.*?)"'
        pat_url = r'url":"(.*?)"'
        body = response.body.decode('utf-8')
        item['title'] = re.compile(pat_title).findall(body)
        item['rate'] = re.compile(pat_rate).findall(body)
        item['url'] = re.compile(pat_url).findall(body)
        print(item['title'])
        if not item['title']:
            # no more results: stop paginating instead of calling exit()
            return
        print('return item success')
        yield item
        print('yield item success')

        next_url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='
        START = START + 20
        print('next url :' + next_url + str(START))
        yield Request(
            next_url + str(START),
            callback=self.parse,
            headers={
                'User-Agent':
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
            },
            encoding='utf-8')
        print('yield request success')
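
Example #4 tracks the paging offset in a module-level global START. A hedged alternative sketch (same endpoint and callback as above; cb_kwargs is a Scrapy 1.7+ feature and is not used in the original) keeps the offset local to each request instead:

    def parse(self, response, start=0):
        # ... extract title/rate/url as above ...
        next_url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='
        yield Request(next_url + str(start + 20),
                      callback=self.parse,
                      cb_kwargs={'start': start + 20})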
Example #5
    def next_parse(self, response):
        item2 = DoubanMovieItem()
        print(item2)
        item2['_name'] = response.xpath(
            '//div[@id="content"]/h1/span[@property="v:itemreviewed"]/text()'
        ).getall()
        item2['_year'] = response.xpath(
            '//div[@id="content"]/h1/span[@class="year"]/text()').getall()
        item2['_pic'] = response.xpath(
            '//div[@id="mainpic"]/a/img/@src').getall()
        print(item2)

        selector_next = response.xpath('//div[@id="info"]')
        item2['_director'] = selector_next.xpath(
            '//span[@class="attrs"]/a/text()').getall()[:2]
        item2['_writer'] = selector_next.xpath(
            '//span[@class="attrs"]/a/text()').getall()[2:]
        item2['_cast'] = selector_next.xpath(
            '//span[@class="actor"]/span[@class="attrs"]/span/a/text()'
        ).getall()
        item2['_type'] = selector_next.xpath(
            'span[@property="v:genre"]/text()').getall()
        item2['_country'] = selector_next.xpath('text()').getall()[7]
        item2['_language'] = selector_next.xpath('text()').getall()[9]
        item2['_premiere'] = selector_next.xpath(
            'span[@property="v:initialReleaseDate"]/text()').getall()
        item2['_episode'] = selector_next.xpath('text()').getall()[13]
        item2['_runningtime'] = selector_next.xpath('text()').getall()[15]

        item2['_plot'] = response.xpath(
            '//span[@property="v:summary"]/text()').getall()

        return item2
Example #6
    def parse(self, response):
        selector = Selector(response)
        ol_li = selector.xpath('//div[@class="item"]')

        for li in ol_li:
            movie = DoubanMovieItem()
            movie['_id'] = str(ObjectId())
            movie['rank'] = li.xpath(
                'div[@class="pic"]/em/text()').extract_first()
            movie['link'] = li.xpath(
                'div[@class="pic"]/a/@href').extract_first()
            movie['img'] = li.xpath(
                'div[@class="pic"]/a/img/@src').extract_first()
            movie['title'] = li.xpath(
                'div[@class="pic"]/a/img/@alt').extract_first()
            movie['star'] = li.xpath(
                'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract_first()
            movie['quote'] = li.xpath(
                'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
            ).extract_first()
            yield movie

        next_page = response.xpath('//span[@class="next"]/a/@href')
        if next_page:
            url = 'https://movie.douban.com/top250' + next_page[0].extract()
            yield Request(url=url, callback=self.parse)
Example #7
 def parse_movie(self, response):
     global cookies
     global headers
     # pull the [{...}] block out of the response body
     dict_str = re.search(r'\[\{.*}]', response.text).group()
     # grab every {...} chunk; each one is a dict-formatted string
     temp = re.findall(r'\{.*?}', dict_str)
     # replace the JSON booleans, otherwise eval() fails
     temp1 = [x.replace('false', 'False') for x in temp]
     temp2 = [x.replace('true', 'True') for x in temp1]
     # strip escape characters
     temp3 = [x.replace('\\', '') for x in temp2]
     # eval() each cleaned-up string into a real dict
     dict_list = [eval(x) for x in temp3]
     for x in dict_list:
         item = DoubanMovieItem()
         item['title'] = x['title']
         item['post_urls'] = [
             x['cover'],
         ]
         # pdb.set_trace()
         yield scrapy.Request(
             url=x['url'],
             meta={'item': item},  # pass the item along to the detail-page callback via meta
             cookies=cookies,
             headers=headers,
             callback=self.parse_intro,
             # dont_filter=True,
         )
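
Example #7 massages the payload so eval() accepts it; json.loads already understands true/false and escape sequences, so the same extraction can usually be written without eval. A hedged sketch of that variant (same fields, globals and callback as above, assuming the matched span is valid JSON):

    def parse_movie(self, response):
        # the [{...}] span is a JSON array, so json.loads parses it directly
        dict_str = re.search(r'\[\{.*}\]', response.text).group()
        for x in json.loads(dict_str):
            item = DoubanMovieItem()
            item['title'] = x['title']
            item['post_urls'] = [x['cover']]
            yield scrapy.Request(
                url=x['url'],
                meta={'item': item},
                cookies=cookies,
                headers=headers,
                callback=self.parse_intro,
            )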
Example #8
    def parse_movie(self, response):
        hxs = HtmlXPathSelector(response)
        movie = DoubanMovieItem()
        movie['url_in_douban'] = unicode(response.url)
        movie['name_in_douban'] = "".join(
            hxs.select("//span[@property='v:itemreviewed']/text()").extract()
        ).strip()
        movie['year'] = ("".join(
            hxs.select("//span[@class='year']/text()").extract()).strip()
                         )[1:-1]
        movie['length'] = "".join(
            hxs.select(
                "//span[@property='v:runtime']/@content").extract()).strip()
        movie['url_in_imdb'] = "".join(
            hxs.select("//div[@id='info']/a[last()]/@href").extract()).strip()
        movie['score'] = "".join(
            hxs.select(
                "//strong[@class='ll rating_num'][@property='v:average']/text()"
            ).extract()).strip()
        movie['scored_num'] = "".join(
            hxs.select(
                "//span[@property='v:votes']/text()").extract()).strip()

        tag_names = hxs.select("//div[@class='tags-body']/a/text()").extract()
        tag_times = hxs.select(
            "//div[@class='tags-body']/a/span/text()").extract()
        tag_num = len(tag_names)
        tags = dict()
        for i in range(tag_num):
            tags[tag_names[i]] = tag_times[i][1:-1]
        movie['tags'] = tags
        return [movie]
Example #9
    def parse_movie(self, response):
        print(response.status)
        print(response.xpath('//li/span[@class="rec"]/@id'))
        print(response.xpath('//span[@class="rating_per"]/text()'))
        _setDNSCache()
        movie_item = DoubanMovieItem()
        # movie id
        movie_item['movie_id'] = response.xpath(
            '//li/span[@class="rec"]/@id').extract()
        # movie title
        movie_item['movie_title'] = response.xpath(
            '//*[@id="content"]/h1/span[1]').extract()
        # release_date
        movie_item['release_date'] = response.xpath(
            './/h1/span[@class="year"]/text()').extract()
        # director
        movie_item['directedBy'] = response.xpath(
            './/a[@rel="v:directedBy"]/text()').extract()
        # leading actors
        movie_item['starring'] = response.xpath(
            './/a[@rel="v:starring"]/text()').extract()
        # genres
        movie_item['genre'] = response.xpath(
            './/span[@property="v:genre"]/text()').extract()
        # runtime
        movie_item['runtime'] = response.xpath(
            './/span[@property="v:runtime"]/text()').extract()
        # # country and language of the movie
        # temp = response.xpath('.//div[@id="info"]/text()').extract()
        # movie_item['country'] = [p for p in temp if (p.strip() != '') & (p.strip() != '/')][0].strip()
        # movie_item['language'] = [p for p in temp if (p.strip() != '') & (p.strip() != '/')][1].strip()
        # rating
        movie_item['rating_num'] = response.xpath(
            './/strong[@class="ll rating_num"]/text()').extract()
        # number of votes
        movie_item['vote_num'] = response.xpath(
            './/span[@property="v:votes"]/text()').extract()
        # percentage of 1-5 star ratings
        # movie_item['rating_per_stars5'] = response.xpath('.//span[@class="rating_per"]/text()').extract()[0].strip()
        # movie_item['rating_per_stars4'] = response.xpath('.//span[@class="rating_per"]/text()').extract()[1].strip()
        # movie_item['rating_per_stars3'] = response.xpath('.//span[@class="rating_per"]/text()').extract()[2].strip()
        # movie_item['rating_per_stars2'] = response.xpath('.//span[@class="rating_per"]/text()').extract()[3].strip()
        # movie_item['rating_per_stars1'] = response.xpath('.//span[@class="rating_per"]/text()').extract()[4].strip()
        # plot summary
        intro = response.xpath('.//span[@class="all hidden"]/text()').extract()
        if len(intro):
            movie_item['intro'] = intro
        else:
            movie_item['intro'] = response.xpath(
                './/span[@property="v:summary"]/text()').extract()
        # number of short comments
        # movie_item['comment_num'] = response.xpath('.//div[@class="mod-hd"]/h2/span/a/text()').extract()[0].strip()
        # # number of questions about the movie
        # movie_item['question_num'] = response.xpath('.//div[@class="mod-hd"]/h2/span/a/text()').extract()[1].strip()

        # finally, emit the item
        yield movie_item
Example #10
    def parse(self, response):
        movie_dict = json.loads(response.text)

        for one_movie in movie_dict["subjects"]:
            # create a fresh item per movie instead of mutating a single shared instance
            item = DoubanMovieItem()
            item["title"] = one_movie["title"]
            item["rate"] = one_movie["rate"]
            yield item
Example #11
 def parse_item(self, response):
     for item in response.xpath(
             '//div[@class="body-bg"]/div[@class="w1000"]/div/div/ul/li'):
         l = DoubanMovieItemLoader(DoubanMovieItem(), item)
         l.add_xpath('ebtang_id', './a/@href')
         l.add_xpath('title', './a/text()')
         l.add_xpath('date', './span/text()')
         l.add_value('crawl_time', datetime.now())
         yield l.load_item()
Example #12
 def parse(self, response):
     json_string = response.body.decode('utf-8')
     content = json.loads(json_string)
     for movie in content['subjects']:
         item = DoubanMovieItem()
         item['movie_info'] = movie['url']
         item['movie_pic'] = movie['cover']
         item['movie_title'] = movie['title']
         item['movie_score'] = movie['rate']
         yield item
Example #13
 def parse_movie_item(self, response):
     item = DoubanMovieItem()
     item['url'] = response.url
     item['name'] = response.xpath(
         '//span[@property="v:itemreviewed"]/text()').extract_first()
     item['summary'] = response.xpath(
         '//span[@property="v:summary"]/text()').extract_first()
     item['score'] = response.xpath(
         '//strong[@property="v:average"]/text()').extract_first()
     return item
Example #14
 def parse(self, response):
     movies = Selector(response=response).xpath('//div[@class="hd"]')
     for movie in movies:
         item = DoubanMovieItem()
         title = movie.xpath('./a/span[1]/text()').extract()
         link = str(
             movie.xpath('./a/@href').extract()[0]) + 'comments?status=P'
         item['title'] = title
         item['link'] = link
         yield scrapy.Request(url=link,
                              meta={'item': item},
                              callback=self.parse2)
Example #15
 def parse_item(self, response):
     i = {}
     #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
     #i['name'] = response.xpath('//div[@id="name"]').extract()
     #i['description'] = response.xpath('//div[@id="description"]').extract()
     i['url'] = response.url
     i['name'] = response.xpath(
         '//span[@property="v:itemreviewed"]/text()').extract_first()
     i['summary'] = response.xpath(
         '//span[@property="v:summary"]/text()').extract_first().strip()
     i['score'] = float(response.xpath('//strong/text()').extract_first())
     return DoubanMovieItem(i)
Example #16
 def parse_item(self, response):
     for item in response.xpath('//div[@id="content"]/div/div[1]/ol/li'):
         l = DoubanMovieItemLoader(DoubanMovieItem(), item)
         l.add_xpath('rank', './div/div/em/text()')
         l.add_xpath('picture', './div/div/a/img/@src')
         l.add_xpath('title', './div/div/div/a/span/text()')
         l.add_xpath('info', './div/div/div/p/text()')
         l.add_css('star', 'div.star span.rating_num::text')
         l.add_xpath('people', './div/div/div/div/span[4]/text()')
         l.add_css('quote', 'p.quote > span.inq::text')
         l.add_value('crawl_time', datetime.now())
         yield l.load_item()
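
Examples #11 and #16 rely on a DoubanMovieItemLoader defined elsewhere in those projects. A plausible minimal version (the processor choice is an assumption, not taken from the original code) subclasses ItemLoader and collapses each extracted list to its first value; older Scrapy releases expose the processors under scrapy.loader.processors instead of itemloaders.processors:

    from scrapy.loader import ItemLoader
    from itemloaders.processors import TakeFirst

    class DoubanMovieItemLoader(ItemLoader):
        # every add_xpath/add_css/add_value call stores only the first extracted value
        default_output_processor = TakeFirst()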
Example #17
    def parse_page(self, response):

        item = DoubanMovieItem()
        try:
            soup = BeautifulSoup(response.body,
                                 'html.parser',
                                 from_encoding='utf-8')
            movie_name_tag = soup.find('div', id='content').findChild('h1')
            no = soup.find('span', 'top250-no').get_text()
            # no = response.xpath('//span[@class=top250-no]/text()').extract()
            movie_name = movie_name_tag.findChildren()[0].get_text(
            ) + movie_name_tag.findChildren()[1].get_text()
            # movie_name = response.xpath('//h1/span/text()').extract()
            # print(no,movie_name)
            director = soup.find('a', rel='v:directedBy').get_text()
            writer = soup.find('span',
                               text='编剧').next_sibling.next_sibling.text
            actor = '/'.join(star.text
                             for star in soup.findAll('a', rel='v:starring'))
            type = '/'.join(
                genre.text
                for genre in soup.findAll('span', property='v:genre'))
            region = soup.find('span', text='制片国家/地区:').next_sibling
            language = soup.find('span', text='语言:').next_sibling
            date = soup.find('span', property='v:initialReleaseDate').text
            length_tag = soup.find('span', property='v:runtime')
            if str(length_tag.next_sibling) != '<br/>':
                length = length_tag.text + str(length_tag.next_sibling)
            else:
                length = length_tag.text
            another_name = soup.find('span', text='又名:').next_sibling
            introduction = soup.find('span', property='v:summary').text
            grade = soup.find('strong', property='v:average').text
            comment_times = soup.find('span', property='v:votes').text

            item['no'] = no
            item['movie_name'] = movie_name
            item['director'] = director
            item['writer'] = writer
            item['actor'] = actor
            item['type'] = type
            item['region'] = region
            item['language'] = language
            item['date'] = date
            item['length'] = length
            item['another_name'] = another_name
            item['introduction'] = introduction
            item['grade'] = grade
            item['comment_times'] = comment_times
        except Exception as e:
            print('Parse error:', e)

        return item
Example #18
 def parse_item(self, response):
     yield DoubanMovieItem({
         'url':
         response.url,
         'name':
         response.xpath(
             '//span[@property="v:itemreviewed"]/text()').extract(),
         'summary':
         response.xpath(
             '//span[@property="v:summary"]/text()').extract_first(),
         'score':
         response.xpath(
             '//strong[@property="v:average"]/text()').extract_first()
     })
Example #19
 def parse(self, response):
     datas = json.loads(response.text).get('data')
     logger.info("request successful")
     for data in datas:
         item = DoubanMovieItem()
         for field in item.fields:
             if field in data:
                 item[field] = data[field]
         id = item['movie_id'] = data['id']
         if redis_db.add_movie_id(id):
             # logger.info('add %s in redis' %data['title'])
             yield item
         else:
             self.repeat_count += 1
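
Example #19 deduplicates on movie_id through a redis_db helper that is not shown. A minimal sketch of such a helper, assuming the redis-py client and an arbitrary set key (both are assumptions, not part of the original project):

    import redis

    class RedisDB:
        def __init__(self, host='localhost', port=6379, key='douban:movie_ids'):
            self._client = redis.Redis(host=host, port=port)
            self._key = key

        def add_movie_id(self, movie_id):
            # SADD returns 1 if the id was newly added, 0 if it was already present
            return self._client.sadd(self._key, movie_id) == 1

    redis_db = RedisDB()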
Example #20
 def parse(self, response):
     divlist = response.xpath(r"//div[@class='pic']")
     for div in divlist:
         # build a fresh item per movie instead of reusing one shared instance
         item = DoubanMovieItem()
         item['Url'] = div.xpath("a/@href").extract_first()
         item['Img_url'] = div.xpath('a/img/@src').extract_first()
         yield item
         yield scrapy.Request(url=item['Url'],
                              meta={'item': item},
                              callback=self.parse_movie)
     next_href = response.xpath('//link[@rel="next"]/@href').extract()
     if next_href:
         nexturl = 'http://movie.douban.com/top250' + next_href[0]
         yield scrapy.Request(url=nexturl, headers=self.headers)
Example #21
    def parse(self, response):
        sel = Selector(response)

        movie_name = sel.xpath("//div[@class='pl2']/a/text()").extract()
        movie_url = sel.xpath("//div[@class='pl2']/a/@href").extract()
        movie_star = sel.xpath(
            "//div[@class='pl2']/div/span[@class='rating_nums']/text()"
        ).extract()

        item = DoubanMovieItem()
        item['movie_name'] = movie_name
        item['movie_url'] = movie_url
        item['movie_star'] = movie_star

        yield item
Example #22
    def parse(self, response):
        datas = json.loads(response.body)
        if datas:
            for data in datas:
                item = DoubanMovieItem()
                item['ranking'] = data['rank']
                item['movie_name'] = data['title']
                item['score_num'] = data['vote_count']
                yield item

            # this page returned data, so schedule the next page
            page_num = re.search(r'start=(\d+)', response.url).group(1)
            page_num = 'start=' + str(int(page_num) + 20)
            next_url = re.sub(r'start=\d+', page_num, response.url)
            yield Request(next_url, headers=self.headers)
Example #23
    def parse(self, response):
        movies = response.xpath('//ol[@class="grid_view"]/li')

        for movie in movies:
            # a fresh item per movie avoids yielding the same mutated object repeatedly
            item = DoubanMovieItem()
            item['rank'] = movie.xpath('.//div[@class="pic"]/em/text()').extract()[0]
            item['score'] = movie.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').extract()[0]
            item['name'] = movie.xpath('.//div[@class="hd"]/a/span[1]/text()').extract()[0]

            yield item
        next_url = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_url:
            next_url = 'https://movie.douban.com/top250' + next_url[0]
            yield Request(next_url, headers=self.headers)
Example #24
 def parse_item(self, response):
     # hxs=HtmlXPathSelector(response)
     # sel=Selector(response)
     item = DoubanMovieItem()
     # fields defined in items.py
     # url=Field()
     # ID=Field()
     # name=Field()
     # director=Field()
     # writer=Field()
     # role=Field()
     # types=Field()
     # summary=Field()
     item['url'] = re.match(
         string=''.join(response.url),
         pattern='(https://movie.douban.com/subject/\d+)/.*').group(1)
     item['movieid'] = item['url'].split('/')[-1]
     item['ID'] = '/'.join(
         response.xpath('//*/a[contains(@href,"subject")]/@href').re(
             'movie.douban.com/subject/(\d+)/(?:\?from|$)'))
     # item['ID']=''.join(response.xpath('//*[@id="content"]/div/div[1]/div[1]/div[3]/ul/li[5]/span/@id').extract())
     item['name'] = ''.join(
         response.xpath('//*[@id="content"]/h1/span[1]/text()').extract())
     item['director'] = '/'.join(
         response.xpath(
             '//*[@id="info"]/span[1]/span[2]/a/text()').extract())
     item['writer'] = '/'.join(
         response.xpath(
             '//*[@id="info"]/span[2]/span[2]/a/text()').extract())
     item['role'] = '/'.join(
         response.xpath(
             '//*[@id="info"]/span[3]/span[2]/a/text()').extract())
     item['types'] = '/'.join(
         response.xpath('//span[@property="v:genre"]/text()').extract())
     item['summary'] = ''.join(
         response.xpath('//span[@property="v:summary"]/text()').extract())
     item['summary'] = item['summary'].strip().\
         replace('<br />', '').replace('\t', ' ').\
         replace('\n', ' ').replace('&amp', '').replace('&quot;','').replace(u'\u3000', '')
     item['summary'] = re.sub(r' {1,}', ' ', item['summary'])
     if self.count == self.MAX_MOVIE:
         while True:
             print('You have got {0} movies, please quit!'.format(
                 self.MAX_MOVIE))
             time.sleep(2)
     self.count += 1
     yield item
Example #25
    def parse(self, response):
        print(response.text)
        item = DoubanMovieItem()
        selector = response.xpath('//div[@class="list-wp"]/a')

        item['name'] = selector.xpath('p/span[@class="title"]/text()').getall()
        item['score'] = selector.xpath('p/span[@class="rate"]/text()').getall()
        item['pic'] = selector.xpath('div/span/img/@src').getall()
        item['link'] = selector.xpath('@href').getall()

        yield item

        print(type(item['link']), len(item['link']))
        for link in item['link']:
            yield response.follow(link,
                                  callback=self.next_parse,
                                  meta={'depth': 1})
Example #26
    def parse(self, response):

        movie_list = response.xpath('//ol[@class="grid_view"]//li')
        for movie in movie_list:
            item = DoubanMovieItem()
            item['movie_pic'] = movie.xpath(
                './/div[@class="item"]//div[@class="pic"]//img/@src'
            ).extract_first()
            item['movie_title'] = movie.xpath(
                './/div[@class="item"]//div[@class="hd"]//a//span[@class="title"]/text()'
            ).extract_first()

            span2 = movie.xpath(
                './/div[@class="item"]//div[@class="hd"]//a//span[2]/text()'
            ).extract_first()
            span3 = movie.xpath(
                './/div[@class="item"]//div[@class="hd"]//a//span[3]/text()'
            ).extract_first()
            item['movie_other'] = ((span2 if span2 else '') +
                                   (span3 if span3 else '')).replace(
                                       u'\xa0', u' ')

            item['movie_introduce'] = movie.xpath(
                'normalize-space(.//div[@class="item"]//div[@class="bd"]//p[1]/text())'
            ).extract_first().replace(u'\xa0', u' ')
            item['movie_star'] = movie.xpath(
                './/div[@class="bd"]//div[@class="star"]/span[2]/text()'
            ).extract_first()
            item['evaluate_num'] = movie.xpath(
                './/div[@class="bd"]//div[@class="star"]/span[4]/text()'
            ).extract_first()
            item['movie_description'] = movie.xpath(
                './/div[@class="bd"]//p[@class="quote"]/span/text()'
            ).extract_first()

            yield item

        next_url = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if next_url:
            print(self.start_urls[0] + next_url)
            yield scrapy.Request(self.start_urls[0] + next_url,
                                 callback=self.parse)
Example #27
 def parse(self, response):
     film_list = json.loads(response.body.decode())
     # stop when the API returns no more results or the offset limit is reached
     if not film_list.get('data') or self.offset > 1000:
         return
     for film in film_list['data']:
         item = DoubanMovieItem()
         item['film_name'] = film['title']
         item['film_directors'] = film['directors']
         item['film_rate'] = film['rate']
         item['film_actors'] = film['casts']
         item['film_image_url'] = film['cover']
         urllib.request.urlretrieve(
             item['film_image_url'],
             self.file_path + "/" + item['film_name'] + "." +
             item['film_image_url'].split(".")[-1])
         yield item
     self.offset = self.offset + 20
     new_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=1&start=' + str(
         self.offset)
     yield scrapy.Request(url=new_url, callback=self.parse)
Example #28
 def parse(self, response):
     movie_blocks = response.xpath('//ol[@class="grid_view"]/li')
     for block in movie_blocks:
         name = block.css('span.title::text').extract_first()
         # name = block.xpath()
         star = block.xpath(
             ".//span[@class='rating_num']/text()").extract_first()
         e = block.xpath(
             ".//div[@class='star']/span[4]/text()").extract_first()
         evaluation = self.eval_re.search(e).group()
         # group() returns the whole match as a string; groups() returns a tuple of captured groups
         introduction = block.css('span.inq::text').extract_first()
         item = DoubanMovieItem()
         item['name'] = name
         item['star'] = star
         item['evaluation'] = evaluation
         item['introduction'] = introduction
         yield item  # the next loop iteration resumes right after this yield
     next_url = response.css('span.next > a::attr(href)').extract_first()
     # note: unlike the text extractions above, this reads the href attribute inside the tag via ::attr(href)
     # <a href="?start=25&amp;filter=" style="background: rgb(204, 136, 136); border: 2px solid red;">后页&gt;</a>
     if next_url:
         next_url = response.urljoin(next_url)  # urljoin builds the absolute url
         yield scrapy.Request(next_url, callback=self.parse)
Example #29
    def parse_item(self, response):
        global failed_count
        global real_parse_count
        item = DoubanMovieItem()
        try:
            real_parse_count += 1
            print("real parse count = %d" % (real_parse_count))
            # get movie id
            url = response.url
            id = url.split('/')[-2].strip()
            item["movie_id"] = id

            # get movie name
            name = response.xpath(
                '//div[@id="content"]/h1/span[1]/text()').extract_first()
            item["movie_name"] = name.strip() if name else ""

            #get movie year
            year = response.xpath(
                '//div[@id="content"]/h1/span[2]/text()').extract_first()
            item["movie_year"] = year.strip("()() ") if year else ""

            # get movie rate
            rate = response.xpath(
                "//div[@class='rating_self clearfix']/strong/text()"
            ).extract_first()
            item["movie_rate"] = float(rate.strip() if rate else "-1")

            # get movie rate people
            rate_num = response.xpath(
                "//span[@property='v:votes']/text()").extract_first()
            item["movie_rate_people"] = int(
                rate_num.strip() if rate_num else "-1")

            # get hot short comments
            comments = response.xpath(
                "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']/p/text()"
            ).extract()
            votes = response.xpath(
                "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']//span[@class='votes pr5']/text()"
            ).extract()
            rates = response.xpath(
                "//div[@id='hot-comments']//div[@class='comment-item']//span[@class='comment-info']/span[1]/@title"
            ).extract()
            if len(comments) == len(votes) and len(votes) == len(rates):
                commentsarray = []
                for i in range(len(votes)):
                    short_comments = {}
                    short_comments['comment'] = comments[i]
                    short_comments['votes'] = int(votes[i])
                    short_comments['rates'] = rates[i]
                    commentsarray.append(short_comments)
                item["movie_hot_short_comments"] = commentsarray

            seenwish = response.xpath(
                "//div[@class='subject-others-interests-ft']//a//text()"
            ).extract()
            if seenwish and len(seenwish) == 2:
                item['movie_seen'] = int(seenwish[0][:-3])
                item['movie_wishes'] = int(seenwish[1][:-3])

            # get movie info
            info = response.xpath("//div[@id='info']")
            infoarray = info.extract()
            infostr = ''.join(infoarray).strip()

            director = info.xpath("span[1]/span[2]/a/text()").extract()
            self.add_array("movie_director", director, item)

            writor = info.xpath("span[2]/span[2]/a/text()").extract()
            self.add_array("movie_writor", writor, item)

            actors = info.xpath("span[3]/span[2]/a/text()").extract()
            self.add_array("movie_actors", actors, item)

            time = info.xpath(
                "span[@property='v:runtime']/@content").extract_first()
            item["movie_time"] = float(time.strip() if time else "-1")

            types = info.xpath("span[@property='v:genre']/text()").extract()
            self.add_array("movie_type", types, item)

            try:
                lang = re.search(language_pattern, infostr)
                if lang:
                    language = lang.group(1).strip()
                    item["movie_language"] = language.strip()
            except:
                pass

            try:
                regionmatch = re.search(region_pattern, infostr)
                if regionmatch:
                    region = regionmatch.group(1).strip()
                    item["movie_region"] = region.strip()
            except:
                pass

            try:
                dialectmatch = re.search(dialect_pattern, infostr)
                if dialectmatch:
                    dialect = dialectmatch.group(1).strip()
                    item["movie_dialect"] = dialect.strip()
            except:
                pass

            desc = response.xpath("//span[@property='v:summary']/node()"
                                  ).extract_first().strip()
            item["movie_desc"] = desc.strip() if desc else ""

            tags = response.xpath(
                "//div[@class='tags-body']/a/text()").extract()
            self.add_array("movie_tags", tags, item)

            pic = response.xpath(
                "//div[@id='mainpic']/a/img/@src").extract_first()
            item["movie_pic_url"] = pic

            yield item

        except Exception as e:
            # do nothing
            logging.info("Parse error:%s" % (str(e)))
            print("failed_count = %d" % (failed_count + 1))
            failed_count += 1
            pass
Example #30
    def parse_content(self, response):
        movieid = self.movie[0]
        tag = self.movie[1]
        title = self.movie[2]
        director = self.movie[3]
        actor = self.movie[4]
        rate = self.movie[5]
        star = self.movie[6]
        cover = self.movie[7]
        html = BeautifulSoup(response.body, 'lxml')
        info = html.select('#info')
        if len(info) == 0:
            print(response.text)
            return [-2]
        info = html.select('#info')[0].get_text().split('\n')
        print(info)
        # print(len(info))
        category = ''
        district = ''
        showtime = ''
        length = ''
        for item in info:
            item = item.split(':')
            if item[0] == '类型':
                category = item[-1].strip()
            elif item[0] == '制片国家/地区':
                district = item[-1].strip()
            elif item[0] == '上映日期':
                showtime = item[-1].strip().split('-')[0]
            elif item[0] == '片长':
                length = item[-1].strip()
                length = re.findall('\d+', length)[0]

        category = category.replace(r'/', ',')
        if len(district) > 0:
            district = district[:50]

        if len(category) > 0:
            category = category[:30]
        rate_count = html.select(
            '#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > div > div.rating_sum > a > span'
        )[0].get_text()

        # interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-child(1) > span.rating_per
        rate5 = html.select(
            '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(1) > span.rating_per'
        )[0].get_text().split('%')[0]
        rate4 = html.select(
            '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(2) > span.rating_per'
        )[0].get_text().split('%')[0]
        rate3 = html.select(
            '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(3) > span.rating_per'
        )[0].get_text().split('%')[0]
        rate2 = html.select(
            '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(4) > span.rating_per'
        )[0].get_text().split('%')[0]
        rate1 = html.select(
            '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(5) > span.rating_per'
        )[0].get_text().split('%')[0]

        item = DoubanMovieItem()
        item['movieid'] = movieid
        item['title'] = title
        item['tag'] = tag
        item['directors'] = director
        item['actors'] = actor
        item['showtime'] = showtime
        item['length'] = length
        item['district'] = district
        item['category'] = category
        item['star'] = star
        item['rate'] = rate
        item['rate_count'] = rate_count
        item['rate5'] = rate5
        item['rate4'] = rate4
        item['rate3'] = rate3
        item['rate2'] = rate2
        item['rate1'] = rate1
        item['cover'] = cover
        print('###### ')
        print(item)
        print('######')