class DoubanMovieParser(Parser):
    """Scrape one Douban movie subject page into a ``DoubanMovie`` record.

    ``parse`` fetches the page, copies the scraped fields onto the
    ``DoubanMovie`` document for that subject, saves it, and yields the URLs
    of the other subjects linked from the recommendations block.
    """

    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        """Set up the opener, default timeout and logger.

        ``bundle`` is accepted for interface compatibility but is not used
        by this class.
        """
        super(DoubanMovieParser, self).__init__(
            opener=opener, url=url, **kwargs)
        if self.opener is None:
            self.opener = MechanizeOpener()
        self.url = url
        self.opener.set_default_timeout(TIMEOUT)
        if getattr(self, 'logger', None) is None:
            self.logger = get_logger(name='douban_parser')

    def get_subject_id(self, url):
        """Return the numeric subject id found in ``url`` as a string.

        Returns ``None`` when ``url`` does not contain a movie subject path.
        """
        id_arr = re.findall(r'https://movie.douban.com/subject/(\d+)', url)
        if id_arr:
            return id_arr[0]
        return None

    def _check_url(self, dest_url, src_url):
        """Return True when both URLs are equal once query strings are cut."""
        return dest_url.split('?')[0] == src_url.split('?')[0]

    def check(self, url, br):
        """Verify that the browser was not redirected to the login page.

        Raises ``DoubanLoginFailure`` when the final URL differs from ``url``
        and points at the Douban login page; returns True otherwise.
        """
        dest_url = br.geturl()
        if not self._check_url(dest_url, url):
            if dest_url.startswith('http://douban.com/login.php'):
                raise DoubanLoginFailure('Douban not login or login expired')
        return True

    def get_movie_subject(self, sid):
        """Load the ``DoubanMovie`` with this ``sid``, creating it if absent."""
        try:
            movie = DoubanMovie.objects.get(sid=sid)
        except DoesNotExist:
            movie = DoubanMovie(sid=sid)
            movie.save()
        return movie

    @staticmethod
    def _capture_all(pattern, tags, group):
        """Collect ``group`` of ``pattern`` from the markup of each tag.

        Tags whose markup does not match are skipped instead of raising.
        """
        captured = []
        for tag in tags:
            found = pattern.match(str(tag))
            if found is not None:
                captured.append(found.group(group))
        return captured

    @staticmethod
    def _find_pubdate(pubdate_tags):
        """Return the raw release-date text from the first usable tag.

        For each tag, in order, the regional patterns are tried first
        (中国大陆, 中国内地, 香港) and then any run of digits/hyphens.
        Returns ``None`` when no tag matches.
        """
        regional_patterns = (
            re.compile(
                r'<[^>]+>(?P<pubdate>[^(]+)[(]中国大陆([ ]3D)*[)]<[^>]+>'),
            re.compile(
                r'<[^>]+>(?P<pubdate>[^(]+)[(]中国内地([ ]3D)*[)]<[^>]+>'),
            re.compile(
                r'<[^>]+>(?P<pubdate>[^(]+)[(]香港([ ]3D)*[)]<[^>]+>'),
        )
        any_date = re.compile(r'[0-9-]+')
        for tag in pubdate_tags:
            markup = str(tag)
            for pattern in regional_patterns:
                found = pattern.search(markup)
                if found is not None:
                    return found.group('pubdate')
            found = any_date.search(markup)
            if found is not None:
                return found.group()
        return None

    @staticmethod
    def _parse_pubdate(raw):
        """Turn a ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` string into a datetime.

        A bare year is completed with June 30 and a year-month with the
        15th. Returns ``None`` when ``raw`` is empty or cannot be parsed.
        """
        if not raw:
            return None
        text = raw.strip()
        # Pad by number of components rather than string length so that
        # non-zero-padded values such as "2010-9" are handled too.
        components = len(text.split('-'))
        if components == 1:
            text = text + "-6-30"
        elif components == 2:
            text = text + "-15"
        try:
            return datetime.strptime(text, '%Y-%m-%d')
        except ValueError:
            return None

    @staticmethod
    def _parse_number(value):
        """Return the first run of ASCII digits in ``value`` as an int.

        When there are no ASCII digits the stripped text is handed to
        ``convert`` (NOTE(review): presumably Chinese numerals, per the
        original "parse chinese" comment -- confirm against ``convert``).
        """
        digits = re.findall(r'(\d+).*', value)
        if digits:
            return int(digits[0])
        return convert(value.strip())

    def _apply_info_fields(self, movie, info_str):
        """Copy the labelled ``label: value`` lines of the info box to ``movie``."""
        info_map = {
            u'制片国家/地区': {
                'field': 'countries'
            },
            u'语言': {
                'field': 'languages'
            },
            u'集数': {
                'field': 'episodes_count',
                'func': self._parse_number
            },
            u'单集片长': {
                'field': 'duration',
                'func': self._parse_number
            },
            u'片长': {
                'field': 'duration',
                'func': self._parse_number
            },
            u'又名': {
                'field': 'aka',
                'func': lambda v: v.split('/')
            },
            u'IMDb链接': {
                'field': 'imdb_id'
            }
        }
        for label, spec in info_map.items():
            values = re.findall(label + r":(.*)", info_str, re.MULTILINE)
            if not values:
                continue
            func = spec.get('func', lambda s: s.strip())
            setattr(movie, spec['field'], func(values[0].strip()))

    def parse(self, url=None):
        """Fetch and store one movie page; yield recommended subject URLs.

        This is a generator: nothing runs until it is iterated. It ends
        without yielding when the URL has no subject id or the record was
        updated less than 24 hours ago.

        Raises:
            FetchBannedError: the page could not be opened, had no body,
                or had no title element.
            DoubanLoginFailure: the request was redirected to the login page.
        """
        url = url or self.url
        sid = self.get_subject_id(url)
        if sid is None:
            # Without an id there is no record to update.
            self.logger.warning('No subject id in url: %s' % url)
            return
        movie = self.get_movie_subject(sid)

        # If the entry was updated in the latest 24 hours, skip this url.
        # last_update is written with datetime.now() below, so the age is
        # measured against the same clock.
        if movie.last_update:
            age = datetime.now() - movie.last_update
            if age.total_seconds() < 24 * 60 * 60:
                self.logger.warning('Skip vistied url: %s' % url)
                return

        self.logger.debug('proxy:{}'.format(self.opener.proxies))
        try:
            br = self.opener.browse_open(url)
        except URLError:
            raise FetchBannedError()

        if not self.check(url, br):
            return

        html = br.response().read()
        if html is None:
            raise FetchBannedError()
        soup = beautiful_soup(html)

        # An episode-count label in the page marks a TV subject.
        if re.search('<span class="pl">集数:</span>', html):
            subtype = 't'
        else:
            subtype = 'm'

        title_tags = soup.select("span[property='v:itemreviewed']")
        if not title_tags:
            # A page without a title is treated as a blocked fetch.
            raise FetchBannedError()
        title = title_tags[0].text.strip()

        year_tags = soup.select("div#content > h1 span.year")
        # The year is rendered wrapped in one character on each side.
        year = year_tags[0].text[1:-1] if year_tags else None

        summary_tags = soup.select("span[property='v:summary']")
        summary = summary_tags[0].text.strip() if summary_tags else ''

        tags = [t.text for t in soup.select('div .tags-body a')]

        # Directors, cast and writers are all read from anchor text.
        anchor_text = re.compile(r'<[^>]+>(?P<text>[^<]+)</a>')
        directors = self._capture_all(
            anchor_text,
            soup.select('div #info > span a[rel="v:directedBy"]'), 'text')
        casts = self._capture_all(
            anchor_text,
            soup.select('div #info > span a[rel="v:starring"]'), 'text')
        # Writers are taken from the second top-level span of the info box.
        info_spans = soup.select('div #info > span')
        writers_tags = info_spans[1].select('a') if len(info_spans) > 1 else []
        writers = self._capture_all(anchor_text, writers_tags, 'text')

        genre_pattern = re.compile(
            r'<span property="v:genre">(?P<genre>[^<]+)</span>')
        genres = self._capture_all(
            genre_pattern,
            soup.select('div #info > span[property="v:genre"]'), 'genre')

        # Release date, falling back to the year from the page heading.
        raw_pubdate = self._find_pubdate(soup.select(
            'div #info > span[property="v:initialReleaseDate"]'))
        if raw_pubdate is None:
            self.logger.critical('{0} has no pubdate'.format(sid))
            raw_pubdate = year
        pubdate = self._parse_pubdate(raw_pubdate)
        if pubdate is None and raw_pubdate:
            self.logger.critical(
                '{0} has unparsable pubdate: {1}'.format(sid, raw_pubdate))
        if not year and pubdate is not None:
            year = pubdate.strftime('%Y')

        # "Want to watch" / "watched" counters.
        wishes_tags = soup.select(
            'div #subject-others-interests > .subject-others-interests-ft > a')
        if not wishes_tags:
            self.logger.critical('{0} donnot have wish count'.format(sid))
        wish_count = None
        collect_count = None
        for wish_tag in wishes_tags:
            found = re.match(u'(?P<wishes>[0-9]+)人想看', wish_tag.text)
            if found:
                wish_count = found.group('wishes')
                continue
            found = re.match(u'(?P<collections>[0-9]+)人看过', wish_tag.text)
            if found:
                collect_count = found.group('collections')

        rating_tags = soup.select(r'strong.rating_num')
        rating_num = rating_tags[0].text.strip() if rating_tags else None
        # Each percentage is rendered with a trailing character to drop.
        rating_lvls = [
            float(r.text[:-1])
            for r in soup.select(r'div.ratings-on-weight span.rating_per')
        ]

        # Season selector (only present on multi-season subjects).
        season_tags = soup.select('div #info select#season')
        if season_tags:
            movie.seasons_count = len(season_tags[0].select('option'))
            selected = season_tags[0].select('option[selected]')
            if selected:
                movie.current_season = selected[0].text

        photo_tags = soup.select('a[class="nbgnbg"] img')
        photo_url = photo_tags[0].attrs.get('src') if photo_tags else None

        # region save movie
        info_nodes = soup.select('div #info')
        if info_nodes:
            self._apply_info_fields(movie, info_nodes[0].text)

        movie.sid = sid
        movie.title = title
        if photo_url:
            movie.photo_alt = photo_url
        movie.year = year
        movie.summary = summary
        movie.tags = tags
        movie.subtype = subtype
        movie.directors = directors
        movie.casts = casts
        movie.writers = writers
        if rating_num:
            movie.rating = float(rating_num)
        # Five rating buckets are expected; the first two and last two
        # are summed into the high and low percentages.
        if len(rating_lvls) >= 5:
            movie.high_rating_pct = rating_lvls[0] + rating_lvls[1]
            movie.low_rating_pct = rating_lvls[3] + rating_lvls[4]
        if wish_count:
            movie.wish_count = wish_count
        if collect_count:
            movie.collect_count = collect_count
        if pubdate is not None:
            movie.pubdate = pubdate
        movie.genres = genres
        movie.alt = url
        movie.last_update = datetime.now()
        movie.save()

        # Follow recommendation links that point at a different subject.
        for link in soup.select("div.recommendations-bd a"):
            out_url = link.attrs.get('href')
            if not out_url:
                continue
            if out_url.rsplit('#', 1)[0] == url:
                continue
            if not out_url.startswith("https://movie.douban.com/subject"):
                continue
            if self.get_subject_id(out_url) != sid:
                yield out_url