def get_viewed_films(self, user_url):
    """Crawl the user's "viewed" film list and save any unknown film ids.

    Fetches the first page of ``<user_url>/collect`` to read the total
    viewed count, then walks the paginated list (15 items per page),
    extracting each film id and inserting a placeholder film row when the
    id is not yet in the database.
    """
    r = proxy.gethtml(url=user_url + '/collect?start=0',
                      headers=self.headers, params={})
    if r is None:
        return
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    if soup is None:
        return
    tot = 0
    movien = soup.find('div', {'class': 'info'})
    if movien:
        sf = movien.find('h1')
        if sf:
            # Heading looks like "xxx看过的电影(123)"; the number in
            # parentheses is the total viewed count.
            ar = re.search(r'\((.+)\)', sf.get_text())
            # BUGFIX: guard against a non-matching heading instead of
            # crashing on ar.group(1) when ar is None.
            if ar:
                self.viewed_count = ar.group(1)
                tot = int(self.viewed_count)
    # Cap the count so one prolific user cannot stall the spider.
    if tot > util.USER_FILM_MAX:
        tot = util.USER_FILM_MAX
    for i in range(0, int(tot / 15) + 1):
        # CONSISTENCY: use self.headers like the first request above
        # (the original mixed self.headers and util.headers).
        r = proxy.gethtml(url=user_url + '/collect?start=' + str(i * 15),
                          headers=self.headers, params={})
        if r is None:
            continue
        soup = BeautifulSoup(r.content.decode(), 'html.parser')
        if soup is None:
            continue
        items = soup.find_all('div', {'class': 'item'})
        # find_all returns an empty list (never None) when nothing matches.
        if not items:
            continue
        for item in items:
            a = item.find('a')
            if a is None:
                continue
            film_id_ref = a.get('href')
            refr = re.search(r'(\d+)\/?$', film_id_ref)
            if refr:
                film_id = refr.group(1)
                try:
                    # Only create a placeholder row for unknown films.
                    flms = sql.get_film_byid(film_id)
                    if len(flms) == 0:
                        f = film.film()
                        f.film_id = film_id
                        f.save()
                except Exception as e:
                    log.logger.info(str(e))
def get_actors(self, film_id):
    """Crawl a film's celebrities page and persist its actors.

    For each actor entry: parse id, name, role and main works; fetch
    detailed info for actors not yet in the database; and link the
    actor to the film through the actor_film table.
    """
    r = proxy.gethtml(url='https://movie.douban.com/subject/' + str(film_id) + '/celebrities',
                      headers=util.headers, params={})
    if r is None:
        return
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    if soup is None:
        return
    lis = soup.find_all('li', {'class': 'celebrity'})
    # find_all returns an empty list (never None) when nothing matches.
    if not lis:
        return
    for li in lis:
        act = actor.actor()
        sf = li.find('span', {'class': 'name'})
        if sf:
            sfa = sf.find('a')
            if sfa:
                act.actor_name = sfa.get_text()
                # Actor id is the trailing number of the profile URL.
                ar = re.search(r'(\d+)\/?$', sfa.get('href'))
                if ar:
                    act.actor_id = ar.group(1)
        # Skip entries without a usable actor id.
        if act.actor_id == "":
            continue
        sf = li.find('span', {'class': 'role'})
        if sf:
            act.actor_role = sf.get_text()
        sf = li.find('span', {'class': 'works'})
        if sf:
            sfa = sf.find_all('a')
            if sfa:
                act.main_works = util.listtostr(sfa, '/')
        dbacts = sql.get_actor_byid(act.actor_id)
        try:
            if len(dbacts) == 0:
                self.get_actor_info(act)
                sql.save_actor(act)
            dbactfs = sql.get_actor_film_byid(act.actor_id, film_id)
            if len(dbactfs) == 0:
                sql.save_actor_film(act.actor_id, film_id, act.actor_role)
        except Exception as e:
            log.logger.info(str(e))
        # BUGFIX: replaced leftover debug print(act) with a log entry so
        # crawling does not spam stdout.
        log.logger.info(str(act))
def get_user_info(self, user_url):
    """Fill in this user's profile fields by scraping the profile page.

    Sets ``visible`` to '1' when the profile is readable and '0' when
    the page cannot be fetched/parsed or the user-info block is absent.
    Also parses area, registration date, intro text, and the
    wish/viewed film counts.
    """
    page = proxy.gethtml(url=user_url, headers=self.headers, params={})
    if page is None:
        self.visible = '0'
        return
    doc = BeautifulSoup(page.content.decode(), 'html.parser')
    if doc is None:
        self.visible = '0'
        return
    info_div = doc.find('div', {'class': 'user-info'})
    if not info_div:
        # Profile block missing: treat the user as not visible.
        self.visible = '0'
    else:
        self.visible = '1'
        anchor = info_div.find('a')
        if anchor:
            self.user_area = anchor.get_text()
        pl_div = info_div.find('div', {'class': 'pl'})
        if pl_div:
            # Registration date appears as YYYY-MM-DD inside this block.
            matched = re.search(r'(\d\d\d\d-\d\d-\d\d)',
                                util.content_to_str(pl_div, ' '))
            if matched:
                self.user_time = matched.group(1)
        intro = doc.find('span', id='intro_display')
        if intro:
            self.user_info = util.content_to_str(intro, ' ')
    movie_div = doc.find('div', id='movie')
    if movie_div:
        label = movie_div.find('span', {'class': 'pl'})
        if label:
            for link in label.find_all('a'):
                text = link.get_text()
                if "部想看" in text:
                    self.wish_count = text.replace('部想看', '')
                elif "部看过" in text:
                    self.viewed_count = text.replace('部看过', '')
def get_comments_by_film(self, film_id):
    """Crawl the short-comment pages of a film and persist new comments.

    Reads the total comment count from the first page, caps it at
    util.COMMENT_MAX, then walks the pages 20 comments at a time.
    Unknown commenting users are fetched and saved as a side effect.
    """
    params = {'start': '0', 'limit': '20', 'status': 'P', 'sort': 'new_score'}
    r = proxy.gethtml('https://movie.douban.com/subject/' + str(film_id) + '/comments',
                      self.headers, params)
    if r is None:
        return
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    if soup is None:
        return
    tot = 0
    cmt_tab = soup.find('ul', {'class': 'fleft CommentTabs'})
    if cmt_tab:
        cmt_tab_span = cmt_tab.find('span')
        if cmt_tab_span:
            txtr = re.search(r'(\d+)', cmt_tab_span.get_text())
            if txtr:
                tot = int(txtr.group(1))
    # Cap the number of comments so a hot film cannot stall the spider.
    if tot > util.COMMENT_MAX:
        tot = util.COMMENT_MAX
    for i in range(0, int(tot / 20) + 1):
        params = {'start': str(i * 20), 'limit': '20',
                  'status': 'P', 'sort': 'new_score'}
        r = proxy.gethtml(url='https://movie.douban.com/subject/' + str(film_id) + '/comments',
                          params=params, headers=self.headers)
        if r is None:
            continue
        soup = BeautifulSoup(r.content.decode(), 'html.parser')
        if soup is None:
            continue
        cmts = soup.find_all('div', attrs={'class': 'comment-item'})
        # BUGFIX: find_all returns an empty list, never None.
        if not cmts:
            continue
        for cmt in cmts:
            c = comment.comment()
            c.comment_id = cmt.get('data-cid')
            # e.g. <span class="votes vote-count">1042</span>
            sf = cmt.find('span', attrs={'class': 'votes vote-count'})
            if sf:
                c.comment_useful = sf.get_text()
            ci = cmt.find('span', {'class': 'comment-info'})
            if ci:
                un = ci.find('a')
                if un:
                    c.user_name = un.get_text()
                    c.user_url = un.get('href')
                    # User id is the last path segment of the profile URL.
                    urla = re.sub(r'\/$', '', c.user_url).split('/')
                    if len(urla) > 1:
                        c.user_id = urla[-1]
                        dbusers = sql.get_user_byid(c.user_id)
                        if len(dbusers) == 0:
                            new_user = user.user()
                            new_user.user_id = c.user_id
                            new_user.user_name = c.user_name
                            new_user.user_url = c.user_url
                            new_user.get_user_info(new_user.user_url)
                            try:
                                # Re-check before saving: another worker may
                                # have stored this user while we fetched the
                                # profile page.
                                dbusers = sql.get_user_byid(new_user.user_id)
                                if len(dbusers) == 0:
                                    sql.save_user(new_user)
                                if new_user.visible == '0':
                                    sql.update_user_spider(new_user.user_id)
                            except Exception as e:
                                log.logger.info(str(e))
                # e.g. <span title="力荐" class="allstar50 rating"></span>
                # (merged with the previous `if ci:` — the original checked
                # ci twice in a row)
                star = ci.find('span', {'class': re.compile('allstar')})
                if star:
                    # BUGFIX: the allstarNN class is not guaranteed to be
                    # first in the class list; locate it explicitly instead
                    # of blindly parsing class[0] (which could raise
                    # ValueError on 'rating').
                    for cls in star.get('class'):
                        if cls.startswith('allstar'):
                            # 'allstar50' -> '5.0' (string, as before)
                            c.star = str(int(cls.replace('allstar', '')) / 10)
                            break
                sf = ci.find('span', {'class': 'comment-time'})
                if sf:
                    c.comment_time = sf.get_text().strip()
            sf = cmt.find('p', {'class': 'comment-content'})
            if sf:
                sfs = sf.find('span')
                if sfs:
                    c.comment_content = sfs.get_text()
            c.film_id = film_id
            try:
                # Only insert comments we have not stored yet.
                dbcmts = sql.get_comment_byid(c.comment_id)
                if len(dbcmts) == 0:
                    sql.save_comment(c)
            except Exception as e:
                log.logger.info("cid:" + str(c.comment_id))
                log.logger.info(str(e))
def get_film(self, film_id):
    """Scrape a film's douban subject page and fill in this film's fields.

    Parses title, director, screenwriter, cast, genres, area, language,
    release date, runtime, aliases, IMDb id, score and summary.
    """
    r = proxy.gethtml(url='https://movie.douban.com/subject/' + str(film_id),
                      headers=util.headers, params={})
    if r is None:
        return
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    # BUGFIX: the original re-checked `r is None` here (copy-paste);
    # the parsed soup is what must be validated.
    if soup is None:
        return
    self.film_id = str(film_id)

    def joined(tag, attrs, sep):
        # Join the text of all matching tags, or None when nothing matches.
        lst = soup.find_all(tag, attrs=attrs)
        return util.listtostr(lst, sep) if lst else None

    def labeled_text(label):
        # Plain-text value following a '<span class="pl">label</span>'.
        sw = soup.find('span', attrs={'class': 'pl'}, text=label)
        if sw and sw.next_sibling:
            return sw.next_sibling.strip()
        return None

    val = joined('span', {'property': 'v:itemreviewed'}, '')
    if val is not None:
        self.film_name = val
    val = joined('a', {'rel': 'v:directedBy'}, "/")
    if val is not None:
        self.director = val
    # Screenwriters live in a sibling <span> holding the <a> links.
    sw = soup.find('span', attrs={'class': 'pl'}, text='编剧')
    if sw:
        swn = sw.find_next_sibling('span')
        if swn:
            swna = swn.find_all('a')
            if swna:
                self.screenwriter = util.listtostr(swna, '/')
    val = joined('a', {'rel': 'v:starring'}, "/")
    if val is not None:
        self.mainactors = val
    val = joined('span', {'property': 'v:genre'}, '/')
    if val is not None:
        self.film_type = val
    val = labeled_text('制片国家/地区:')
    if val is not None:
        self.area = val
    val = labeled_text('语言:')
    if val is not None:
        self.lang = val
    val = joined('span', {'property': 'v:initialReleaseDate'}, '/')
    if val is not None:
        self.film_date = val
    val = joined('span', {'property': 'v:runtime'}, '/')
    if val is not None:
        self.film_time = val
    val = labeled_text('又名:')
    if val is not None:
        self.film_alias = val
    val = labeled_text('IMDb链接:')
    if val is not None:
        self.imdb = val
    sf = soup.find('strong', attrs={'property': 'v:average'})
    if sf:
        self.score = sf.get_text()
    sf = soup.find('span', attrs={'property': 'v:summary'})
    if sf:
        self.film_summary = sf.get_text().strip()
def get_actor_info(self, act):
    """Scrape an actor's celebrity page and fill the given actor object.

    Plain-text fields come from the sibling text of a labeled <span>;
    the IMDb id and official site come from the first <a> following
    their labels. `act` is mutated in place.
    """
    r = proxy.gethtml(url='https://movie.douban.com/celebrity/' + str(act.actor_id),
                      headers=util.headers, params={})
    if r is None:
        return
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    if soup is None:
        return
    # Label -> attribute for fields stored as plain sibling text
    # (DRY: the original repeated this stanza seven times).
    text_fields = (
        ('性别', 'gender'),
        ('星座', 'xingzuo'),
        ('出生日期', 'birthday'),
        ('出生地', 'birtharea'),
        ('职业', 'occupation'),
        ('更多中文名', 'more_name'),
        ('更多外文名', 'more_foreign_name'),
    )
    for label, attr in text_fields:
        gt = soup.find('span', text=label)
        if gt:
            sf = gt.next_sibling
            if sf:
                # Drop the leading colon and surrounding whitespace.
                setattr(act, attr, sf.replace(':', '').strip())
    # Label -> attribute for fields stored as the text of a sibling <a>.
    link_fields = (('imdb编号', 'imdb_id'), ('官方网站', 'web_url'))
    for label, attr in link_fields:
        gt = soup.find('span', text=label)
        if gt:
            gta = gt.find_next_sibling('a')
            if gta:
                setattr(act, attr, gta.get_text())