Code example #1
0
    def get_viewed_films(self,user_url):
        """Crawl the paginated "viewed" film list of a user and save any
        film ids not yet in the database.

        :param user_url: base URL of the user's Douban profile page.
        """
        r=proxy.gethtml(url=user_url+'/collect?start=0',headers=self.headers,params={})
        if r is None:
            return
        soup=BeautifulSoup(r.content.decode(),'html.parser')
        if soup is None:
            return

        # The total viewed count appears in parentheses inside the <h1> of
        # the info <div>.
        movien=soup.find('div',{'class':'info'})
        tot=0
        if movien:
            sf=movien.find('h1')
            if sf:
                a=sf.get_text()
                ar=re.search(r'\((.+)\)',a)
                # BUGFIX: previously ar.group(1) was read without checking
                # for a failed match, crashing on headers with no "(count)".
                if ar:
                    self.viewed_count=ar.group(1)
                    tot=int(self.viewed_count)

        # Cap how many films we crawl per user.
        if tot>util.USER_FILM_MAX:
            tot=util.USER_FILM_MAX

        # The list is paginated 15 items per page.
        for i in range(0,int(tot/15)+1):
            # NOTE(review): this uses util.headers while the first request
            # above uses self.headers — confirm which one is intended.
            r=proxy.gethtml(url=user_url+'/collect?start='+str(i*15),headers=util.headers,params={})
            if r is None:
                continue
            soup=BeautifulSoup(r.content.decode(),'html.parser')
            if soup is None:
                continue
            items=soup.find_all('div',{'class':'item'})
            if not items:
                continue
            for item in items:
                a=item.find('a')
                if a is None:
                    continue
                # The film id is the trailing number of the item link href.
                film_id_ref=a.get('href')
                refr=re.search(r'(\d+)\/?$',film_id_ref)
                if refr:
                    film_id=refr.group(1)
                    try:
                        # Only save films we have not seen before.
                        flms=sql.get_film_byid(film_id)
                        if len(flms)==0:
                            f=film.film()
                            f.film_id=film_id
                            f.save()
                    except Exception as e:
                        log.logger.info(str(e))
Code example #2
0
File: film.py — Project: sspkumdp/doubanfilmspider
    def get_actors(self,film_id):
        """Crawl a film's celebrities page; persist each unseen actor and the
        actor-film relation.

        :param film_id: Douban numeric film id.
        """
        r=proxy.gethtml(url='https://movie.douban.com/subject/'+str(film_id)+'/celebrities',headers=util.headers,params={})
        if r is None:
            return
        soup=BeautifulSoup(r.content.decode(),'html.parser')
        if soup is None:
            return

        lis=soup.find_all('li',{'class':'celebrity'})
        if not lis:  # find_all returns a (possibly empty) list, never None
            return
        for li in lis:
            act=actor.actor()
            # Name and numeric id come from <span class="name"><a href=".../<id>/">.
            sf=li.find('span',{'class':'name'})
            if sf:
                sfa=sf.find('a')
                if sfa:
                    act.actor_name=sfa.get_text()
                    ar=re.search(r'(\d+)\/?$',sfa.get('href'))
                    if ar:
                        act.actor_id=ar.group(1)
            # Skip entries whose id could not be extracted.
            if act.actor_id=="":
                continue

            sf=li.find('span',{'class':'role'})
            if sf:
                act.actor_role=sf.get_text()

            sf=li.find('span',{'class':'works'})
            if sf:
                sfa=sf.find_all('a')
                if sfa:
                    act.main_works=util.listtostr(sfa,'/')

            # BUGFIX: get_actor_byid was previously called OUTSIDE the try,
            # so a DB error there aborted the whole loop instead of being
            # logged like every other DB failure in this class.
            try:
                dbacts=sql.get_actor_byid(act.actor_id)
                if len(dbacts)==0:
                    self.get_actor_info(act)
                    sql.save_actor(act)
                dbactfs=sql.get_actor_film_byid(act.actor_id,film_id)
                if len(dbactfs)==0:
                    sql.save_actor_film(act.actor_id,film_id,act.actor_role)
            except Exception as e:
                log.logger.info(str(e))
            print(act)
Code example #3
0
 def get_user_info(self,user_url):
     """Fetch a user's profile page and fill in visibility, area,
     registration date, intro text and the wish/viewed counters on self."""
     resp=proxy.gethtml(url=user_url,headers=self.headers,params={})
     if resp is None:
         self.visible='0'
         return
     page=BeautifulSoup(resp.content.decode(),'html.parser')
     if page is None:
         self.visible='0'
         return
     info_div=page.find('div',{'class':'user-info'})
     if info_div:
         self.visible='1'
         area_link=info_div.find('a')
         if area_link:
             self.user_area=area_link.get_text()
         pl_div=info_div.find('div',{'class':'pl'})
         if pl_div:
             # Registration date is a YYYY-MM-DD string inside this div.
             joined=re.search(r'(\d\d\d\d-\d\d-\d\d)',util.content_to_str(pl_div,' '))
             if joined:
                 self.user_time=joined.group(1)
         intro=page.find('span',id='intro_display')
         if intro:
             self.user_info=util.content_to_str(intro,' ')
     else:
         self.visible='0'

     movie_div=page.find('div',id='movie')
     if movie_div:
         pl_span=movie_div.find('span',{'class':'pl'})
         if pl_span:
             for link in pl_span.find_all('a'):
                 txt=link.get_text()
                 if "部想看" in txt:
                     self.wish_count=txt.replace('部想看','')
                 elif "部看过" in txt:
                     self.viewed_count=txt.replace('部看过','')
Code example #4
0
    def get_comments_by_film(self, film_id):
        """Crawl the short-comment pages of a film, saving each comment and
        (when unseen) its author to the database.

        :param film_id: Douban numeric film id.
        """
        # First request exists only to discover the total comment count.
        params = {
            'start': '0',
            'limit': '20',
            'status': 'P',
            'sort': 'new_score'
        }
        r = proxy.gethtml(
            'https://movie.douban.com/subject/' + str(film_id) + '/comments',
            self.headers, params)
        if r is None:
            return
        soup = BeautifulSoup(r.content.decode(), 'html.parser')
        if soup is None:
            return

        # The total count is the first number inside the comment-tabs header.
        tot = 0
        cmt_tab = soup.find('ul', {'class': 'fleft CommentTabs'})
        if cmt_tab:
            cmt_tab_span = cmt_tab.find('span')
            if cmt_tab_span:
                txtr = re.search(r'(\d+)', cmt_tab_span.get_text())
                if txtr:
                    tot = int(txtr.group(1))
        # Cap the count so we never crawl more than we can finish.
        if tot > util.COMMENT_MAX:
            tot = util.COMMENT_MAX

        # Comments are paginated 20 per page.
        for i in range(0, int(tot / 20) + 1):
            params = {
                'start': str(i * 20),
                'limit': '20',
                'status': 'P',
                'sort': 'new_score'
            }
            r = proxy.gethtml(url='https://movie.douban.com/subject/' +
                              str(film_id) + '/comments',
                              params=params,
                              headers=self.headers)
            if r is None:
                continue
            soup = BeautifulSoup(r.content.decode(), 'html.parser')
            if soup is None:
                continue

            cmts = soup.find_all('div', attrs={'class': 'comment-item'})
            if cmts is None:
                continue
            for cmt in cmts:
                c = comment.comment()
                c.comment_id = cmt.get('data-cid')
                #<span class="votes vote-count">1042</span>

                sf = cmt.find('span', attrs={'class': 'votes vote-count'})
                if sf:
                    c.comment_useful = sf.get_text()
                ci = cmt.find('span', {'class': 'comment-info'})
                if ci:
                    un = ci.find('a')
                    if un:
                        c.user_name = un.get_text()
                        c.user_url = un.get('href')

                # The user id is the last path segment of the profile URL.
                # NOTE(review): if no <a> was found above, c.user_url keeps
                # comment.comment()'s default — confirm that default is a str.
                urla = re.sub(r'\/$', '', c.user_url).split('/')
                if len(urla) > 1:
                    c.user_id = urla[-1]

                # First sight of this user: crawl their profile and save it.
                dbusers = sql.get_user_byid(c.user_id)
                if len(dbusers) == 0:
                    new_user = user.user()
                    new_user.user_id = c.user_id
                    new_user.user_name = c.user_name
                    new_user.user_url = c.user_url

                    new_user.get_user_info(new_user.user_url)
                    try:
                        # Re-check right before saving to avoid duplicates.
                        dbusers = sql.get_user_byid(new_user.user_id)
                        if len(dbusers) == 0:
                            sql.save_user(new_user)
                        if new_user.visible == '0':
                            sql.update_user_spider(new_user.user_id)
                    except Exception as e:
                        log.logger.info(str(e))

                #<span title="力荐" class="allstar50 rating"></span>
                if ci:
                    # "allstar50" -> 50 -> "5.0" stars.
                    star = ci.find('span', {'class': re.compile('allstar')})
                    if star:
                        c.star = str(
                            int(star.get('class')[0].replace('allstar', '')) /
                            10)

                    sf = ci.find('span', {'class': 'comment-time'})
                    if sf:
                        c.comment_time = sf.get_text().strip()

                sf = cmt.find('p', {'class': 'comment-content'})
                if sf:
                    sfs = sf.find('span')
                    if sfs:
                        c.comment_content = sfs.get_text()

                c.film_id = film_id
                try:
                    # Save the comment only if this comment id is unseen.
                    dbcmts = sql.get_comment_byid(c.comment_id)
                    if len(dbcmts) == 0:
                        sql.save_comment(c)
                except Exception as e:
                    log.logger.info("cid:" + str(c.comment_id))
                    log.logger.info(str(e))
Code example #5
0
File: film.py — Project: sspkumdp/doubanfilmspider
    def get_film(self,film_id):
        """Crawl a film's subject page and populate this film object's fields.

        :param film_id: Douban numeric film id.
        """
        r=proxy.gethtml(url='https://movie.douban.com/subject/'+str(film_id),headers=util.headers,params={})
        if r is None:
            return
        soup=BeautifulSoup(r.content.decode(),'html.parser')
        # BUGFIX: this guard previously re-checked `r` instead of `soup`,
        # so a failed parse was never caught.
        if soup is None:
            return

        self.film_id=str(film_id)
        lst=soup.find_all('span',attrs={'property':'v:itemreviewed'})
        if lst:
            self.film_name=util.listtostr(lst,'')

        lst=soup.find_all('a',attrs={'rel':'v:directedBy'})
        if lst:
            self.director=util.listtostr(lst,"/")

        # Screenwriters are links in the sibling <span> after the "编剧" label.
        sw=soup.find('span',attrs={'class':'pl'},text='编剧')
        if sw:
            swn=sw.find_next_sibling('span')
            if swn:
                swna=swn.find_all('a')
                if swna:
                    self.screenwriter=util.listtostr(swna,'/')

        lst=soup.find_all('a',attrs={'rel':'v:starring'})
        if lst:
            self.mainactors=util.listtostr(lst,"/")

        lst=soup.find_all('span',attrs={'property':'v:genre'})
        if lst:
            self.film_type=util.listtostr(lst,'/')

        # Plain-text fields follow their <span class="pl">label</span> node
        # as the next text sibling.
        sw=soup.find('span',attrs={'class':'pl'},text='制片国家/地区:')
        if sw:
            swn=sw.next_sibling
            if swn:
                self.area=swn.strip()

        sw=soup.find('span',attrs={'class':'pl'},text='语言:')
        if sw:
            swn=sw.next_sibling
            if swn:
                self.lang=swn.strip()

        lst=soup.find_all('span',attrs={'property':'v:initialReleaseDate'})
        if lst:
            self.film_date=util.listtostr(lst,'/')

        lst=soup.find_all('span',attrs={'property':'v:runtime'})
        if lst:
            self.film_time=util.listtostr(lst,'/')

        sw=soup.find('span',attrs={'class':'pl'},text='又名:')
        if sw:
            swn=sw.next_sibling
            if swn:
                self.film_alias=swn.strip()

        sw=soup.find('span',attrs={'class':'pl'},text='IMDb链接:')
        if sw:
            swn=sw.next_sibling
            if swn:
                self.imdb=swn.strip()

        sf=soup.find('strong',attrs={'property':'v:average'})
        if sf:
            self.score=sf.get_text()

        sf=soup.find('span',attrs={'property':'v:summary'})
        if sf:
            self.film_summary=sf.get_text().strip()
Code example #6
0
File: film.py — Project: sspkumdp/doubanfilmspider
    def get_actor_info(self,act):
        """Crawl an actor's celebrity page and fill biographical fields on *act*.

        Fields are only overwritten when the page actually provides them.
        Replaces seven copy-pasted stanzas with table-driven loops — same
        lookups, same assignments, identical behavior.

        :param act: actor.actor instance whose actor_id is already set.
        """
        r=proxy.gethtml(url='https://movie.douban.com/celebrity/'+str(act.actor_id),headers=util.headers,params={})
        if r is None:
            return
        soup=BeautifulSoup(r.content.decode(),'html.parser')
        if soup is None:
            return

        # Plain-text fields: the value is the text node right after the
        # <span>label</span>, formatted like ": value".
        text_fields=(
            ('性别','gender'),
            ('星座','xingzuo'),
            ('出生日期','birthday'),
            ('出生地','birtharea'),
            ('职业','occupation'),
            ('更多中文名','more_name'),
            ('更多外文名','more_foreign_name'),
        )
        for label,attr in text_fields:
            gt=soup.find('span',text=label)
            if gt:
                sf=gt.next_sibling
                if sf:
                    setattr(act,attr,sf.replace(':','').strip())

        # Link fields: the value is the text of the first <a> sibling.
        link_fields=(
            ('imdb编号','imdb_id'),
            ('官方网站','web_url'),
        )
        for label,attr in link_fields:
            gt=soup.find('span',text=label)
            if gt:
                gta=gt.find_next_sibling('a')
                if gta:
                    setattr(act,attr,gta.get_text())