コード例 #1
0
ファイル: parsers.py プロジェクト: Ganer/cola
 def parse(self, url=None):
     """Parse one follow/fans list page and record the listed friends.

     The page embeds its content as JSON inside ``<script>`` tags of the
     form ``STK && STK.pageletM && STK.pageletM.view({...})``.  The pagelet
     whose ``pid`` is ``pl_relation_hisFollow`` or ``pl_relation_hisFans``
     carries the HTML of the user list; the latter also marks the page as
     a fans list rather than a follow list.

     Every list item is turned into a ``Friend`` that is appended to
     ``weibo_user.follows`` or ``weibo_user.fans``; the user is saved once
     all items are processed.

     :param url: page to fetch; falls back to ``self.url`` when falsy.
     :returns: ``(urls, bundles)`` where ``urls`` holds at most one
         next-page URL and ``bundles`` one ``WeiboUserBundle`` per friend.
         Both are empty when the bundle does not exist or ``self.check``
         rejects the response.
     :raises AttributeError: when no relation pagelet was found in the
         page (``html`` is still ``None`` at the first lookup).
     """
     # Kept as an equality test on purpose: a ``None`` value for
     # ``exists`` does not compare equal to False and is still fetched.
     if self.bundle.exists == False:
         return [], []

     url = url or self.url
     br = self.opener.browse_open(url)
     soup = BeautifulSoup(br.response().read())

     if not self.check(url, br):
         return [], []

     weibo_user = self.get_weibo_user()

     html = None
     is_follow = True
     for script in soup.find_all('script'):
         text = script.text
         if 'STK' not in text:
             continue
         # Drop the JS call wrapper and its closing ")" to leave bare JSON.
         text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
         data = json.loads(text)
         pid = data['pid']
         if pid in ('pl_relation_hisFollow', 'pl_relation_hisFans'):
             html = BeautifulSoup(data['html'])
         if pid == 'pl_relation_hisFans':
             is_follow = False

     bundles = []
     ul = html.find(attrs={'class': 'cnfList', 'node-type': 'userListBox'})
     if ul is None:
         return [], bundles
     for li in ul.find_all(attrs={'class': 'S_line1', 'action-type': 'itemClick'}):
         # ``action-data`` is a query-string-like "k=v&k=v" value.  Split on
         # the first "=" only so values containing "=" stay intact, and
         # skip fragments without "=" (e.g. from a trailing "&").
         data = dict(
             pair.split('=', 1)
             for pair in li['action-data'].split('&')
             if '=' in pair
         )

         friend = Friend()
         friend.uid = data['uid']
         friend.nickname = data['fnick']
         friend.sex = data['sex'] == u'm'

         bundles.append(WeiboUserBundle(str(friend.uid)))
         if is_follow:
             weibo_user.follows.append(friend)
         else:
             weibo_user.fans.append(friend)

     weibo_user.save()

     urls = []
     pages = html.find('div', attrs={'class': 'W_pages', 'node-type': 'pageList'})
     if pages is not None:
         links = pages.find_all('a')
         # The last link is treated as a "next page" button only when its
         # class list is exactly ['W_btn_c'].
         if links and links[-1]['class'] == ['W_btn_c']:
             next_page = int(urldecode(url).get('page', 1)) + 1
             urls.append('%s?page=%s' % (url.split('?')[0], next_page))

     return urls, bundles
コード例 #2
0
ファイル: parsers.py プロジェクト: linVdcd/cola
class UserFriendParser(WeiboParser):
    """Parser for a Weibo user's follow / fans list pages.

    ``parse`` is a generator.  It yields a ``WeiboUserBundle`` for every
    friend found on the page and plain URL strings for the pages that
    should be fetched next.
    """

    # Length of the pause taken when a fetch fails or looks banned.
    _REST_SECONDS = 60 * 10

    def _rest(self):
        """Print the pause notice and sleep for ``_REST_SECONDS``."""
        print('休息10分钟!')
        time.sleep(self._REST_SECONDS)

    def _fans_list_url(self, is_new_mode):
        """Return the URL of this user's fans list for the page layout."""
        if is_new_mode:
            return 'http://weibo.com/%s/follow?relate=fans' % self.uid
        return 'http://weibo.com/%s/fans' % self.uid

    def parse(self, url=None):
        """Parse one follow/fans list page of the current user.

        Two page layouts are recognised from the embedded ``<script>``
        payloads: the "new mode" ``FM.view({...})`` form, where the URL's
        ``relate=fans`` query parameter marks a fans list, and the older
        ``STK && STK.pageletM && STK.pageletM.view({...})`` form, where
        the pagelet id ``pl_relation_hisFans`` does.

        On the first page the stored follows (or fans) list is reset
        before the friends of this page are appended; the user is saved
        after all items were processed.

        :param url: page to fetch; falls back to ``self.url`` when falsy.
        :returns: generator yielding ``WeiboUserBundle`` objects and URL
            strings (the next page of the list, or the fans list once the
            follow list is exhausted).
        :raises FetchBannedError: when the page contains none of the
            expected script payloads.
        :raises AttributeError: when no relation pagelet was found and the
            response was not redirected to ``http://e.weibo.com``.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            self._rest()
            # Without a response there is nothing to parse.  Propagate the
            # real error instead of failing below on an unbound ``br``.
            raise
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        html = None
        decodes = urldecode(url)
        is_follow = True
        is_new_mode = False
        is_banned = True
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                is_banned = False
                # Strip the JS wrapper to leave bare JSON.
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith(('Pl_Official_LeftHisRelation__',
                                     'Pl_Official_HisRelation__')):
                    html = beautiful_soup(data['html'])
                if decodes.get('relate') == 'fans':
                    is_follow = False
                is_new_mode = True
            elif 'STK' in text:
                is_banned = False
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid in ('pl_relation_hisFollow', 'pl_relation_hisFans'):
                    html = beautiful_soup(data['html'])
                if pid == 'pl_relation_hisFans':
                    is_follow = False

        if is_banned:
            self._rest()
            raise FetchBannedError('fetch banned by weibo server')

        ul = None
        try:
            ul = html.find(attrs={
                'class': 'cnfList',
                'node-type': 'userListBox'
            })
            if ul is None:
                ul = html.find(attrs={
                    'class': 'follow_list',
                    'node-type': 'userListBox'
                })
        except AttributeError:
            # ``html`` is still None when no relation pagelet was found.
            self._rest()
            if br.geturl().startswith('http://e.weibo.com'):
                return
            # Bare raise keeps the original traceback.
            raise

        if ul is None:
            # No user list on this page: move on from follows to fans.
            if is_follow:
                yield self._fans_list_url(is_new_mode)
            return

        # Normalise to int so an explicit "page=1" in the URL is also
        # recognised as the first page.
        current_page = int(decodes.get('page', 1))
        if current_page == 1:
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []
        for cls in ('S_line1', 'S_line2'):
            for li in ul.find_all(attrs={
                    'class': cls,
                    'action-type': 'itemClick'
            }):
                # "k=v&k=v" payload: split on the first "=" only and skip
                # fragments without "=" so odd values cannot break dict().
                data = dict(
                    pair.split('=', 1)
                    for pair in li['action-data'].split('&')
                    if '=' in pair)

                friend = Friend()
                friend.uid = data['uid']
                friend.nickname = data['fnick']
                friend.sex = data['sex'] == u'm'

                yield WeiboUserBundle(str(friend.uid))
                if is_follow:
                    weibo_user.follows.append(friend)
                else:
                    weibo_user.fans.append(friend)

        weibo_user.save()

        # counter add one for the friend url
        counter_type = 'follows' if is_follow else 'fans'
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

        pages = html.find('div',
                          attrs={
                              'class': 'W_pages',
                              'node-type': 'pageList'
                          })
        if pages is None:
            pages = html.find('div',
                              attrs={
                                  'class': 'WB_cardpage',
                                  'node-type': 'pageList'
                              })
        if pages is not None:
            links = pages.find_all('a')
            if links:
                next_ = links[-1]
                if next_['class'] == ['W_btn_c'] or 'next' in next_['class']:
                    decodes['page'] = current_page + 1
                    query_str = urllib.urlencode(decodes)
                    url = '%s?%s' % (url.split('?')[0], query_str)
                    yield url
                    return

        # Last page of the follow list reached: continue with the fans list.
        if is_follow:
            yield self._fans_list_url(is_new_mode)
コード例 #3
0
ファイル: parsers.py プロジェクト: renchaorevee/cola
                if is_new_mode:
                    urls.append("http://weibo.com/%s/follow?relate=fans" % self.uid)
                else:
                    urls.append("http://weibo.com/%s/fans" % self.uid)
            return urls, bundles

        current_page = decodes.get("page", 1)
        if current_page == 1:
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []
        for li in ul.find_all(attrs={"class": "S_line1", "action-type": "itemClick"}):
            data = dict([l.split("=") for l in li["action-data"].split("&")])

            friend = Friend()
            friend.uid = data["uid"]
            friend.nickname = data["fnick"]
            friend.sex = True if data["sex"] == u"m" else False

            bundles.append(WeiboUserBundle(str(friend.uid)))
            if is_follow:
                weibo_user.follows.append(friend)
            else:
                weibo_user.fans.append(friend)

        weibo_user.save()
        self.logger.debug("parse %s finish" % url)

        urls = []
        pages = html.find("div", attrs={"class": "W_pages", "node-type": "pageList"})
コード例 #4
0
ファイル: parsers.py プロジェクト: huangzhiyong/cola
                    urls.append('http://weibo.com/%s/fans' % self.uid)
            return urls, bundles

        current_page = decodes.get('page', 1)
        if current_page == 1:
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []
        for li in ul.find_all(attrs={
                'class': 'S_line1',
                'action-type': 'itemClick'
        }):
            data = dict([l.split('=') for l in li['action-data'].split('&')])

            friend = Friend()
            friend.uid = data['uid']
            friend.nickname = data['fnick']
            friend.sex = True if data['sex'] == u'm' else False

            bundles.append(WeiboUserBundle(str(friend.uid)))
            if is_follow:
                weibo_user.follows.append(friend)
            else:
                weibo_user.fans.append(friend)

        weibo_user.save()
        self.logger.debug('parse %s finish' % url)

        urls = []
        pages = html.find('div',
コード例 #5
0
ファイル: parsers.py プロジェクト: Chenxofhit/cola
         if is_new_mode:
             urls.append('http://weibo.com/%s/follow?relate=fans' % self.uid)
         else:
             urls.append('http://weibo.com/%s/fans' % self.uid)
     return urls, bundles
 
 current_page = decodes.get('page', 1)
 if current_page == 1:
     if is_follow:
         weibo_user.follows = []
     else:
         weibo_user.fans = []
 for li in ul.find_all(attrs={'class': 'S_line1', 'action-type': 'itemClick'}):
     data = dict([l.split('=') for l in li['action-data'].split('&')])
     
     friend = Friend()
     friend.uid = data['uid']
     friend.nickname = data['fnick']
     friend.sex = True if data['sex'] == u'm' else False
     
     bundles.append(WeiboUserBundle(str(friend.uid)))
     if is_follow:
         weibo_user.follows.append(friend)
     else:
         weibo_user.fans.append(friend)
         
 weibo_user.save()
 self.logger.debug('parse %s finish' % url)
 
 urls = []
 pages = html.find('div', attrs={'class': 'W_pages', 'node-type': 'pageList'})
コード例 #6
0
 
 current_page = decodes.get('page', 1)
 if current_page == 1:
     if is_follow:
         weibo_user.follows = []
     else:
         weibo_user.fans = []
 #
 #
 span = ul.find(attrs={'class': 'addr'})
 #
 #
 for li in ul.find_all(attrs={'class': 'S_line1', 'action-type': 'itemClick'}):
     data = dict([l.split('=') for l in li['action-data'].split('&')])
     
     friend = Friend()
     #friend.uid = data['uid']
     #friend.nickname = data['fnick']
     #friend.sex = True if data['sex'] == u'm' else False
     tempuid = data['uid']
     tempnickname = data['fnick']
     tempsex = True if data['sex'] == u'm' else False
     #
     #location
     #
     locationok = False
     #span = ul.find(str(tempuid)).find(attrs={'class': 'addr'})
     for j in span.stripped_strings:
         templocation = repr(j)
         templocation=eval(templocation)
         #print templocation
コード例 #7
0
ファイル: parsers.py プロジェクト: hewayGitHub/2014-cola
                if is_new_mode:
                    urls.append('http://weibo.com/%s/follow?relate=fans' % self.uid)
                else:
                    urls.append('http://weibo.com/%s/fans' % self.uid)
            return urls, bundles
        
        current_page = decodes.get('page', 1)
        if current_page == 1:
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []
        for li in ul.find_all(attrs={'class': 'S_line2', 'action-type': 'itemClick'}):
            data = dict([l.split('=') for l in li['action-data'].split('&')])
            
            friend = Friend()
            friend.uid = data['uid']
            friend.nickname = data['fnick']
            friend.sex = True if data['sex'] == u'm' else False
            links = li.find('div', attrs={'class': 'info_connect'}).find_all('a')

            if len(links) == 3:
                friend.n_follows = int(links[0].text)
                friend.n_fans = int(links[1].text)
                friend.n_weibos = int(links[2].text)

            #bundles.append(WeiboUserBundle(str(friend.uid)))
            if is_follow:
                bundles.append(WeiboUserBundle(str(friend.uid), level=MAX_LEVEL))
                weibo_user.follows.append(friend)
            else:
コード例 #8
0
    def parse(self, url=None):
        """Parse one follow/fans list page and record the listed friends.

        The user list HTML is taken from the embedded
        ``STK && STK.pageletM && STK.pageletM.view({...})`` script whose
        ``pid`` is ``pl_relation_hisFollow`` or ``pl_relation_hisFans``;
        the latter marks the page as a fans list.  Each list item becomes
        a ``Friend`` appended to ``weibo_user.follows`` or
        ``weibo_user.fans`` and the user is saved afterwards.

        :param url: page to fetch; falls back to ``self.url`` when falsy.
        :returns: ``(urls, bundles)`` on every path.  ``urls`` holds at
            most one next-page URL, ``bundles`` one ``WeiboUserBundle``
            per friend.  Both are empty when the bundle does not exist or
            ``self.check`` rejects the response.
        :raises AttributeError: when no relation pagelet was found in the
            page (``html`` is still ``None`` at the first lookup).
        """
        # Early exits return the same (urls, bundles) shape as the normal
        # exit so callers can always unpack the result.
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        br = self.opener.browse_open(url)
        soup = BeautifulSoup(br.response().read())

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()

        html = None
        is_follow = True
        for script in soup.find_all('script'):
            text = script.text
            if 'STK' not in text:
                continue
            # Drop the JS call wrapper and its closing ")" to leave JSON.
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid in ('pl_relation_hisFollow', 'pl_relation_hisFans'):
                html = BeautifulSoup(data['html'])
            if pid == 'pl_relation_hisFans':
                is_follow = False

        bundles = []
        ul = html.find(attrs={'class': 'cnfList', 'node-type': 'userListBox'})
        if ul is None:
            # Page without a user list: nothing to record or follow.
            return [], bundles
        for li in ul.find_all(attrs={
                'class': 'S_line1',
                'action-type': 'itemClick'
        }):
            # "k=v&k=v" payload: split on the first "=" only and skip
            # fragments without "=" so odd values cannot break dict().
            data = dict(
                pair.split('=', 1)
                for pair in li['action-data'].split('&')
                if '=' in pair)

            friend = Friend()
            friend.uid = data['uid']
            friend.nickname = data['fnick']
            friend.sex = data['sex'] == u'm'

            bundles.append(WeiboUserBundle(str(friend.uid)))
            if is_follow:
                weibo_user.follows.append(friend)
            else:
                weibo_user.fans.append(friend)

        weibo_user.save()

        urls = []
        pages = html.find('div',
                          attrs={
                              'class': 'W_pages',
                              'node-type': 'pageList'
                          })
        if pages is not None:
            links = pages.find_all('a')
            # The last link is the "next page" button only when its class
            # list is exactly ['W_btn_c'].
            if links and links[-1]['class'] == ['W_btn_c']:
                url = links[-1]['href']
                if not url.startswith('http://'):
                    url = urlparse.urljoin('http://weibo.com', url)
                urls.append(url)

        return urls, bundles
コード例 #9
0
ファイル: parsers.py プロジェクト: keyihao/Weibo_Cola
        current_page = decodes.get('page', 1)
        if current_page == 1:
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []

        urls = []

        for li in ul.find_all(attrs={'class': 'S_line1'}):
            data = dict([l.split('=') for l in li['action-data'].split('&')])
            
            location_span = li.find('span',attrs={'class': 'addr'})
            location = location_span.text.strip()
            
            friend = Friend()
            friend.uid = data['uid']
            friend.nickname = data['fnick']
            if data['sex'] == u'm':
                friend.sex = u'男'  
            else:
                friend.sex = u'女'
            friend.location = location

            weibo_map = {
	        u'关注': {'field': 'follow_num'},
                u'粉丝': {'field': 'fans_num'},
                u'微博': {'field': 'weibo_num'},
	    }
            connect_div = li.find('div',attrs={'class': 'connect'})
            connect = connect_div.text.replace('  ',' ').replace('\t','').replace('\n','').strip()