# --- Example 1 (scraped snippet separator; original text: "Ejemplo n.º 1 / 0") ---
                    # NOTE(review): truncated snippet -- begins mid-function
                    # (tail of an early-return branch) and is cut off below.
                    urls.append('http://weibo.com/%s/fans' % self.uid)
            return urls, bundles

        # 'page' comes from the URL query string; defaults to 1 for the first
        # page. NOTE(review): when the param is present this is a *string*,
        # so the `== 1` reset below may not fire -- confirm against callers.
        current_page = decodes.get('page', 1)
        if current_page == 1:
            # First page: start the follow/fans list from scratch.
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []
        for li in ul.find_all(attrs={
                'class': 'S_line1',
                'action-type': 'itemClick'
        }):
            # 'action-data' is a query-string-like attribute: k1=v1&k2=v2.
            data = dict([l.split('=') for l in li['action-data'].split('&')])

            friend = Friend()
            friend.uid = data['uid']
            friend.nickname = data['fnick']
            friend.sex = True if data['sex'] == u'm' else False

            bundles.append(WeiboUserBundle(str(friend.uid)))
            if is_follow:
                weibo_user.follows.append(friend)
            else:
                weibo_user.fans.append(friend)

        weibo_user.save()
        self.logger.debug('parse %s finish' % url)

        urls = []
        # NOTE(review): statement cut off by the snippet boundary.
        pages = html.find('div',
# --- Example 2 (scraped snippet separator; original text: "Ejemplo n.º 2 / 0") ---
class UserFriendParser(WeiboParser):
    """Parser for a weibo.com user's follow/fans list pages.

    ``parse`` is a generator: it yields ``WeiboUserBundle`` objects (one per
    friend found on the page) interleaved with plain URL strings to crawl
    next (the next list page, or the fans list once follows are exhausted).
    """

    def parse(self, url=None):
        """Parse one follow/fans page for ``self.uid``.

        :param url: page URL to fetch; defaults to ``self.url``.
        :raises FetchBannedError: when the page contains none of the expected
            script payloads (i.e. weibo served a ban/captcha page).
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            # Server push-back or network hiccup: cool down for 10 minutes,
            # then retry once. (Previously the code fell through with ``br``
            # undefined, which raised NameError on the next line.)
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)
            br = self.opener.browse_open(url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        html = None
        decodes = urldecode(url)
        is_follow = True          # follow list unless the page says otherwise
        is_new_mode = False       # new-style FM.view page layout seen?
        is_banned = True          # flips to False once any payload is found
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                # New page layout: payload is a JSON argument to FM.view(...).
                if is_banned: is_banned = False
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftHisRelation__') or \
                    domid.startswith('Pl_Official_HisRelation__'):
                    html = beautiful_soup(data['html'])
                if 'relate' in decodes and decodes['relate'] == 'fans':
                    is_follow = False
                is_new_mode = True
            elif 'STK' in text:
                # Old page layout: payload wrapped in STK.pageletM.view(...).
                if is_banned: is_banned = False
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                if data['pid'] == 'pl_relation_hisFollow' or \
                    data['pid'] == 'pl_relation_hisFans':
                    html = beautiful_soup(data['html'])
                if data['pid'] == 'pl_relation_hisFans':
                    is_follow = False

        if is_banned:
            # No recognizable script payload at all: assume we were banned.
            print('休息10分钟!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')

        ul = None
        try:
            # The friend <ul> differs between old ('cnfList') and new
            # ('follow_list') layouts -- try both.
            ul = html.find(attrs={
                'class': 'cnfList',
                'node-type': 'userListBox'
            })
            if ul is None:
                ul = html.find(attrs={
                    'class': 'follow_list',
                    'node-type': 'userListBox'
                })
        except AttributeError as e:
            # ``html`` stayed None: no relation pagelet on this page.
            print('休息10分钟!')
            time.sleep(60 * 10)
            if br.geturl().startswith('http://e.weibo.com'):
                # Enterprise accounts expose no friend list; just stop.
                return
            raise e

        if ul is None:
            # Empty list page; if we were on the follow list, move on to fans.
            if is_follow is True:
                if is_new_mode:
                    yield 'http://weibo.com/%s/follow?relate=fans' % self.uid
                else:
                    yield 'http://weibo.com/%s/fans' % self.uid
            return

        # The 'page' query param arrives as a string -- normalize to int so
        # an explicit page=1 also resets the stored list (matches the int()
        # conversion used for pagination below).
        current_page = int(decodes.get('page', 1))
        if current_page == 1:
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []
        for cls in ('S_line1', 'S_line2'):
            for li in ul.find_all(attrs={
                    'class': cls,
                    'action-type': 'itemClick'
            }):
                # 'action-data' is query-string-like: k1=v1&k2=v2.
                data = dict(
                    [l.split('=') for l in li['action-data'].split('&')])

                friend = Friend()
                friend.uid = data['uid']
                friend.nickname = data['fnick']
                friend.sex = True if data['sex'] == u'm' else False

                yield WeiboUserBundle(str(friend.uid))
                if is_follow:
                    weibo_user.follows.append(friend)
                else:
                    weibo_user.fans.append(friend)

        weibo_user.save()

        # counter add one for the friend url
        counter_type = 'follows' if is_follow else 'fans'
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

        # Pagination container also differs between layouts.
        pages = html.find('div',
                          attrs={
                              'class': 'W_pages',
                              'node-type': 'pageList'
                          })
        if pages is None:
            pages = html.find('div',
                              attrs={
                                  'class': 'WB_cardpage',
                                  'node-type': 'pageList'
                              })
        if pages is not None:
            a = pages.find_all('a')
            if len(a) > 0:
                # The last anchor is the "next" button when more pages exist.
                next_ = a[-1]
                if next_['class'] == ['W_btn_c'] or 'next' in next_['class']:
                    decodes['page'] = int(decodes.get('page', 1)) + 1
                    query_str = urllib.urlencode(decodes)
                    url = '%s?%s' % (url.split('?')[0], query_str)
                    yield url
                    return

        # Last page of the follow list: queue the fans list next.
        if is_follow is True:
            if is_new_mode:
                yield 'http://weibo.com/%s/follow?relate=fans' % self.uid
            else:
                yield 'http://weibo.com/%s/fans' % self.uid
# --- Example 3 (scraped snippet separator; original text: "Ejemplo n.º 3 / 0") ---
    def parse(self, url=None):
        """Parse one follow/fans page and collect results.

        :param url: page URL to fetch; defaults to ``self.url``.
        :returns: ``(urls, bundles)`` -- follow-up page URLs and
            ``WeiboUserBundle`` objects for every friend found, or ``None``
            when the bundle does not exist or the check fails.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        soup = BeautifulSoup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        html = None
        is_follow = True
        for script in soup.find_all('script'):
            text = script.text
            if 'STK' in text:
                # Payload is wrapped in STK.pageletM.view(...); strip the
                # wrapper and parse the JSON argument.
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                if data['pid'] == 'pl_relation_hisFollow' or \
                    data['pid'] == 'pl_relation_hisFans':
                    html = BeautifulSoup(data['html'])
                if data['pid'] == 'pl_relation_hisFans':
                    is_follow = False

        urls = []
        bundles = []
        if html is None:
            # No relation pagelet found on this page -- nothing to parse.
            # (Previously this crashed with AttributeError on html.find.)
            return urls, bundles
        ul = html.find(attrs={'class': 'cnfList', 'node-type': 'userListBox'})
        if ul is None:
            # Friend list container missing -- treat as an empty page.
            return urls, bundles
        for li in ul.find_all(attrs={
                'class': 'S_line1',
                'action-type': 'itemClick'
        }):
            # 'action-data' is query-string-like: k1=v1&k2=v2.
            data = dict([l.split('=') for l in li['action-data'].split('&')])

            friend = Friend()
            friend.uid = data['uid']
            friend.nickname = data['fnick']
            friend.sex = True if data['sex'] == u'm' else False

            bundles.append(WeiboUserBundle(str(friend.uid)))
            if is_follow:
                weibo_user.follows.append(friend)
            else:
                weibo_user.fans.append(friend)

        weibo_user.save()

        pages = html.find('div',
                          attrs={
                              'class': 'W_pages',
                              'node-type': 'pageList'
                          })
        if pages is not None:
            a = pages.find_all('a')
            if len(a) > 0:
                # The last anchor is the "next" button when more pages exist.
                next_ = a[-1]
                if next_['class'] == ['W_btn_c']:
                    url = next_['href']
                    if not url.startswith('http://'):
                        url = urlparse.urljoin('http://weibo.com', url)
                    urls.append(url)

        return urls, bundles