def __init__(self):
    self.charset = 'utf-8'
    self.parser = CommonBlogParser()
    self.downloader = Downloader()
    # parameters used when requesting the blog-list pages
    self.http_params = {
        'is_search': '0',
        'visible': '0',
        'is_tag': '0',
        'profile_ftype': 1,
        'pagebar': '',
        'pre_page': '0',
        'page': 1
    }
    self.uid = ''
        print 'rcc is :' + blogmsg['rcc']
        print 'rpt is :' + blogmsg['rpt']
        print 'rpage is :' + blogmsg['rpage']
        print 'rc  is :' + blogmsg['rc']
        print 'cc  is :' + blogmsg['cc']
        print 'page is :' + blogmsg['page']
        print 'pt  is :' + blogmsg['pt']
        print 'srn is :' + blogmsg['srn']
        print '======================================'


if __name__ == '__main__':
    import sys, os

    sys.path.append(os.path.abspath('../'))
    from toolkit.downloader import Downloader
    from toolkit.accountlib import AccountAssistant

    assistant = AccountAssistant()
    from officeblogparser import OfficeBlogParser

    parser = OfficeBlogParser()
    assistant.init()
    assistant.login()
    url = 'http://weibo.com/p/1002061649159940/weibo?is_tag=0&is_search=0&pre_page=0&profile_ftype=1&visible=0&pagebar=&page=1'
    downloader = Downloader()
    content = downloader.download(url)
    parser.init_user('1649159940')
    blog_list = parser.parse(content)
    #parser.print_blog()
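
# The driver above relies on the project's toolkit.downloader.Downloader. As a
# rough, hypothetical stand-in for trying the same snippet outside the project
# (an assumption, not the project's actual implementation): it only needs to
# expose download(url, params=None), return the response body as a str, and
# send the cookies of a logged-in weibo.com session.
import urllib
import urllib2


class SimpleDownloader(object):
    def __init__(self, cookie=''):
        # raw Cookie header copied from a logged-in browser session
        self.cookie = cookie

    def download(self, url, params=None):
        if params:
            url = url + '?' + urllib.urlencode(params)
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        if self.cookie:
            request.add_header('Cookie', self.cookie)
        try:
            return urllib2.urlopen(request, timeout=30).read()
        except urllib2.URLError:
            return ''
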
Example #5
def __init__(self):
     #cookie="SINAGLOBAL=3670791019162.063.1432519568807; ULV=1434184446636:3:1:1:9776813265987.371.1434184446576:1432539758675; SUHB=0S7S3YyGl7ABmk; YF-Ugrow-G0=169004153682ef91866609488943c77f; SUS=SID-5513307770-1434867321-GZ-fdui1-417cce02c02cba62afb4b09ce64141b5; SUE=es%3D77e325518a1eeaab4d42c04535d022d9%26ev%3Dv1%26es2%3Dda7c170b38a64fa4d9b6668f496fa074%26rs0%3DzdWWsJgKtTVoMTjEP3CWSLj5LpFJ5UF0%252BWyN6Q8Sd35saJbSk7N2YdacjGPXamqnsYetxrZNNIwMVsz0JNGf%252FkJZ%252FIv1Bh9YQHxwFkUE3K1i7kZDBboUO0yOR%252Fz0Ucw37WwoeeAGM28l5q%252FSbHFjWwe%252F3DJSj1ZdRE59Qrdrt%252Fo%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1434867321%26et%3D1434953721%26d%3Dc909%26i%3D41b5%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D17%26st%3D0%26uid%3D5513307770%26name%3Dmkqtx11141ua%2540163.com%26nick%3D%25E6%25AF%2581%25E9%25A6%2599%25E5%258A%2588%25E5%25BC%25B9%26fmp%3D%26lcp%3D2015-03-20%252000%253A59%253A43; YF-V5-G0=d22a701aae075ca04c11f0ef68835839; _s_tentry=login.sina.com.cn; UOR=,,login.sina.com.cn; Apache=9776813265987.371.1434184446576; YF-Page-G0=8fee13afa53da91ff99fc89cc7829b07; WBStore=0d3077cd0cad2262|undefined; SUB=_2A254giYpDeTxGeNL6lES8CnLzDyIHXVb9hDhrDV8PUJbvNBeLRjHkW8xuQX_wA9ncQZsaP1yWBfuXyq9-w..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5iPVj051o7KP9Hly_f8Jud5JpX5K-t; ALF=1466403316; SSOLoginState=1434867321; wvr=6"
     #cookie = "SINAGLOBAL  =  1791602201041.3557.1455610750935 ;ULV  = 1455611177148:2:2:2:7716728692915.958.1455611177145:1455610750940  ;SUBP  =  0033WrSXqPxfM725Ws9jqgMF55529P9D9WWNbPECqqxh2rppyWaDQBvZ5JpX5KMt  ;SUHB  =  0jBJqq9P-KpPgN  ;un = [email protected] ;wvr  = 6    ;SUS  =  SID-1340714021-1455611173-GZ-b2ey8-468e97b8ca4455bc4ba3beddabec7cd6  ;SUE   = es%3D8484201c133ec33b03f1ed14aa4534fa%26ev%3Dv1%26es2%3D33ba64a44d9ac86cf555cf05bc397327%26rs0%3DM3QtGCbcUToEqKLA6eAZVpMrEX7u4bQVwvi5fHwr4DhrFNaB0594dwFDsL2CZMg5fRLrIkFt3zc9Bx10kzDewhd7AbovJSdm8cKV0c4V1VEfND1YM3XwCaiwZgbhwWc6jXLCbykNpryMLWTdianTFmPUmFrF0%252BazZmYEFLfT7ww%253D%26rv%3D0   ;SUP   = cv%3D1%26bt%3D1455611174%26et%3D1455697574%26d%3Dc909%26i%3D7cd6%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D1340714021%26name%3Dguanglingsan1988%2540sina.com%26nick%3Dschumacher%26fmp%3D%26lcp%3D2012-02-02%252019%253A20%253A09    ;SUB   =  _2A257xq12DeRxGedN71IW8SrMyT2IHXVYtZm-rDV8PUNbvtBeLRnGkW9LHet86m9AJ9R6RMhU07ClXHxqCx1S0A..   ;ALF    = 1487147173   ;SSOLoginState   = 1455611174   ;_s_tentry    = login.sina.com.cn   ;UOR   = ,,login.sina.com.cn    ;Apache   =  7716728692915.958.1455611177145 "
     print 'start load cookie'
     #print cookie
     self.downloader = Downloader()  #Downloader(cookie)
Example #6
class UserCrawler(object):
    """
    Fetch a user's personal information.
    """
    def __init__(self):
        #cookie="SINAGLOBAL=3670791019162.063.1432519568807; ULV=1434184446636:3:1:1:9776813265987.371.1434184446576:1432539758675; SUHB=0S7S3YyGl7ABmk; YF-Ugrow-G0=169004153682ef91866609488943c77f; SUS=SID-5513307770-1434867321-GZ-fdui1-417cce02c02cba62afb4b09ce64141b5; SUE=es%3D77e325518a1eeaab4d42c04535d022d9%26ev%3Dv1%26es2%3Dda7c170b38a64fa4d9b6668f496fa074%26rs0%3DzdWWsJgKtTVoMTjEP3CWSLj5LpFJ5UF0%252BWyN6Q8Sd35saJbSk7N2YdacjGPXamqnsYetxrZNNIwMVsz0JNGf%252FkJZ%252FIv1Bh9YQHxwFkUE3K1i7kZDBboUO0yOR%252Fz0Ucw37WwoeeAGM28l5q%252FSbHFjWwe%252F3DJSj1ZdRE59Qrdrt%252Fo%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1434867321%26et%3D1434953721%26d%3Dc909%26i%3D41b5%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D17%26st%3D0%26uid%3D5513307770%26name%3Dmkqtx11141ua%2540163.com%26nick%3D%25E6%25AF%2581%25E9%25A6%2599%25E5%258A%2588%25E5%25BC%25B9%26fmp%3D%26lcp%3D2015-03-20%252000%253A59%253A43; YF-V5-G0=d22a701aae075ca04c11f0ef68835839; _s_tentry=login.sina.com.cn; UOR=,,login.sina.com.cn; Apache=9776813265987.371.1434184446576; YF-Page-G0=8fee13afa53da91ff99fc89cc7829b07; WBStore=0d3077cd0cad2262|undefined; SUB=_2A254giYpDeTxGeNL6lES8CnLzDyIHXVb9hDhrDV8PUJbvNBeLRjHkW8xuQX_wA9ncQZsaP1yWBfuXyq9-w..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5iPVj051o7KP9Hly_f8Jud5JpX5K-t; ALF=1466403316; SSOLoginState=1434867321; wvr=6"
        #cookie = "SINAGLOBAL  =  1791602201041.3557.1455610750935 ;ULV  = 1455611177148:2:2:2:7716728692915.958.1455611177145:1455610750940  ;SUBP  =  0033WrSXqPxfM725Ws9jqgMF55529P9D9WWNbPECqqxh2rppyWaDQBvZ5JpX5KMt  ;SUHB  =  0jBJqq9P-KpPgN  ;un = [email protected] ;wvr  = 6    ;SUS  =  SID-1340714021-1455611173-GZ-b2ey8-468e97b8ca4455bc4ba3beddabec7cd6  ;SUE   = es%3D8484201c133ec33b03f1ed14aa4534fa%26ev%3Dv1%26es2%3D33ba64a44d9ac86cf555cf05bc397327%26rs0%3DM3QtGCbcUToEqKLA6eAZVpMrEX7u4bQVwvi5fHwr4DhrFNaB0594dwFDsL2CZMg5fRLrIkFt3zc9Bx10kzDewhd7AbovJSdm8cKV0c4V1VEfND1YM3XwCaiwZgbhwWc6jXLCbykNpryMLWTdianTFmPUmFrF0%252BazZmYEFLfT7ww%253D%26rv%3D0   ;SUP   = cv%3D1%26bt%3D1455611174%26et%3D1455697574%26d%3Dc909%26i%3D7cd6%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D1340714021%26name%3Dguanglingsan1988%2540sina.com%26nick%3Dschumacher%26fmp%3D%26lcp%3D2012-02-02%252019%253A20%253A09    ;SUB   =  _2A257xq12DeRxGedN71IW8SrMyT2IHXVYtZm-rDV8PUNbvtBeLRnGkW9LHet86m9AJ9R6RMhU07ClXHxqCx1S0A..   ;ALF    = 1487147173   ;SSOLoginState   = 1455611174   ;_s_tentry    = login.sina.com.cn   ;UOR   = ,,login.sina.com.cn    ;Apache   =  7716728692915.958.1455611177145 "
        print 'start load cookie'
        #print cookie
        self.downloader = Downloader()  #Downloader(cookie)

    def _process_html(self, content):
        """
        Pre-process the downloaded page, mainly replacing \/ with /.
        """
        if content:
            return content.replace('\\/', '/')
        return ''
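        # For example, a chunk like '<a href="http:\/\/weibo.com\/u\/123">' in the raw
        # response becomes '<a href="http://weibo.com/u/123">' after the replacement.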

    def get_url(self, upage):
        #return 'http://weibo.com/1340714021/fans?cfs=600&relate=fans&t=1&f=1&type=&Pl_Official_RelationFans__106_page=5#Pl_Official_RelationFans__106'
        #return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3783364643412551&max_id=3783417797719751&page='+str(upage)
        return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3855095853042261&max_id=3856157879315930&page=' + str(
            upage)
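    # For example, get_url(1) returns the hard-coded repost endpoint with page=1:
    # 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3855095853042261&max_id=3856157879315930&page=1'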

    def _init(self, uid):
        """
        Initialise the user-info record.
        """
        self.info = {
            'uid': uid,
            'time': '2014-12-02 16:11:00',
            'info_id': '3741311679142648',
            'source_equip': '皮皮时光机',
            'repost_num': 1880,
            'fui': [],  # first-degree friends
            '2_fui': []  # Nth-degree friends
        }

    def _get_followee(self, uid):
        """
        Fetch user info and repost info.
        """
        followee_list = []
        repost_list = list()
        followee_set = set()
        for i in range(1, 50):  #219
            html_data = self.downloader.download(self.get_url(i))
            html_data = self._process_html(html_data)
            #print html_data
            if html_data is not None:
                try:
                    soup = BeautifulSoup(html_data.decode('utf-8', 'ignore'),
                                         "lxml")
                    #followee_html_list= soup.findAll('div')
                    #print followee_html_list
                    followee_html_list = soup.findAll(
                        'div', attrs={'class': 'list_li S_line1 clearfix'})
                    #print followee_html_list
                    for followee_html in followee_html_list:
                        #print followee_html
                        repost_use_list = list()
                        info_connect = followee_html.find(
                            'div', attrs={'class': 'WB_face W_fl'})
                        #print info_connect
                        if info_connect is None: continue
                        if info_connect.find('img') is None:
                            continue
                        else:
                            follow_id = info_connect.find('img')
                            print 'repost', follow_id['alt']
                            print 'image', follow_id['src']
                            #followee_list.append((info_connect['usercard'][3:],info_connect['alt']))

                        after_data = followee_html.find(
                            'div', attrs={'class': 'list_con'})
                        #print after_data
                        repost_2 = after_data.find('span')
                        if repost_2.find('a') is not None:
                            repost_n_user = repost_2.findAll('a')
                            for repost_user in repost_n_user:
                                print '2repost', repost_user.text
                                if repost_user.text.find('http:') >= 0:
                                    continue
                                repost_use_list.append(repost_user.text)
                            #repost_user=repost_2.find('a').text
                            if len(repost_use_list) > 0:
                                repost_list.append((follow_id['usercard'][3:],
                                                    repost_use_list))
                        time_user = after_data.find(
                            'div', attrs={'class': 'WB_from S_txt2'})
                        time_list = time_user.find('a')['title']
                        followee_list.append(
                            (follow_id['usercard'][3:], follow_id['alt'],
                             follow_id['src'], time_list))
                except Exception, e:
                    logging.exception("获取好友列表异常:" + uid + str(e))
        self.info['fui'] = followee_list
        self.info['2_fui'] = repost_list
        print self.info
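
# A minimal usage sketch for the UserCrawler above (assumptions: Downloader,
# BeautifulSoup and logging are importable as in the rest of this listing, the
# session behind Downloader is already logged in, and the uid below is purely
# illustrative; the repost id/max_id actually crawled are hard-coded in get_url()).
if __name__ == '__main__':
    crawler = UserCrawler()
    crawler._init('1340714021')
    crawler._get_followee('1340714021')
    # info now holds 'fui' (first-degree repost list) and '2_fui' (Nth-degree repost list)
    print len(crawler.info['fui']), len(crawler.info['2_fui'])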
Example #7
    def __init__(self):
	cookie="SINAGLOBAL=3670791019162.063.1432519568807; ULV=1434184446636:3:1:1:9776813265987.371.1434184446576:1432539758675; SUHB=0S7S3YyGl7ABmk; YF-Ugrow-G0=169004153682ef91866609488943c77f; SUS=SID-5513307770-1434867321-GZ-fdui1-417cce02c02cba62afb4b09ce64141b5; SUE=es%3D77e325518a1eeaab4d42c04535d022d9%26ev%3Dv1%26es2%3Dda7c170b38a64fa4d9b6668f496fa074%26rs0%3DzdWWsJgKtTVoMTjEP3CWSLj5LpFJ5UF0%252BWyN6Q8Sd35saJbSk7N2YdacjGPXamqnsYetxrZNNIwMVsz0JNGf%252FkJZ%252FIv1Bh9YQHxwFkUE3K1i7kZDBboUO0yOR%252Fz0Ucw37WwoeeAGM28l5q%252FSbHFjWwe%252F3DJSj1ZdRE59Qrdrt%252Fo%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1434867321%26et%3D1434953721%26d%3Dc909%26i%3D41b5%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D17%26st%3D0%26uid%3D5513307770%26name%3Dmkqtx11141ua%2540163.com%26nick%3D%25E6%25AF%2581%25E9%25A6%2599%25E5%258A%2588%25E5%25BC%25B9%26fmp%3D%26lcp%3D2015-03-20%252000%253A59%253A43; YF-V5-G0=d22a701aae075ca04c11f0ef68835839; _s_tentry=login.sina.com.cn; UOR=,,login.sina.com.cn; Apache=9776813265987.371.1434184446576; YF-Page-G0=8fee13afa53da91ff99fc89cc7829b07; WBStore=0d3077cd0cad2262|undefined; SUB=_2A254giYpDeTxGeNL6lES8CnLzDyIHXVb9hDhrDV8PUJbvNBeLRjHkW8xuQX_wA9ncQZsaP1yWBfuXyq9-w..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5iPVj051o7KP9Hly_f8Jud5JpX5K-t; ALF=1466403316; SSOLoginState=1434867321; wvr=6"
        print 'start load cookie'
        print cookie
        self.downloader = Downloader(cookie)
Example #8
class UserCrawler(object):
    """
    Fetch a user's personal information.
    """

    def __init__(self):
	cookie="SINAGLOBAL=3670791019162.063.1432519568807; ULV=1434184446636:3:1:1:9776813265987.371.1434184446576:1432539758675; SUHB=0S7S3YyGl7ABmk; YF-Ugrow-G0=169004153682ef91866609488943c77f; SUS=SID-5513307770-1434867321-GZ-fdui1-417cce02c02cba62afb4b09ce64141b5; SUE=es%3D77e325518a1eeaab4d42c04535d022d9%26ev%3Dv1%26es2%3Dda7c170b38a64fa4d9b6668f496fa074%26rs0%3DzdWWsJgKtTVoMTjEP3CWSLj5LpFJ5UF0%252BWyN6Q8Sd35saJbSk7N2YdacjGPXamqnsYetxrZNNIwMVsz0JNGf%252FkJZ%252FIv1Bh9YQHxwFkUE3K1i7kZDBboUO0yOR%252Fz0Ucw37WwoeeAGM28l5q%252FSbHFjWwe%252F3DJSj1ZdRE59Qrdrt%252Fo%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1434867321%26et%3D1434953721%26d%3Dc909%26i%3D41b5%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D17%26st%3D0%26uid%3D5513307770%26name%3Dmkqtx11141ua%2540163.com%26nick%3D%25E6%25AF%2581%25E9%25A6%2599%25E5%258A%2588%25E5%25BC%25B9%26fmp%3D%26lcp%3D2015-03-20%252000%253A59%253A43; YF-V5-G0=d22a701aae075ca04c11f0ef68835839; _s_tentry=login.sina.com.cn; UOR=,,login.sina.com.cn; Apache=9776813265987.371.1434184446576; YF-Page-G0=8fee13afa53da91ff99fc89cc7829b07; WBStore=0d3077cd0cad2262|undefined; SUB=_2A254giYpDeTxGeNL6lES8CnLzDyIHXVb9hDhrDV8PUJbvNBeLRjHkW8xuQX_wA9ncQZsaP1yWBfuXyq9-w..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5iPVj051o7KP9Hly_f8Jud5JpX5K-t; ALF=1466403316; SSOLoginState=1434867321; wvr=6"
        print 'start load cookie'
        print cookie
        self.downloader = Downloader(cookie)

    def _process_html(self, content):
        """
        Pre-process the downloaded page, mainly replacing \/ with /.
        """
        if content:
            return content.replace('\\/', '/')
        return ''

    def get_url(self, upage):
        #return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3783364643412551&max_id=3783417797719751&page='+str(upage)
        return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3855095853042261&max_id=3856157879315930&page=' + str(upage)

    def _init(self, uid):
        """
        Initialise the user-info record.
        """
        self.info = {
            'uid': uid,
            'time': '2015-08-18 16:11:00',
            'info_id': '3741311679142648',
            'source_equip': '皮皮时光机',
            'repost_num': 1880,
            'fui': [],  # first-degree friends
            '2_fui': []  # Nth-degree friends
        }

    def _get_followee(self, uid):
        """
        获取user info repost info
        """
        followee_list = []
        repost_list=list()
        followee_set = set()
        for i in range(1,219):
            html_data = self.downloader.download(self.get_url(i))
            html_data = self._process_html(html_data)
	    #print html_data
            if html_data is not None:
                    try:
                        soup = BeautifulSoup(html_data.decode('utf-8', 'ignore'))
                        #followee_html_list= soup.findAll('div')
			#print followee_html_list
                        followee_html_list=soup.findAll('div',attrs={'class':'list_li S_line1 clearfix'})
			#print followee_html_list
                        for followee_html in followee_html_list:
                            #print followee_html
                            repost_use_list=list()
                            info_connect=followee_html.find('div',attrs={'class':'WB_face W_fl'})
			    #print info_connect
                            if info_connect is None:continue
                            if info_connect.find('img') is None:
                                continue
                            else:                              
                                follow_id=info_connect.find('img')                                
		     	   	print 'repost',follow_id['alt']     # username
                                print 'image',follow_id['src']      # image url
                                #followee_list.append((info_connect['usercard'][3:],info_connect['alt']))
			    
                            after_data=followee_html.find('div',attrs={'class':'list_con'})
			    #print after_data
                            repost_2=after_data.find('span')
                            if repost_2.find('a') is not None:
                                repost_n_user=repost_2.findAll('a')
                                for repost_user in repost_n_user:
                                    print '2repost',repost_user.text
                                    if repost_user.text.find('http:')>=0:
                                        continue
                                    repost_use_list.append(repost_user.text)
                                #repost_user=repost_2.find('a').text
                                if len(repost_use_list)>0:
                                    repost_list.append((follow_id['usercard'][3:],repost_use_list))
                            time_user=after_data.find('div',attrs={'class':'WB_from S_txt2'})
                            time_list=time_user.find('a')['title']
                            followee_list.append((follow_id['usercard'][3:],follow_id['alt'],follow_id['src'],time_list))
                    except Exception, e:
                        logging.exception("获取好友列表异常:" + uid + str(e))
        self.info['fui'] = followee_list
        self.info['2_fui']=repost_list
        print self.info
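
# Like the other fragments in this listing, the class below is not self-contained:
# it relies on imports that are not shown here (os, time, BeautifulSoup, the
# project's CommonBlogParser / Downloader / dt / config helpers, and the
# UserNotFoundError / PreprocessError exceptions).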
class BlogCrawler(object):
    def __init__(self):
        self.charset = 'utf-8'
        self.parser = CommonBlogParser()
        self.downloader = Downloader()
        # parameters used when requesting the blog-list pages
        self.http_params = {
            'is_search': '0',
            'visible': '0',
            'is_tag': '0',
            'profile_ftype': 1,
            'pagebar': '',
            'pre_page': '0',
            'page': 1
        }
        self.uid = ''

    # =========  Preparation before parsing the user's weibo data  ========#
    def _init_(self, url):
        """
        Preparation before parsing the user's weibo data, including:
        1. fetching the current user's page_id
        2. fetching the total number of weibo pages for the current user
        """
        http_params = {
            '__rnd': '',
            '_k': '',
            '_t': '0',
            'count': '15',
            'end_id': '',
            'max_id': '',
            'page': 1,
            'pagebar': '',
            'pre_page': '0',
            'profile_ftype': '1',
            'uid': ''
        }
        content = self.downloader.download(url)
        # check whether the user exists
        if not self.exist(content):
            raise UserNotFoundError(url)
        # extract the user ID
        btag = "$CONFIG['oid']='"
        etag = "';"
        bpos = content.find(btag) + len(btag)
        epos = content.find(etag, bpos)
        uid = content[bpos:epos]
        self.uid = uid
        # extract the page_id
        self.page_id = self._parse_pageid(content)
        # total number of weibo pages
        self.pagenum = self._caculate_pagenum(content)
        # extract pid, the domain parameter needed for crawling the weibo pages
        self.pid = self._parse_pid(content)
        # extract the user's avatar URL and nickname
        img_url, nick_name = self._parse_userinfo(content)
        self.parser.init_user(self.uid, img_url, nick_name)
        self.url = self.get_url()
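        # The uid / page_id / pid extracted above come from inline JavaScript on the
        # profile page; the HTML is assumed to contain fragments of the form
        #     $CONFIG['oid']='1649159940';
        #     $CONFIG['page_id']='1002061649159940';
        # and the find()/slice pairs simply take the text between the quotes.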

    def exist(self, content):
        """
        Check whether the current user exists.
        ------------------------------
        return: True if the user exists, otherwise False
        """
        if content.find('<title>错误提示') != -1:
            return False
        return True

    def _parse_pageid(self, content):
        """
        Parse the page_id out of the page.
        ----------------------------------
        content: the web page text to parse
        ----------------------------------
        return: page_id, or an empty string
        """
        btag = "$CONFIG['page_id']='"
        etag = "'"
        page_id = ''
        if content:
            bpos = content.find(btag)
            if bpos:
                bpos += len(btag)
                epos = content.find(etag, bpos)
                page_id = content[bpos:epos]
        return page_id

    def _caculate_pagenum(self, content):
        """
        Compute the total number of weibo pages.
        ------------------------------
        return: number of weibo pages
        """
        msgcount = self._parse_msgcount(content)
        per_pagenum = 45
        total_pagenum = msgcount / per_pagenum
        if msgcount % per_pagenum:
            total_pagenum += 1
        return total_pagenum
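        # Worked example: with per_pagenum fixed at 45, a user with 1234 posts gives
        # 1234 / 45 == 27 full pages plus a remainder, so the method returns 28.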

    def _parse_msgcount(self, content):
        """
        Parse the number of weibo posts.
        ---------------------------
        content: web page text
        ---------------------------
        return: number of weibo posts
        """
        if not content:
            raise PreprocessError(self.uid)
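        # The tags below locate the "微博" (weibo) post counter on the profile page.
        # The raw response is assumed to contain markup shaped roughly like
        #     <strong ...>1234<\/strong><span ...>微博<\/span>
        # (closing tags escaped as <\/...>), and the post count is the text between
        # the last '>' before <\/strong> and <\/strong> itself.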
        etag1 = '>微博<\/span>'
        etag2 = '<\/strong>'
        btag = '>'
        epos = content.find(etag1)
        epos = content[:epos].rfind(etag2)
        bpos = content[:epos].rfind(btag) + len(btag)
        return int(content[bpos:epos])

    def _parse_userinfo(self, content):
        """
        Parse the user's avatar URL and nickname.
        -----------------------------
        content: web page text
        ------------------------------
        return: (img_url, nick_name)
        """
        btag = '<div class="pf_photo"'
        etag = '<\/div>'
        bpos = content.find(btag)
        epos = content.find(etag, bpos)
        soup = BeautifulSoup(content[bpos:epos].replace('\\/', '/') + '</div>')
        img_url = soup.img['src']
        nick_name = soup.img['alt']
        return img_url, nick_name

    #========   Parse the user's weibo data and save the results   =======#
    def scratch(self, url, start_pageindex=1):
        """
        Fetch all of the given user's weibo posts that satisfy the conditions and write them to a file.
        ----------------------------------------
        url: profile URL of the user whose weibo data is to be crawled
        start_pageindex: page number from which to start crawling the user's weibo data
        """
        self._init_(url)
        from controller import Controller
        # print Controller._get_filepath(self.uid)
        if os.path.isfile(Controller._get_filepath(self.uid)):  # the user's weibo posts have already been downloaded
            print self.uid, 'weibo posts for this user have already been downloaded!'
            return None
        if start_pageindex > self.pagenum:
            return []
        #return self._binary_scratch(uid, start_pageindex)
        return self._sequence_scratch(self.uid, start_pageindex, self.pagenum)

    def _binary_scratch(self, uid, start_pageindex):
        """
        Split crawling strategy: start from a point past the middle of the page range.
        Currently the crawl starts at one third of the total page count.
        ----------------------------------------------
        start_pageindex: starting page number
        ---------------------------------------------
        return: blogs
        """
        mid_pageindex = max((start_pageindex + self.pagenum) / 3, 1)
        # crawl weibo pages from front to back
        blogs1 = self._sequence_scratch(uid, mid_pageindex, self.pagenum, 1)
        # crawl weibo pages from back to front
        if mid_pageindex > start_pageindex:
            blogs2 = self._sequence_scratch(uid, mid_pageindex - 1,
                                            start_pageindex, -1)
            blogs1.extend(blogs2)
        return blogs1
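        # Worked example: with start_pageindex=1 and self.pagenum=30,
        # mid_pageindex = max((1 + 30) / 3, 1) = 10 (integer division), so pages
        # 10..30 are crawled forward first and pages 9..1 are crawled backward.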

    def _sequence_scratch(self,
                          uid,
                          start_pageindex,
                          end_pageindex,
                          direction=1):
        """
        Sequential crawling strategy: crawl pages in order, either front to back or back to front.
        ---------------------------------------------------
        uid: ID of the user to crawl
        start_pageindex: starting page number
        end_pageindex: ending page number
        direction: crawl direction, 1 -> front to back (pageindex increases); -1 -> back to front (pageindex decreases)
        ---------------------------------------------------
        return: blogs
        """
        blogs = []
        for pageindex in range(start_pageindex, end_pageindex + direction,
                               direction):
            temp_blogs = self._parse_blogs(pageindex)
            print uid + ': fetched weibo page ' + str(pageindex) + ' successfully.'
            blogs.extend(temp_blogs)
            time.sleep(1)
            if not self._continue(temp_blogs, direction):
                break
        return blogs

    def _parse_blogs(self, pageindex):
        """
        Fetch the weibo content of the three sub-pages of the given weibo page.
        -----------------------------------
        return: list of weibo posts on that page
        """
        blogs = []
        self.http_params['page'] = pageindex
        self.http_params['id'] = self.page_id
        self.http_params['domain'] = self.pid
        # download the first sub-page
        self.http_params['pre_page'] = self.http_params['page'] - 1
        content = self.downloader.download(self.url, self.http_params)
        if content:
            sub_blogs = self.parser.parse(content)
            blogs.extend(sub_blogs)
        if not self._continue(blogs):
            return blogs
        # download the second sub-page
        self.http_params['count'] = '15'
        self.http_params['pagebar'] = '0'
        self.http_params['pre_page'] = self.http_params['page']
        content = self.downloader.download(self.url, self.http_params)
        if content:
            sub_blogs = self.parser.parse(content)
            blogs.extend(sub_blogs)
            if not self._continue(sub_blogs):
                return blogs
        # download the third sub-page
        self.http_params['count'] = '15'
        self.http_params['pagebar'] = '1'
        self.http_params['pre_page'] = self.http_params['page']
        content = self.downloader.download(self.url, self.http_params)
        if content:
            sub_blogs = self.parser.parse(content)
            blogs.extend(sub_blogs)
        return blogs

    def _continue(self, blogs, direction=1):
        """
        Decide whether the download task should continue.
        -----------------------------
        blogs: list of blogs to check, ordered by post time
        direction: direction of the check, 1 -> check whether the last weibo is earlier than the start time;
            -1 -> check whether the first weibo is later than the end time
        ------------------------------
        return: True to continue, otherwise False
        """
        is_continue = True
        if blogs:
            if (direction == -1 and dt.compare(blogs[0]['pt'], config.end_time) > 0) or \
                    (direction == 1 and dt.compare(blogs[-1]['pt'], config.begin_time) < 0):
                is_continue = False
        return is_continue

    def get_url(self):
        """
        Return the URL of the page to download.
        """
        # url = 'http://weibo.com/p/' + self.page_id + '/weibo'
        url = 'http://weibo.com/p/aj/v6/mblog/mbloglist'
        return url

    def _parse_pid(self, content):
        btag = "$CONFIG['pid']='"
        etag = "'"
        pid = ''
        if content:
            bpos = content.find(btag)
            if bpos:
                bpos += len(btag)
                epos = content.find(etag, bpos)
                pid = content[bpos:epos]
        return pid
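
# A minimal sketch of how BlogCrawler.scratch might be driven (assumptions: the
# surrounding project supplies CommonBlogParser, Downloader, the controller and
# config modules, and an authenticated session; the profile URL below is the one
# used by the test driver earlier in this listing).
if __name__ == '__main__':
    crawler = BlogCrawler()
    blogs = crawler.scratch('http://weibo.com/p/1002061649159940/weibo')
    if blogs:
        print 'fetched %d blogs for user %s' % (len(blogs), crawler.uid)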