        # Debug output: dump the fields of one parsed blog entry
        print 'rcc is :' + blogmsg['rcc']
        print 'rpt is :' + blogmsg['rpt']
        print 'rpage is :' + blogmsg['rpage']
        print 'rc is :' + blogmsg['rc']
        print 'cc is :' + blogmsg['cc']
        print 'page is :' + blogmsg['page']
        print 'pt is :' + blogmsg['pt']
        print 'srn is :' + blogmsg['srn']
        print '======================================'


if __name__ == '__main__':
    import sys, os
    sys.path.append(os.path.abspath('../'))

    from toolkit.downloader import Downloader
    from toolkit.accountlib import AccountAssistant
    from officeblogparser import OfficeBlogParser

    # Log in first, then download one page of the user's weibo list and parse it
    assistant = AccountAssistant()
    parser = OfficeBlogParser()
    assistant.init()
    assistant.login()

    url = 'http://weibo.com/p/1002061649159940/weibo?is_tag=0&is_search=0&pre_page=0&profile_ftype=1&visible=0&pagebar=&page=1'
    downloader = Downloader()
    content = downloader.download(url)
    parser.init_user('1649159940')
    blog_list = parser.parse(content)
    #parser.print_blog()
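# For reference: a hand-written sketch of one entry produced by the parser, as
# suggested by the fields printed in the debug output above. Only the key names
# come from the code; the value formats and meanings below are assumptions.
example_blogmsg = {
    'pt': '2015-08-18 16:11',    # post time (elsewhere compared against config.begin_time / end_time)
    'cc': '12',                  # comment count (assumed)
    'rc': '34',                  # repost count (assumed)
    'rcc': '5',                  # comment count of the reposted original blog (assumed)
    'rpt': '2015-08-17 09:30',   # post time of the reposted original blog (assumed)
    'rpage': '1',                # page the reposted blog was found on (assumed)
    'page': '1',                 # page index this entry was parsed from (assumed)
    'srn': 'example_user',       # screen/nick name of the author (assumed)
}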
class UserCrawler(object):
    """
    Fetch a user's personal information.
    """

    def __init__(self):
        #cookie="SINAGLOBAL=3670791019162.063.1432519568807; ULV=1434184446636:3:1:1:9776813265987.371.1434184446576:1432539758675; SUHB=0S7S3YyGl7ABmk; YF-Ugrow-G0=169004153682ef91866609488943c77f; SUS=SID-5513307770-1434867321-GZ-fdui1-417cce02c02cba62afb4b09ce64141b5; SUE=es%3D77e325518a1eeaab4d42c04535d022d9%26ev%3Dv1%26es2%3Dda7c170b38a64fa4d9b6668f496fa074%26rs0%3DzdWWsJgKtTVoMTjEP3CWSLj5LpFJ5UF0%252BWyN6Q8Sd35saJbSk7N2YdacjGPXamqnsYetxrZNNIwMVsz0JNGf%252FkJZ%252FIv1Bh9YQHxwFkUE3K1i7kZDBboUO0yOR%252Fz0Ucw37WwoeeAGM28l5q%252FSbHFjWwe%252F3DJSj1ZdRE59Qrdrt%252Fo%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1434867321%26et%3D1434953721%26d%3Dc909%26i%3D41b5%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D17%26st%3D0%26uid%3D5513307770%26name%3Dmkqtx11141ua%2540163.com%26nick%3D%25E6%25AF%2581%25E9%25A6%2599%25E5%258A%2588%25E5%25BC%25B9%26fmp%3D%26lcp%3D2015-03-20%252000%253A59%253A43; YF-V5-G0=d22a701aae075ca04c11f0ef68835839; _s_tentry=login.sina.com.cn; UOR=,,login.sina.com.cn; Apache=9776813265987.371.1434184446576; YF-Page-G0=8fee13afa53da91ff99fc89cc7829b07; WBStore=0d3077cd0cad2262|undefined; SUB=_2A254giYpDeTxGeNL6lES8CnLzDyIHXVb9hDhrDV8PUJbvNBeLRjHkW8xuQX_wA9ncQZsaP1yWBfuXyq9-w..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5iPVj051o7KP9Hly_f8Jud5JpX5K-t; ALF=1466403316; SSOLoginState=1434867321; wvr=6"
        #cookie = "SINAGLOBAL = 1791602201041.3557.1455610750935 ;ULV = 1455611177148:2:2:2:7716728692915.958.1455611177145:1455610750940 ;SUBP = 0033WrSXqPxfM725Ws9jqgMF55529P9D9WWNbPECqqxh2rppyWaDQBvZ5JpX5KMt ;SUHB = 0jBJqq9P-KpPgN ;un = [email protected] ;wvr = 6 ;SUS = SID-1340714021-1455611173-GZ-b2ey8-468e97b8ca4455bc4ba3beddabec7cd6 ;SUE = es%3D8484201c133ec33b03f1ed14aa4534fa%26ev%3Dv1%26es2%3D33ba64a44d9ac86cf555cf05bc397327%26rs0%3DM3QtGCbcUToEqKLA6eAZVpMrEX7u4bQVwvi5fHwr4DhrFNaB0594dwFDsL2CZMg5fRLrIkFt3zc9Bx10kzDewhd7AbovJSdm8cKV0c4V1VEfND1YM3XwCaiwZgbhwWc6jXLCbykNpryMLWTdianTFmPUmFrF0%252BazZmYEFLfT7ww%253D%26rv%3D0 ;SUP = cv%3D1%26bt%3D1455611174%26et%3D1455697574%26d%3Dc909%26i%3D7cd6%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D1340714021%26name%3Dguanglingsan1988%2540sina.com%26nick%3Dschumacher%26fmp%3D%26lcp%3D2012-02-02%252019%253A20%253A09 ;SUB = _2A257xq12DeRxGedN71IW8SrMyT2IHXVYtZm-rDV8PUNbvtBeLRnGkW9LHet86m9AJ9R6RMhU07ClXHxqCx1S0A.. ;ALF = 1487147173 ;SSOLoginState = 1455611174 ;_s_tentry = login.sina.com.cn ;UOR = ,,login.sina.com.cn ;Apache = 7716728692915.958.1455611177145 "
        print 'start load cookie'
        #print cookie
        self.downloader = Downloader()  #Downloader(cookie)

    def _process_html(self, content):
        """
        Preprocess the downloaded page, mainly replacing \/ with /.
        """
        if content:
            return content.replace('\\/', '/')
        return ''

    def get_url(self, upage):
        #return 'http://weibo.com/1340714021/fans?cfs=600&relate=fans&t=1&f=1&type=&Pl_Official_RelationFans__106_page=5#Pl_Official_RelationFans__106'
        #return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3783364643412551&max_id=3783417797719751&page='+str(upage)
        return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3855095853042261&max_id=3856157879315930&page=' + str(upage)

    def _init(self, uid):
        """
        Initialize the user info record.
        """
        self.info = {
            'uid': uid,
            'time': '2014-12-02 16:11:00',
            'info_id': '3741311679142648',
            'source_equip': '皮皮时光机',
            'repost_num': 1880,
            'fui': [],    # first-degree friends
            '2_fui': []   # Nth-degree friends
        }

    def _get_followee(self, uid):
        """
        Fetch user info and repost info.
        """
        followee_list = []
        repost_list = list()
        followee_set = set()
        for i in range(1, 50):  #219
            html_data = self.downloader.download(self.get_url(i))
            html_data = self._process_html(html_data)
            #print html_data
            if html_data is not None:
                try:
                    soup = BeautifulSoup(html_data.decode('utf-8', 'ignore'), "lxml")
                    #followee_html_list = soup.findAll('div')
                    #print followee_html_list
                    followee_html_list = soup.findAll(
                        'div', attrs={'class': 'list_li S_line1 clearfix'})
                    #print followee_html_list
                    for followee_html in followee_html_list:
                        #print followee_html
                        repost_use_list = list()
                        info_connect = followee_html.find(
                            'div', attrs={'class': 'WB_face W_fl'})
                        #print info_connect
                        if info_connect is None:
                            continue
                        if info_connect.find('img') is None:
                            continue
                        else:
                            follow_id = info_connect.find('img')
                            print 'repost', follow_id['alt']   # username
                            print 'image', follow_id['src']    # image url
                            #followee_list.append((info_connect['usercard'][3:], info_connect['alt']))
                        after_data = followee_html.find(
                            'div', attrs={'class': 'list_con'})
                        #print after_data
                        repost_2 = after_data.find('span')
                        if repost_2.find('a') is not None:
                            repost_n_user = repost_2.findAll('a')
                            for repost_user in repost_n_user:
                                print '2repost', repost_user.text
                                if repost_user.text.find('http:') >= 0:
                                    continue
                                repost_use_list.append(repost_user.text)
                            #repost_user = repost_2.find('a').text
                        if len(repost_use_list) > 0:
                            repost_list.append((follow_id['usercard'][3:], repost_use_list))
                        time_user = after_data.find(
                            'div', attrs={'class': 'WB_from S_txt2'})
                        time_list = time_user.find('a')['title']
                        followee_list.append(
                            (follow_id['usercard'][3:], follow_id['alt'],
                             follow_id['src'], time_list))
                except Exception, e:
                    logging.exception("Exception while fetching followee list: " + uid + str(e))
        self.info['fui'] = followee_list
        self.info['2_fui'] = repost_list
        print self.info
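# For reference: a minimal, hand-written sketch of the repost-list markup that
# _get_followee above assumes. The real weibo.com markup is richer; this sample
# keeps only the attributes the parser actually reads, and all values are made up.
if __name__ == '__main__':
    from bs4 import BeautifulSoup

    sample = '''
    <div class="list_li S_line1 clearfix">
      <div class="WB_face W_fl">
        <img usercard="id=1234567890" alt="example_user" src="http://example.com/avatar.jpg"/>
      </div>
      <div class="list_con">
        <span>//<a>another_user</a>: forwarded text</span>
        <div class="WB_from S_txt2"><a title="2016-02-16 16:11">16:11</a></div>
      </div>
    </div>
    '''
    soup = BeautifulSoup(sample, 'lxml')
    item = soup.find('div', attrs={'class': 'list_li S_line1 clearfix'})
    img = item.find('div', attrs={'class': 'WB_face W_fl'}).find('img')
    print img['usercard'][3:], img['alt'], img['src']    # uid, nickname, avatar url
    print item.find('div', attrs={'class': 'WB_from S_txt2'}).find('a')['title']    # repost time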
class UserCrawler(object):
    """
    Fetch a user's personal information.
    """

    def __init__(self):
        cookie = "SINAGLOBAL=3670791019162.063.1432519568807; ULV=1434184446636:3:1:1:9776813265987.371.1434184446576:1432539758675; SUHB=0S7S3YyGl7ABmk; YF-Ugrow-G0=169004153682ef91866609488943c77f; SUS=SID-5513307770-1434867321-GZ-fdui1-417cce02c02cba62afb4b09ce64141b5; SUE=es%3D77e325518a1eeaab4d42c04535d022d9%26ev%3Dv1%26es2%3Dda7c170b38a64fa4d9b6668f496fa074%26rs0%3DzdWWsJgKtTVoMTjEP3CWSLj5LpFJ5UF0%252BWyN6Q8Sd35saJbSk7N2YdacjGPXamqnsYetxrZNNIwMVsz0JNGf%252FkJZ%252FIv1Bh9YQHxwFkUE3K1i7kZDBboUO0yOR%252Fz0Ucw37WwoeeAGM28l5q%252FSbHFjWwe%252F3DJSj1ZdRE59Qrdrt%252Fo%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1434867321%26et%3D1434953721%26d%3Dc909%26i%3D41b5%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D17%26st%3D0%26uid%3D5513307770%26name%3Dmkqtx11141ua%2540163.com%26nick%3D%25E6%25AF%2581%25E9%25A6%2599%25E5%258A%2588%25E5%25BC%25B9%26fmp%3D%26lcp%3D2015-03-20%252000%253A59%253A43; YF-V5-G0=d22a701aae075ca04c11f0ef68835839; _s_tentry=login.sina.com.cn; UOR=,,login.sina.com.cn; Apache=9776813265987.371.1434184446576; YF-Page-G0=8fee13afa53da91ff99fc89cc7829b07; WBStore=0d3077cd0cad2262|undefined; SUB=_2A254giYpDeTxGeNL6lES8CnLzDyIHXVb9hDhrDV8PUJbvNBeLRjHkW8xuQX_wA9ncQZsaP1yWBfuXyq9-w..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5iPVj051o7KP9Hly_f8Jud5JpX5K-t; ALF=1466403316; SSOLoginState=1434867321; wvr=6"
        print 'start load cookie'
        print cookie
        self.downloader = Downloader(cookie)

    def _process_html(self, content):
        """
        Preprocess the downloaded page, mainly replacing \/ with /.
        """
        if content:
            return content.replace('\\/', '/')
        return ''

    def get_url(self, upage):
        #return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3783364643412551&max_id=3783417797719751&page='+str(upage)
        return 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3855095853042261&max_id=3856157879315930&page=' + str(upage)

    def _init(self, uid):
        """
        Initialize the user info record.
        """
        self.info = {
            'uid': uid,
            'time': '2015-08-18 16:11:00',
            'info_id': '3741311679142648',
            'source_equip': '皮皮时光机',
            'repost_num': 1880,
            'fui': [],    # first-degree friends
            '2_fui': []   # Nth-degree friends
        }

    def _get_followee(self, uid):
        """
        Fetch user info and repost info.
        """
        followee_list = []
        repost_list = list()
        followee_set = set()
        for i in range(1, 219):
            html_data = self.downloader.download(self.get_url(i))
            html_data = self._process_html(html_data)
            #print html_data
            if html_data is not None:
                try:
                    soup = BeautifulSoup(html_data.decode('utf-8', 'ignore'))
                    #followee_html_list = soup.findAll('div')
                    #print followee_html_list
                    followee_html_list = soup.findAll(
                        'div', attrs={'class': 'list_li S_line1 clearfix'})
                    #print followee_html_list
                    for followee_html in followee_html_list:
                        #print followee_html
                        repost_use_list = list()
                        info_connect = followee_html.find(
                            'div', attrs={'class': 'WB_face W_fl'})
                        #print info_connect
                        if info_connect is None:
                            continue
                        if info_connect.find('img') is None:
                            continue
                        else:
                            follow_id = info_connect.find('img')
                            print 'repost', follow_id['alt']   # username
                            print 'image', follow_id['src']    # image url
                            #followee_list.append((info_connect['usercard'][3:], info_connect['alt']))
                        after_data = followee_html.find(
                            'div', attrs={'class': 'list_con'})
                        #print after_data
                        repost_2 = after_data.find('span')
                        if repost_2.find('a') is not None:
                            repost_n_user = repost_2.findAll('a')
                            for repost_user in repost_n_user:
                                print '2repost', repost_user.text
                                if repost_user.text.find('http:') >= 0:
                                    continue
                                repost_use_list.append(repost_user.text)
                            #repost_user = repost_2.find('a').text
                        if len(repost_use_list) > 0:
                            repost_list.append((follow_id['usercard'][3:], repost_use_list))
                        time_user = after_data.find(
                            'div', attrs={'class': 'WB_from S_txt2'})
                        time_list = time_user.find('a')['title']
                        followee_list.append(
                            (follow_id['usercard'][3:], follow_id['alt'],
                             follow_id['src'], time_list))
                except Exception, e:
                    logging.exception("Exception while fetching followee list: " + uid + str(e))
        self.info['fui'] = followee_list
        self.info['2_fui'] = repost_list
        print self.info
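# A minimal driving sketch for UserCrawler above (assumed usage, not part of the
# original module): build the info record for one uid, then walk the repost pages
# to fill in the first-degree ('fui') and Nth-degree ('2_fui') friend lists.
if __name__ == '__main__':
    uid = '1340714021'        # placeholder uid taken from the commented-out fans url
    crawler = UserCrawler()   # needs a cookie/session that is still valid
    crawler._init(uid)
    crawler._get_followee(uid)
    print 'followees:', len(crawler.info['fui']), 'repost chains:', len(crawler.info['2_fui'])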
class BlogCrawler(object):

    def __init__(self):
        self.charset = 'utf-8'
        self.parser = CommonBlogParser()
        self.downloader = Downloader()
        # Parameters for loading the weibo list url
        self.http_params = {
            'is_search': '0',
            'visible': '0',
            'is_tag': '0',
            'profile_ftype': 1,
            'pagebar': '',
            'pre_page': '0',
            'page': 1
        }
        self.uid = ''

    # ========= Preparation before parsing the user's weibo data =========#
    def _init_(self, url):
        """
        Preparation before parsing the user's weibo data, including:
        1. getting the current user's page_id
        2. getting the total number of weibo pages
        """
        http_params = {
            '__rnd': '',
            '_k': '',
            '_t': '0',
            'count': '15',
            'end_id': '',
            'max_id': '',
            'page': 1,
            'pagebar': '',
            'pre_page': '0',
            'profile_ftype': '1',
            'uid': ''
        }
        content = self.downloader.download(url)
        # Check whether the user exists
        if not self.exist(content):
            raise UserNotFoundError(url)
        # Get the user ID
        btag = "$CONFIG['oid']='"
        etag = "';"
        bpos = content.find(btag) + len(btag)
        epos = content.find(etag, bpos)
        uid = content[bpos:epos]
        self.uid = uid
        # Get the page_id
        self.page_id = self._parse_pageid(content)
        # Get the total number of weibo pages
        self.pagenum = self._caculate_pagenum(content)
        # Get the pid, the domain parameter needed when fetching weibo
        self.pid = self._parse_pid(content)
        # Get the user's avatar url and nickname
        img_url, nick_name = self._parse_userinfo(content)
        self.parser.init_user(self.uid, img_url, nick_name)
        self.url = self.get_url()

    def exist(self, content):
        """
        Check whether the current user exists.
        ------------------------------
        return: True if the user exists, otherwise False
        """
        if content.find('<title>错误提示') != -1:  # "错误提示" = error-notice page title
            return False
        return True

    def _parse_pageid(self, content):
        """
        Parse the page_id out of the page.
        ----------------------------------
        content: the page text to parse
        ----------------------------------
        return: page_id, or an empty string
        """
        btag = "$CONFIG['page_id']='"
        etag = "'"
        page_id = ''
        if content:
            bpos = content.find(btag)
            if bpos != -1:
                bpos += len(btag)
                epos = content.find(etag, bpos)
                page_id = content[bpos:epos]
        return page_id

    def _caculate_pagenum(self, content):
        """
        Calculate the total number of weibo pages.
        ------------------------------
        return: number of weibo pages
        """
        msgcount = self._parse_msgcount(content)
        per_pagenum = 45
        total_pagenum = msgcount / per_pagenum
        if msgcount % per_pagenum:
            total_pagenum += 1
        return total_pagenum

    def _parse_msgcount(self, content):
        """
        Parse the number of weibo entries.
        ---------------------------
        content: page text
        ---------------------------
        return: number of weibo entries
        """
        if not content:
            raise PreprocessError(self.uid)
        etag1 = '>微博<\/span>'
        etag2 = '<\/strong>'
        btag = '>'
        epos = content.find(etag1)
        epos = content[:epos].rfind(etag2)
        bpos = content[:epos].rfind(btag) + len(btag)
        return int(content[bpos:epos])

    def _parse_userinfo(self, content):
        """
        Parse the user's avatar url / nickname.
        -----------------------------
        content: page text
        ------------------------------
        return: (img_url, nick_name)
        """
        btag = '<div class="pf_photo"'
        etag = '<\/div>'
        bpos = content.find(btag)
        epos = content.find(etag, bpos)
        soup = BeautifulSoup(content[bpos:epos].replace('\\/', '/') + '</div>')
        img_url = soup.img['src']
        nick_name = soup.img['alt']
        return img_url, nick_name

    #======== Parse the user's weibo data and save the results =======#
    def scratch(self, url, start_pageindex=1):
        """
        Fetch all of the given user's weibo that meet the conditions and write them to a file.
        ----------------------------------------
        url: the profile url of the user whose weibo are to be crawled
        start_pageindex: the page from which to start crawling the user's weibo
        """
        self._init_(url)
        from controller import Controller
        # print Controller._get_filepath(self.uid)
        if os.path.isfile(Controller._get_filepath(self.uid)):
            # The user's weibo have already been downloaded
            print self.uid, u'weibo already downloaded!'
            return None
        if start_pageindex > self.pagenum:
            return []
        #return self._binary_scratch(uid, start_pageindex)
        return self._sequence_scratch(self.uid, start_pageindex, self.pagenum)

    def _binary_scratch(self, uid, start_pageindex):
        """
        Run the split crawl strategy, starting past the middle of the page range.
        Currently it starts from one third of the total page count.
        ----------------------------------------------
        start_pageindex: starting page
        ---------------------------------------------
        return: blogs
        """
        mid_pageindex = max((start_pageindex + self.pagenum) / 3, 1)
        # Crawl weibo from front to back
        blogs1 = self._sequence_scratch(uid, mid_pageindex, self.pagenum, 1)
        # Crawl weibo from back to front
        if mid_pageindex > start_pageindex:
            blogs2 = self._sequence_scratch(uid, mid_pageindex - 1, start_pageindex, -1)
            blogs1.extend(blogs2)
        return blogs1

    def _sequence_scratch(self, uid, start_pageindex, end_pageindex, direction=1):
        """
        Run the sequential crawl strategy, either front-to-back or back-to-front.
        ---------------------------------------------------
        uid: the user ID to crawl
        start_pageindex: starting page
        end_pageindex: ending page
        direction: crawl direction, 1 -> front to back, pageindex increases;
                   -1 -> back to front, pageindex decreases
        ---------------------------------------------------
        return: blogs
        """
        blogs = []
        for pageindex in range(start_pageindex, end_pageindex + direction, direction):
            temp_blogs = self._parse_blogs(pageindex)
            print uid + ': fetched weibo page ' + str(pageindex) + ' successfully.'
            blogs.extend(temp_blogs)
            time.sleep(1)
            if not self._continue(temp_blogs, direction):
                break
        return blogs

    def _parse_blogs(self, pageindex):
        """
        Fetch the weibo content of the three sub-pages of the given weibo page.
        -----------------------------------
        return: the list of blogs on that page
        """
        blogs = []
        self.http_params['page'] = pageindex
        self.http_params['id'] = self.page_id
        self.http_params['domain'] = self.pid
        # Download the first sub-page
        self.http_params['pre_page'] = self.http_params['page'] - 1
        content = self.downloader.download(self.url, self.http_params)
        if content:
            sub_blogs = self.parser.parse(content)
            blogs.extend(sub_blogs)
            if not self._continue(blogs):
                return blogs
        # Download the second sub-page
        self.http_params['count'] = '15'
        self.http_params['pagebar'] = '0'
        self.http_params['pre_page'] = self.http_params['page']
        content = self.downloader.download(self.url, self.http_params)
        if content:
            sub_blogs = self.parser.parse(content)
            blogs.extend(sub_blogs)
            if not self._continue(sub_blogs):
                return blogs
        # Download the third sub-page
        self.http_params['count'] = '15'
        self.http_params['pagebar'] = '1'
        self.http_params['pre_page'] = self.http_params['page']
        content = self.downloader.download(self.url, self.http_params)
        if content:
            sub_blogs = self.parser.parse(content)
            blogs.extend(sub_blogs)
        return blogs

    def _continue(self, blogs, direction=1):
        """
        Decide whether the download task should continue.
        -----------------------------
        blogs: the blog list to check, ordered by time
        direction: 1 -> check whether the last blog is earlier than the begin time;
                   -1 -> check whether the first blog is later than the end time
        ------------------------------
        return: True to continue, otherwise False
        """
        is_continue = True
        if blogs:
            if (direction == -1 and dt.compare(blogs[0]['pt'], config.end_time) > 0) or \
               (direction == 1 and dt.compare(blogs[-1]['pt'], config.begin_time) < 0):
                is_continue = False
        return is_continue

    def get_url(self):
        """
        Get the url to download.
        """
        # url = 'http://weibo.com/p/' + self.page_id + '/weibo'
        url = 'http://weibo.com/p/aj/v6/mblog/mbloglist'
        return url

    def _parse_pid(self, content):
        btag = "$CONFIG['pid']='"
        etag = "'"
        pid = ''
        if content:
            bpos = content.find(btag)
            if bpos != -1:
                bpos += len(btag)
                epos = content.find(etag, bpos)
                pid = content[bpos:epos]
        return pid
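# A minimal driving sketch for BlogCrawler above (assumed usage; the profile url
# is a placeholder): scratch() resolves uid/page_id/pid from the profile page,
# computes the page count, then walks the pages and returns the parsed blogs.
if __name__ == '__main__':
    crawler = BlogCrawler()
    profile_url = 'http://weibo.com/p/1002061649159940/weibo'   # placeholder profile url
    try:
        blogs = crawler.scratch(profile_url, start_pageindex=1)
        print 'fetched', len(blogs) if blogs else 0, 'blogs for uid', crawler.uid
    except UserNotFoundError:
        print 'user not found:', profile_url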