def parse(self, url=None):
    """Parse one page of a Weibo user's follow or fans list.

    The page embeds its content as JSON inside
    ``STK && STK.pageletM && STK.pageletM.view(...)`` script calls.  The
    pagelet whose ``pid`` is ``pl_relation_hisFollow`` or
    ``pl_relation_hisFans`` carries the HTML of the user list.

    :param url: page to fetch; falls back to ``self.url`` when empty.
    :returns: ``(urls, bundles)`` -- ``urls`` holds the next list page (if
        any) and ``bundles`` holds one ``WeiboUserBundle`` per listed user.
        Both are empty when the bundle does not exist, the ``check`` call
        fails, or the page contains no user list.
    """
    if self.bundle.exists is False:
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    soup = BeautifulSoup(br.response().read())

    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    html = None
    is_follow = True
    for script in soup.find_all('script'):
        text = script.text
        if 'STK' in text:
            # Drop the JS call wrapper and its closing ')' to get raw JSON.
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid in ('pl_relation_hisFollow', 'pl_relation_hisFans'):
                html = BeautifulSoup(data['html'])
            if pid == 'pl_relation_hisFans':
                is_follow = False

    bundles = []
    if html is None:
        # No relation pagelet on the page: nothing to parse.
        return [], bundles

    ul = html.find(attrs={'class': 'cnfList', 'node-type': 'userListBox'})
    if ul is None:
        return [], bundles

    for li in ul.find_all(attrs={'class': 'S_line1',
                                 'action-type': 'itemClick'}):
        # action-data looks like a query string: uid=..&fnick=..&sex=..
        # Split on the first '=' only so values containing '=' survive.
        data = dict(pair.split('=', 1)
                    for pair in li['action-data'].split('&'))

        friend = Friend()
        friend.uid = data['uid']
        friend.nickname = data['fnick']
        friend.sex = data['sex'] == u'm'

        bundles.append(WeiboUserBundle(str(friend.uid)))
        if is_follow:
            weibo_user.follows.append(friend)
        else:
            weibo_user.fans.append(friend)

    weibo_user.save()

    urls = []
    pages = html.find('div', attrs={'class': 'W_pages',
                                    'node-type': 'pageList'})
    if pages is not None:
        anchors = pages.find_all('a')
        if anchors:
            next_ = anchors[-1]
            # The last anchor is treated as a "next page" link only when
            # its class list is exactly ['W_btn_c'].
            if next_.get('class') == ['W_btn_c']:
                next_page = int(urldecode(url).get('page', 1)) + 1
                urls.append('%s?page=%s' % (url.split('?')[0], next_page))

    return urls, bundles
class UserFriendParser(WeiboParser):
    """Parser for a Weibo user's follow and fans list pages.

    ``parse`` is a generator: it yields a ``WeiboUserBundle`` for every user
    found on the page and URL strings for the pages to visit next.
    """

    def _fans_list_url(self, is_new_mode):
        """Return the fans-list URL for ``self.uid`` in the given layout."""
        if is_new_mode:
            return 'http://weibo.com/%s/follow?relate=fans' % self.uid
        return 'http://weibo.com/%s/fans' % self.uid

    def parse(self, url=None):
        """Parse one follow/fans list page and yield bundles and next URLs.

        Two page layouts are handled: pages whose scripts start with
        ``FM.view`` (flagged as "new mode") and pages using
        ``STK.pageletM.view``.  If no script of either kind is present the
        fetch is treated as banned.

        :param url: page to fetch; falls back to ``self.url`` when empty.
        :raises FetchBannedError: when neither script kind is found.
        :raises AttributeError: when no relation pagelet was found and the
            final URL is not on ``http://e.weibo.com``.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            # Message reads "rest for 10 minutes".
            print('休息10分钟!')
            time.sleep(60 * 10)
            # Without a response there is nothing to parse; surface the
            # real error instead of failing later on an unbound ``br``.
            raise
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        html = None
        decodes = urldecode(url)
        is_follow = True
        is_new_mode = False
        is_banned = True
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                is_banned = False
                # Strip only the statement-terminating ';' so semicolons
                # inside the JSON payload (e.g. HTML entities) are kept,
                # then drop the call wrapper and its closing ')'.
                text = text.strip().rstrip(';').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith(('Pl_Official_LeftHisRelation__',
                                     'Pl_Official_HisRelation__')):
                    html = beautiful_soup(data['html'])
                if decodes.get('relate') == 'fans':
                    is_follow = False
                is_new_mode = True
            elif 'STK' in text:
                is_banned = False
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid in ('pl_relation_hisFollow', 'pl_relation_hisFans'):
                    html = beautiful_soup(data['html'])
                if pid == 'pl_relation_hisFans':
                    is_follow = False

        if is_banned:
            print('休息10分钟!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')

        ul = None
        try:
            ul = html.find(attrs={'class': 'cnfList',
                                  'node-type': 'userListBox'})
            if ul is None:
                ul = html.find(attrs={'class': 'follow_list',
                                      'node-type': 'userListBox'})
        except AttributeError:
            # ``html`` is still None: no relation pagelet was found.
            print('休息10分钟!')
            time.sleep(60 * 10)
            if br.geturl().startswith('http://e.weibo.com'):
                return
            raise

        if ul is None:
            # No user list on this page; after follows, move on to fans.
            if is_follow:
                yield self._fans_list_url(is_new_mode)
            return

        # The query value may be a string, so normalise before comparing.
        current_page = int(decodes.get('page', 1))
        if current_page == 1:
            # First page: start the stored list from scratch.
            if is_follow:
                weibo_user.follows = []
            else:
                weibo_user.fans = []

        for cls in ('S_line1', 'S_line2'):
            for li in ul.find_all(attrs={'class': cls,
                                         'action-type': 'itemClick'}):
                # Split on the first '=' only so values containing '='
                # do not break the key/value parsing.
                data = dict(pair.split('=', 1)
                            for pair in li['action-data'].split('&'))

                friend = Friend()
                friend.uid = data['uid']
                friend.nickname = data['fnick']
                friend.sex = data['sex'] == u'm'

                yield WeiboUserBundle(str(friend.uid))
                if is_follow:
                    weibo_user.follows.append(friend)
                else:
                    weibo_user.fans.append(friend)

        weibo_user.save()

        # Count one processed list page of the matching kind.
        counter_type = 'follows' if is_follow else 'fans'
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

        pages = html.find('div', attrs={'class': 'W_pages',
                                        'node-type': 'pageList'})
        if pages is None:
            pages = html.find('div', attrs={'class': 'WB_cardpage',
                                            'node-type': 'pageList'})
        if pages is not None:
            anchors = pages.find_all('a')
            if anchors:
                next_ = anchors[-1]
                classes = next_.get('class') or []
                if classes == ['W_btn_c'] or 'next' in classes:
                    decodes['page'] = current_page + 1
                    query_str = urllib.urlencode(decodes)
                    yield '%s?%s' % (url.split('?')[0], query_str)
                    return

        # Last follow page reached: continue with the fans list.
        if is_follow:
            yield self._fans_list_url(is_new_mode)
if is_new_mode: urls.append("http://weibo.com/%s/follow?relate=fans" % self.uid) else: urls.append("http://weibo.com/%s/fans" % self.uid) return urls, bundles current_page = decodes.get("page", 1) if current_page == 1: if is_follow: weibo_user.follows = [] else: weibo_user.fans = [] for li in ul.find_all(attrs={"class": "S_line1", "action-type": "itemClick"}): data = dict([l.split("=") for l in li["action-data"].split("&")]) friend = Friend() friend.uid = data["uid"] friend.nickname = data["fnick"] friend.sex = True if data["sex"] == u"m" else False bundles.append(WeiboUserBundle(str(friend.uid))) if is_follow: weibo_user.follows.append(friend) else: weibo_user.fans.append(friend) weibo_user.save() self.logger.debug("parse %s finish" % url) urls = [] pages = html.find("div", attrs={"class": "W_pages", "node-type": "pageList"})
urls.append('http://weibo.com/%s/fans' % self.uid) return urls, bundles current_page = decodes.get('page', 1) if current_page == 1: if is_follow: weibo_user.follows = [] else: weibo_user.fans = [] for li in ul.find_all(attrs={ 'class': 'S_line1', 'action-type': 'itemClick' }): data = dict([l.split('=') for l in li['action-data'].split('&')]) friend = Friend() friend.uid = data['uid'] friend.nickname = data['fnick'] friend.sex = True if data['sex'] == u'm' else False bundles.append(WeiboUserBundle(str(friend.uid))) if is_follow: weibo_user.follows.append(friend) else: weibo_user.fans.append(friend) weibo_user.save() self.logger.debug('parse %s finish' % url) urls = [] pages = html.find('div',
if is_new_mode: urls.append('http://weibo.com/%s/follow?relate=fans' % self.uid) else: urls.append('http://weibo.com/%s/fans' % self.uid) return urls, bundles current_page = decodes.get('page', 1) if current_page == 1: if is_follow: weibo_user.follows = [] else: weibo_user.fans = [] for li in ul.find_all(attrs={'class': 'S_line1', 'action-type': 'itemClick'}): data = dict([l.split('=') for l in li['action-data'].split('&')]) friend = Friend() friend.uid = data['uid'] friend.nickname = data['fnick'] friend.sex = True if data['sex'] == u'm' else False bundles.append(WeiboUserBundle(str(friend.uid))) if is_follow: weibo_user.follows.append(friend) else: weibo_user.fans.append(friend) weibo_user.save() self.logger.debug('parse %s finish' % url) urls = [] pages = html.find('div', attrs={'class': 'W_pages', 'node-type': 'pageList'})
current_page = decodes.get('page', 1) if current_page == 1: if is_follow: weibo_user.follows = [] else: weibo_user.fans = [] # # span = ul.find(attrs={'class': 'addr'}) # # for li in ul.find_all(attrs={'class': 'S_line1', 'action-type': 'itemClick'}): data = dict([l.split('=') for l in li['action-data'].split('&')]) friend = Friend() #friend.uid = data['uid'] #friend.nickname = data['fnick'] #friend.sex = True if data['sex'] == u'm' else False tempuid = data['uid'] tempnickname = data['fnick'] tempsex = True if data['sex'] == u'm' else False # #location # locationok = False #span = ul.find(str(tempuid)).find(attrs={'class': 'addr'}) for j in span.stripped_strings: templocation = repr(j) templocation=eval(templocation) #print templocation
if is_new_mode: urls.append('http://weibo.com/%s/follow?relate=fans' % self.uid) else: urls.append('http://weibo.com/%s/fans' % self.uid) return urls, bundles current_page = decodes.get('page', 1) if current_page == 1: if is_follow: weibo_user.follows = [] else: weibo_user.fans = [] for li in ul.find_all(attrs={'class': 'S_line2', 'action-type': 'itemClick'}): data = dict([l.split('=') for l in li['action-data'].split('&')]) friend = Friend() friend.uid = data['uid'] friend.nickname = data['fnick'] friend.sex = True if data['sex'] == u'm' else False links = li.find('div', attrs={'class': 'info_connect'}).find_all('a') if len(links) == 3: friend.n_follows = int(links[0].text) friend.n_fans = int(links[1].text) friend.n_weibos = int(links[2].text) #bundles.append(WeiboUserBundle(str(friend.uid))) if is_follow: bundles.append(WeiboUserBundle(str(friend.uid), level=MAX_LEVEL)) weibo_user.follows.append(friend) else:
def parse(self, url=None):
    """Parse one page of a Weibo user's follow or fans list.

    The user list is taken from the ``STK.pageletM.view(...)`` script whose
    ``pid`` is ``pl_relation_hisFollow`` or ``pl_relation_hisFans``.

    :param url: page to fetch; falls back to ``self.url`` when empty.
    :returns: ``(urls, bundles)`` -- ``urls`` holds the next page link (if
        any) and ``bundles`` one ``WeiboUserBundle`` per listed user.  Every
        exit path returns this pair so callers can always unpack it.
    """
    if self.bundle.exists is False:
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    soup = BeautifulSoup(br.response().read())

    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    html = None
    is_follow = True
    for script in soup.find_all('script'):
        text = script.text
        if 'STK' in text:
            # Drop the JS call wrapper and its closing ')' to get raw JSON.
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid in ('pl_relation_hisFollow', 'pl_relation_hisFans'):
                html = BeautifulSoup(data['html'])
            if pid == 'pl_relation_hisFans':
                is_follow = False

    bundles = []
    if html is None:
        # No relation pagelet on the page: nothing to parse.
        return [], bundles

    ul = html.find(attrs={'class': 'cnfList', 'node-type': 'userListBox'})
    if ul is None:
        return [], bundles

    for li in ul.find_all(attrs={'class': 'S_line1',
                                 'action-type': 'itemClick'}):
        # Split on the first '=' only so values containing '=' survive.
        data = dict(pair.split('=', 1)
                    for pair in li['action-data'].split('&'))

        friend = Friend()
        friend.uid = data['uid']
        friend.nickname = data['fnick']
        friend.sex = data['sex'] == u'm'

        bundles.append(WeiboUserBundle(str(friend.uid)))
        if is_follow:
            weibo_user.follows.append(friend)
        else:
            weibo_user.fans.append(friend)

    weibo_user.save()

    urls = []
    pages = html.find('div', attrs={'class': 'W_pages',
                                    'node-type': 'pageList'})
    if pages is not None:
        anchors = pages.find_all('a')
        if anchors:
            next_ = anchors[-1]
            if next_.get('class') == ['W_btn_c']:
                # urljoin leaves absolute hrefs untouched and resolves
                # relative ones against the site root.
                urls.append(
                    urlparse.urljoin('http://weibo.com', next_['href']))

    return urls, bundles
current_page = decodes.get('page', 1) if current_page == 1: if is_follow: weibo_user.follows = [] else: weibo_user.fans = [] urls = [] for li in ul.find_all(attrs={'class': 'S_line1'}): data = dict([l.split('=') for l in li['action-data'].split('&')]) location_span = li.find('span',attrs={'class': 'addr'}) location = location_span.text.strip() friend = Friend() friend.uid = data['uid'] friend.nickname = data['fnick'] if data['sex'] == u'm': friend.sex = u'男' else: friend.sex = u'女' friend.location = location weibo_map = { u'关注': {'field': 'follow_num'}, u'粉丝': {'field': 'fans_num'}, u'微博': {'field': 'weibo_num'}, } connect_div = li.find('div',attrs={'class': 'connect'}) connect = connect_div.text.replace(' ',' ').replace('\t','').replace('\n','').strip()