def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())

    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid == 'Pl_Official_LeftInfo__13':
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
        elif 'STK' in text:
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = soup.find('img')['src']

    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'sex', 'func': lambda s: True if s == u'男' else False},
        u'生日': {'field': 'birth'},
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'}
    }
    if profile_div is not None:
        for div in profile_div.find_all(attrs={'class': 'pf_item'}):
            k = div.find(attrs={'class': 'label'}).text.strip()
            v = div.find(attrs={'class': 'con'}).text.strip()
            if k in profile_map:
                if k == u'个性域名' and '|' in v:
                    v = v.split('|')[1].strip()
                func = (lambda s: s) if 'func' not in profile_map[k] \
                    else profile_map[k]['func']
                v = func(v)
                setattr(weibo_user.info, profile_map[k]['field'], v)

    weibo_user.info.work = []
    if career_div is not None:
        for div in career_div.find_all(attrs={'class': 'con'}):
            work_info = WorkInfo()
            ps = div.find_all('p')
            for p in ps:
                a = p.find('a')
                if a is not None:
                    work_info.name = a.text
                    text = p.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1].strip(')')
                else:
                    text = p.text
                    if text.startswith(u'地区：'):
                        work_info.location = text.split(u'：', 1)[1]
                    elif text.startswith(u'职位：'):
                        work_info.position = text.split(u'：', 1)[1]
                    else:
                        work_info.detail = text
            weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        for div in edu_div.find_all(attrs={'class': 'con'}):
            edu_info = EduInfo()
            ps = div.find_all('p')
            for p in ps:
                a = p.find('a')
                text = p.text
                if a is not None:
                    edu_info.name = a.text
                    if '(' in text:
                        edu_info.date = text.strip().split('(')[1].strip(')')
                else:
                    edu_info.detail = text
            weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        for div in tags_div.find_all(attrs={'class': 'con'}):
            for a in div.find_all('a'):
                weibo_user.info.tags.append(a.text)

    weibo_user.save()
    self.logger.debug('parse %s finish' % url)
    return [], []

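# Every parse() variant in this file repeats the same decoding trick for the
# page's <script> blocks: strip the 'FM.view(...)' or
# 'STK && STK.pageletM && STK.pageletM.view(...)' wrapper and json.loads() the
# remaining object to get the pagelet's 'domid'/'pid' and embedded 'html'.
# Below is a minimal, standalone sketch of just that step; the helper name
# extract_pagelet_payloads is hypothetical and not part of the original parsers.
import json

def extract_pagelet_payloads(script_texts):
    """Yield the JSON payloads embedded in Weibo pagelet <script> blocks."""
    for text in script_texts:
        text = text.strip()
        if text.startswith('FM.view('):
            # FM.view({...}); -> {...}  (mirrors the parsers: drop ';' then the wrapper)
            body = text.replace(';', '').replace('FM.view(', '')[:-1]
        elif 'STK.pageletM.view(' in text:
            # STK && STK.pageletM && STK.pageletM.view({...}) -> {...}
            body = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
        else:
            continue
        try:
            yield json.loads(body)
        except ValueError:
            # malformed or truncated pagelet, skip it
            continue

# Usage (the script text below is fabricated, for illustration only):
# list(extract_pagelet_payloads(
#     ['FM.view({"domid": "Pl_Official_Header__1", "html": "<div></div>"});']))
# -> [{u'domid': u'Pl_Official_Header__1', u'html': u'<div></div>'}]
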
def parse(self, url=None):
    if self.bundle.exists is False:
        return

    url = url or self.url
    try:
        br = self.opener.browse_open(url)
    except Exception as e:
        print(e)
        print('休息10分钟!')  # "rest for 10 minutes" before retrying
        time.sleep(60 * 10)
        # retry once after the pause so that `br` is always bound below
        br = self.opener.browse_open(url)
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())

    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    new_style = False
    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftInfo__'):
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid.startswith('Pl_Official_PersonalInfo__'):
                new_style = True
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                    block_title_div = block_div.find('h4', attrs={'class': 'obj_name'})
                    if block_title_div is None:
                        block_title_div = block_div.find(
                            'div', attrs={'class': 'obj_name'}).find('h2')
                    if block_title_div is None:
                        continue
                    block_title = block_title_div.text.strip()
                    inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'})
                    if block_title == u'基本信息':
                        profile_div = inner_div
                    elif block_title == u'工作信息':
                        career_div = inner_div
                    elif block_title == u'教育信息':
                        edu_div = inner_div
                    elif block_title == u'标签信息':
                        tags_div = inner_div
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
                weibo_user.info.n_follows = int(header_soup.find(
                    'ul', attrs={'class': 'user_atten'}).find(
                    'strong', attrs={'node-type': 'follow'}).text)
                weibo_user.info.n_fans = int(header_soup.find(
                    'ul', attrs={'class': 'user_atten'}).find(
                    'strong', attrs={'node-type': 'fans'}).text)
            elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                # new style friends info
                header_soup = beautiful_soup(data['html'])
                tds = header_soup.find('table', attrs={'class': 'tb_counter'}).find_all('td')
                weibo_user.info.n_follows = int(tds[0].find('strong').text)
                weibo_user.info.n_fans = int(tds[1].find('strong').text)
            elif domid.startswith('Pl_Official_Headerv6__'):
                # new style avatar info
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'p', attrs='photo_wrap').find('img')['src']
        elif 'STK' in text:
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = soup.find('img')['src']

    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'sex', 'func': lambda s: True if s == u'男' else False},
        u'生日': {'field': 'birth'},
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'}
    }
    if profile_div is not None:
        if not new_style:
            divs = profile_div.find_all(attrs={'class': 'pf_item'})
        else:
            divs = profile_div.find_all('li', attrs={'class': 'li_1'})
        for div in divs:
            if not new_style:
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
            else:
                k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u'：')
                d = div.find('span', attrs={'class': 'pt_detail'})
                if d:
                    v = d.text.strip()
                else:
                    v = div.find('a').text.strip()
            if k in profile_map:
                if k == u'个性域名' and '|' in v:
                    v = v.split('|')[1].strip()
                func = (lambda s: s) if 'func' not in profile_map[k] \
                    else profile_map[k]['func']
                v = func(v)
                setattr(weibo_user.info, profile_map[k]['field'], v)

    weibo_user.info.work = []
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        if text.startswith(u'地区：'):
                            work_info.location = text.split(u'：', 1)[1]
                        elif text.startswith(u'职位：'):
                            work_info.position = text.split(u'：', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)
        else:
            li = career_div.find('li', attrs={'class': 'li_1'})
            for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                work_info = WorkInfo()
                text = span.text
                a = span.find('a')
                if a is not None:
                    work_info.name = a.text
                if '(' in text:
                    work_info.date = text.strip().split('(')[1]\
                        .replace('\r', '').replace('\n', '').replace('\t', '')\
                        .split(')', 1)[0]
                for l in text.split('\r\n'):
                    l = l.strip()
                    if len(l) == 0:
                        continue
                    if l.startswith(u'地区：'):
                        work_info.location = l.split(u'：', 1)[1]
                    elif l.startswith(u'职位：'):
                        work_info.position = l.split(u'：', 1)[1]
                    else:
                        work_info.detail = text.replace('\r', '').replace('\n', '')\
                            .replace('\t', '').strip()
                weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip().strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)
        else:
            span = edu_div.find('li', attrs={'class': 'li_1'})\
                .find('span', attrs={'class': 'pt_detail'})
            text = span.text
            names = []
            for a in span.find_all('a'):
                names.append(a.text)
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]

                edu_info = EduInfo()
                edu_info.name = name
                if '(' in text:
                    edu_info.date = t.strip().split('(')[1]\
                        .replace('\r', '').replace('\n', '').replace('\t', '')\
                        .split(')', 1)[0]
                    t = t[t.find(')') + 1:]
                text = text[end_pos:]
                edu_info.detail = t.replace('\r', '').replace('\n', '')\
                    .replace('\t', '').strip()
                weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)
        else:
            for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'):
                weibo_user.info.tags.append(a.text.strip())

    weibo_user.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the profile url
    self.counter.inc('processed_profile_page', 1)

class UserHomePageParser(WeiboParser):
    def extract_user_info(self, soup, weibo_user):
        div_pi = soup.find('div', attrs={'class': 'PCD_person_info'})
        # verified corporate account
        bs_verify = div_pi.find('a', attrs={'class': 'icon_verify_co_v'})
        weibo_user.info.is_person = False if bs_verify else True
        # vip person
        bs_vip = div_pi.find('a', attrs={'class': 'icon_verify_v'})
        weibo_user.info.vip = True if bs_vip else False
        weibo_user.info.verified = True if bs_verify or bs_vip else False
        weibo_user.info.level = int(
            div_pi.find('a', attrs={'class': 'W_icon_level'}).text.split('.')[1])

    def extract_user_counter(self, soup, weibo_user):
        # msg counter
        tds = soup.find('table', attrs={'class': 'tb_counter'}).find_all('td')
        if tds:
            weibo_user.info.n_follows = int(tds[0].find('strong').text)
            weibo_user.info.n_fans = int(tds[1].find('strong').text)
            weibo_user.info.n_msgs = int(tds[2].find('strong').text)

    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        html = ''
        opener = None
        try:
            # if hasattr(self.opener, 'nalbr'):
            #     opener = self.opener.nalbr  # no account login browser
            # else:
            #     opener = MechanizeOpener(timeout=10,
            #                              user_agent=user_config.conf.opener.user_agent)
            #     p_ = get_ip_proxy()
            #     self.logger.info(p_)
            #     opener.add_proxy(p_, 'http')
            #     self.opener.nalbr = opener
            opener = self.opener
            opener.addheaders = [('User-Agent', user_config.conf.opener.user_agent)]
            html = to_unicode(opener.open(url, timeout=10))
            opener.browser.clear_history()  # resolve memory issue
        except Exception as ex:
            if opener:
                opener.browser.close()
            raise Exception("get banned on user page")

        if not html:
            return

        soup = beautiful_soup(html)
        weibo_user = self.get_weibo_user()
        if weibo_user.info is None:
            weibo_user.info = UserInfo()

        # find page_id
        try:
            pid_ = re.findall(r"CONFIG\['page_id'\]='(.*)';", html)[0]
        except:
            if opener:
                opener.browser.close()
            if hasattr(self.opener, 'nalbr'):
                del self.opener.nalbr
            raise FetchBannedError("get banned on user page")
        domain_ = re.findall(r"CONFIG\['domain'\]='(.*)';", html)[0]

        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith("Pl_Core_UserInfo"):
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_info(header_soup, weibo_user)
                elif domid.startswith("Pl_Official_Header"):
                    header_soup = beautiful_soup(data['html'])
                    # nickname
                    nickname_ = header_soup.find('div', attrs={'class': 'pf_username'}).text
                elif domid.startswith("Pl_Core_T8CustomTriColumn"):
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_counter(header_soup, weibo_user)

        self.bundle.pid = pid_
        self.bundle.domain = domain_
        weibo_user.pid = pid_
        weibo_user.info.domain = domain_
        weibo_user.info.nickname = nickname_.strip()
        weibo_user.save()

        # counter add one for the processed user home list url
        self.counter.inc('processed_weibo_user_home_page', 1)

        time.sleep(1)
        if fetch_userprofile and weibo_user.info.is_person and not weibo_user.info.location:
            yield 'http://weibo.com/p/%s/info' % pid_

def save_blog_detail(self, div, mblog):
    content_div = div.find('p', attrs={'node-type': 'feed_list_content'})
    mblog.content = content_div.text
    blog_create_date = parse(div.find('a', attrs={'node-type': 'feed_list_item_date'})['title'])
    mblog.created = blog_create_date
    mblog.last_update = datetime.now()

    is_forward = div.get('isforward')
    if is_forward:
        # record the original user and message
        mblog.omid = div['omid']
        tbinfos = div['tbinfo'].split('&')
        mblog.ouid = tbinfos[0].split('=')[1]
        name_a = div.find('a', attrs={
            'class': 'WB_name',
            'node-type': 'feed_list_originNick'
        })
        text_a = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_reason'
        })
        if name_a is not None and text_a is not None:
            mblog.forward = '%s: %s' % (name_a.text, text_a.text)

    func_div = div.find_all('div', attrs={'class': 'feed_action'})[-1]
    action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

    likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).find('em')
    if likes:
        likes = likes.text.strip('(').strip(')').replace(',', '')
        likes = int(likes) if likes and likes.isdigit() else 0
        mblog.n_likes = likes
    forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).find('em')
    if forwards:
        forwards = forwards.text.strip('(').strip(')').replace(',', '')
        mblog.n_forwards = int(forwards) if forwards and forwards.isdigit() else 0
    comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).find('em')
    if comments:
        comments = comments.text.strip('(').strip(')').replace(',', '')
        mblog.n_comments = int(comments) if comments and comments.isdigit() else 0

    # parse uid
    a = func_div.find('a', attrs={'action-type': 'feed_list_forward'})['action-data']
    u = urllib_parse.unquote(a[a.find('url='):])
    qs = urllib_parse.parse_qs(u)
    if 'uid' not in qs:
        print(qs)
    mblog.uid = qs['uid'][0]

    # save user
    weibo_user = self.get_weibo_user(mblog.uid)
    if not (weibo_user.info and weibo_user.info.nickname):
        if 'pid' in qs:
            weibo_user.pid = qs['pid'][0]
        if weibo_user.info is None:
            weibo_user.info = UserInfo()
        weibo_user.info.nickname = qs['name'][0]
        weibo_user.save()

    # has_video
    div_video = div.find('div', attrs={'node-type': 'fl_h5_video_disp'}) or \
        div.find('span', attrs={'class': 'icon_playvideo'})
    mblog.has_video = True if div_video else False

    mblog.save()
    return (weibo_user, mblog)

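# save_blog_detail() recovers the author's uid/nickname from the forward
# button's 'action-data' attribute, which carries a url-encoded query string
# (url=...&mid=...&name=...&uid=...). Below is a minimal sketch of just that
# step, assuming Python 3's urllib.parse provides the urllib_parse alias the
# surrounding code imports; the helper name and the sample string are made up
# for illustration.
from urllib import parse as urllib_parse

def parse_action_data(action_data):
    """Return the query-string dict embedded in a Weibo 'action-data' value."""
    u = urllib_parse.unquote(action_data[action_data.find('url='):])
    return urllib_parse.parse_qs(u)

# parse_action_data('url=http%3A%2F%2Fweibo.com%2Fu%2F123&name=foo&uid=123')
# -> {'url': ['http://weibo.com/u/123'], 'name': ['foo'], 'uid': ['123']}
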
class UserHomePageParser(WeiboParser):
    def extract_user_info(self, soup, weibo_user):
        div_pi = soup.find('div', attrs={'class': 'PCD_person_info'})
        # verified corporate account
        bs_verify = div_pi.find('a', attrs={'class': 'icon_verify_co_v'})
        weibo_user.info.is_person = False if bs_verify else True
        # vip person
        bs_vip = div_pi.find('a', attrs={'class': 'icon_verify_v'})
        weibo_user.info.vip = True if bs_vip else False
        weibo_user.info.verified = True if bs_verify or bs_vip else False
        weibo_user.info.level = int(
            div_pi.find('a', attrs={'class': 'W_icon_level'}).text.split('.')[1])

    def extract_user_counter(self, soup, weibo_user):
        # msg counter
        tds = soup.find('table', attrs={'class': 'tb_counter'}).find_all('td')
        if tds:
            weibo_user.info.n_follows = int(tds[0].find('strong').text)
            weibo_user.info.n_fans = int(tds[1].find('strong').text)
            weibo_user.info.n_msgs = int(tds[2].find('strong').text)

    def parse(self, url=None):
        url = url or self.url
        html = ''
        opener = self.opener
        try:
            opener.addheaders = [('User-Agent', user_config.conf.opener.user_agent)]
            html = to_unicode(opener.open(url, timeout=10))
            opener.browser.clear_history()  # resolve memory issue
        except Exception as ex:
            if opener:
                opener.browser.close()
            raise Exception("get banned on user page")

        try:
            uid = re.findall(r"CONFIG\['oid'\]='(.*)';", html)[0]
        except:
            raise FetchBannedError("get banned on blog page")

        soup = beautiful_soup(html)
        weibo_user = self.get_weibo_user(uid)
        if weibo_user.info is None:
            weibo_user.info = UserInfo()

        # find page_id
        try:
            pid_ = re.findall(r"CONFIG\['page_id'\]='(.*)';", html)[0]
        except:
            raise FetchBannedError("get banned on user page")
        domain_ = re.findall(r"CONFIG\['domain'\]='(.*)';", html)[0]

        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith("Pl_Core_UserInfo") and 'html' in data:
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_info(header_soup, weibo_user)
                elif domid.startswith("Pl_Official_Header"):
                    header_soup = beautiful_soup(data['html'])
                    # nickname
                    nickname_ = header_soup.find('div', attrs={'class': 'pf_username'}).text
                elif domid.startswith("Pl_Core_T8CustomTriColumn") and 'html' in data:
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_counter(header_soup, weibo_user)

        weibo_user.pid = pid_
        weibo_user.info.domain = domain_
        weibo_user.info.nickname = nickname_.strip()
        weibo_user.save()

        # counter add one for the processed user home list url
        self.counter.inc('processed_weibo_user_home_page', 1)

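# Both UserHomePageParser variants read page-level identifiers (oid, page_id,
# domain) out of the inline "CONFIG['...']='...';" assignments that Weibo embeds
# in the raw page source, treating a missing key as a ban signal. A small
# standalone sketch of that lookup follows; the helper name read_weibo_config
# is hypothetical.
import re

def read_weibo_config(html, key):
    """Return the CONFIG['key'] value from raw Weibo page HTML, or None if absent."""
    matches = re.findall(r"CONFIG\['%s'\]='(.*?)';" % re.escape(key), html)
    return matches[0] if matches else None

# page = u"<script>CONFIG['page_id']='100505123';CONFIG['domain']='100505';</script>"
# read_weibo_config(page, 'page_id') -> u'100505123'
# read_weibo_config(page, 'oid')     -> None  (caller can then raise FetchBannedError)
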
def parse(self, url=None):
    if self.bundle.exists == False:
        return

    url = url or self.url
    br = self.opener.browse_open(url)
    soup = BeautifulSoup(br.response().read())

    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if 'STK' in text:
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = BeautifulSoup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = BeautifulSoup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = BeautifulSoup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = BeautifulSoup(data['html'])

    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'sex', 'func': lambda s: True if s == u'男' else False},
        u'生日': {'field': 'birth'},
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'}
    }
    for div in profile_div.find_all(attrs={'class': 'pf_item'}):
        k = div.find(attrs={'class': 'label'}).text.strip()
        v = div.find(attrs={'class': 'con'}).text.strip()
        if k in profile_map:
            func = (lambda s: s) if 'func' not in profile_map[k] \
                else profile_map[k]['func']
            v = func(v)
            setattr(weibo_user.info, profile_map[k]['field'], v)

    weibo_user.info.work = []
    for div in career_div.find_all(attrs={'class': 'con'}):
        work_info = WorkInfo()
        ps = div.find_all('p')
        for p in ps:
            a = p.find('a')
            if a is not None:
                work_info.name = a.text
                text = p.text
                if '(' in text:
                    work_info.date = text.strip().split('(')[1].strip(')')
            else:
                text = p.text
                if text.startswith(u'地区：'):
                    work_info.location = text.split('：', 1)[1]
                elif text.startswith(u'职位：'):
                    work_info.position = text.split('：', 1)[1]
                else:
                    work_info.detail = text
        weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    for div in edu_div.find_all(attrs={'class': 'con'}):
        edu_info = EduInfo()
        ps = div.find_all('p')
        for p in ps:
            a = p.find('a')
            text = p.text
            if a is not None:
                edu_info.name = a.text
                if '(' in text:
                    edu_info.date = text.strip().split('(')[1].strip(')')
            else:
                edu_info.detail = text
        weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    for div in tags_div.find_all(attrs={'class': 'con'}):
        for a in div.find_all('a'):
            weibo_user.info.tags.append(a.text)

    weibo_user.save()
    return [], []

def parse(self, url=None):
    url = url or self.url
    try:
        br = self.opener.browse_open(url)
        html = br.response().read()
        if not self.check(url, br):
            return
        self.uid = re.findall(r"CONFIG\['oid'\]='(.*)';", html)[0]
    except:
        raise FetchBannedError("get banned on blog page")

    weibo_user = self.get_weibo_user(self.uid)
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    soup = beautiful_soup(html)
    new_style = False
    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftInfo__'):
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid.startswith('Pl_Official_PersonalInfo__'):
                new_style = True
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                    block_title_div = block_div.find('h4', attrs={'class': 'obj_name'})
                    if block_title_div is None:
                        block_title_div = block_div.find(
                            'div', attrs={'class': 'obj_name'}).find('h2')
                    if block_title_div is None:
                        continue
                    block_title = block_title_div.text.strip()
                    inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'})
                    if block_title == u'基本信息':
                        profile_div = inner_div
                    elif block_title == u'工作信息':
                        career_div = inner_div
                    elif block_title == u'教育信息':
                        edu_div = inner_div
                    elif block_title == u'标签信息':
                        tags_div = inner_div
            elif domid == 'Pl_Official_Header__1' and 'html' in data:
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
                weibo_user.info.n_follows = int(header_soup.find(
                    'ul', attrs={'class': 'user_atten'}).find(
                    'strong', attrs={'node-type': 'follow'}).text)
                weibo_user.info.n_fans = int(header_soup.find(
                    'ul', attrs={'class': 'user_atten'}).find(
                    'strong', attrs={'node-type': 'fans'}).text)
            elif domid.startswith('Pl_Core_T8CustomTriColumn__') and 'html' in data:
                # new style friends info
                header_soup = beautiful_soup(data['html'])
                tds = header_soup.find('table', attrs={'class': 'tb_counter'}).find_all('td')
                weibo_user.info.n_follows = int(tds[0].find('strong').text)
                weibo_user.info.n_fans = int(tds[1].find('strong').text)
            elif domid.startswith('Pl_Official_Headerv6__'):
                # new style avatar info
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'p', attrs='photo_wrap').find('img')['src']
                bs_verified = header_soup.find(
                    'a', attrs={"suda-data": "key=pc_apply_entry&value=feed_icon"})
                weibo_user.info.verified = True if bs_verified else False
                bs_vip = header_soup.find(
                    'a', attrs={"suda-uatrack": "key=home_vip&value=home_feed_vip"})
                weibo_user.info.vip = True if bs_vip else False
                weibo_user.info.pf_intro = header_soup.find(
                    'div', attrs={'class': 'pf_intro'}).text
            elif domid.startswith('Pl_Official_RightGrowNew'):
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.level_score = int(
                    header_soup.find('p', attrs={'class': 'level_info'})
                    .find_all('span', attrs={'class': 'S_txt1'})[1].text.strip())
                weibo_user.info.level = int(
                    header_soup.find('p', attrs={'class': 'level_info'})
                    .find_all('span', attrs={'class': 'S_txt1'})[0]
                    .text.strip().split('.')[1])
        elif 'STK' in text:
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = soup.find('img')['src']

    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'gender'},
        u'生日': {
            'field': 'birth',
            'func': lambda v: datetime.strptime(
                v.replace(u'年', '/').replace(u'月', '/').replace(u'日', ''), '%Y/%m/%d')
            if re.match(u'\d+年\d+月\d+日', v) else None
        },
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'},
        u'注册时间': {'field': 'register_date'}
    }
    if profile_div is not None:
        if not new_style:
            divs = profile_div.find_all(attrs={'class': 'pf_item'})
        else:
            divs = profile_div.find_all('li', attrs={'class': 'li_1'})
        for div in divs:
            if not new_style:
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
            else:
                k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u'：')
                d = div.find('span', attrs={'class': 'pt_detail'})
                if d:
                    v = d.text.strip()
                else:
                    v = div.find('a').text.strip()
            if k in profile_map:
                if k == u'个性域名' and '|' in v:
                    v = v.split('|')[1].strip()
                func = (lambda s: s) if 'func' not in profile_map[k] \
                    else profile_map[k]['func']
                try:
                    v = func(v)
                except:
                    v = None
                setattr(weibo_user.info, profile_map[k]['field'], v)

    weibo_user.info.work = []
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        if text.startswith(u'地区：'):
                            work_info.location = text.split(u'：', 1)[1]
                        elif text.startswith(u'职位：'):
                            work_info.position = text.split(u'：', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)
        else:
            li = career_div.find('li', attrs={'class': 'li_1'})
            for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                work_info = WorkInfo()
                text = span.text
                a = span.find('a')
                if a is not None:
                    work_info.name = a.text
                if '(' in text:
                    work_info.date = text.strip().split('(')[1]\
                        .replace('\r', '').replace('\n', '').replace('\t', '')\
                        .split(')', 1)[0]
                for l in text.split('\r\n'):
                    l = l.strip()
                    if len(l) == 0:
                        continue
                    if l.startswith(u'地区：'):
                        work_info.location = l.split(u'：', 1)[1]
                    elif l.startswith(u'职位：'):
                        work_info.position = l.split(u'：', 1)[1]
                    else:
                        work_info.detail = text.replace('\r', '').replace('\n', '')\
                            .replace('\t', '').strip()
                weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip().strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)
        else:
            span = edu_div.find('li', attrs={'class': 'li_1'})\
                .find('span', attrs={'class': 'pt_detail'})
            text = span.text
            names = []
            for a in span.find_all('a'):
                names.append(a.text)
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]

                edu_info = EduInfo()
                edu_info.name = name
                if '(' in text:
                    edu_info.date = t.strip().split('(')[1]\
                        .replace('\r', '').replace('\n', '').replace('\t', '')\
                        .split(')', 1)[0]
                    t = t[t.find(')') + 1:]
                text = text[end_pos:]
                edu_info.detail = t.replace('\r', '').replace('\n', '')\
                    .replace('\t', '').strip()
                weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)
        else:
            for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'):
                weibo_user.info.tags.append(a.text.strip())

    weibo_user.save()

    # counter add one for the profile url
    self.counter.inc('processed_profile_page', 1)

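# The profile_map in the parse() above normalizes the u'生日' (birthday) field
# with a lambda that rewrites "YYYY年M月D日" into "YYYY/M/D" before calling
# datetime.strptime. The same conversion is shown below as a small standalone
# function for clarity; the name parse_birth_date is hypothetical.
import re
from datetime import datetime

def parse_birth_date(value):
    """Convert a Weibo birthday string like u'1990年1月2日' to a datetime, else None."""
    if not re.match(u'\\d+年\\d+月\\d+日', value):
        return None
    normalized = value.replace(u'年', '/').replace(u'月', '/').replace(u'日', '')
    return datetime.strptime(normalized, '%Y/%m/%d')

# parse_birth_date(u'1990年1月2日') -> datetime(1990, 1, 2, 0, 0)
# parse_birth_date(u'摩羯座')       -> None  (zodiac-only birthdays carry no date)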