def parse_org_info(self, response):
    """Extract info from an organization's home page."""
    request_list = self.make_request_list()
    for request in request_list:
        self.crawler.stats.inc_value('request_issued')
        yield request
    user_info_item = UserInfoItem()
    user_info_item['is_org'] = True
    user_info_item['uid'] = response.meta['uid']
    # Find the user name. If the request was redirected, the account
    # has a vanity domain.
    m = re_UserNamePage.search(response.url)
    if m is not None:
        user_info_item['username'] = user_info_item['uid']
    else:
        m = re_Site.search(response.url)
        user_info_item['username'] = m.group(1)
    soup = beautiful_soup(response.body)
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Core_T8CustomTriColumn__'):
                # Follow/fan counters.
                info_soup = beautiful_soup(data['html'])
                td_list = info_soup.find_all('td', attrs={'class': 'S_line1'})
                if len(td_list) != 3:
                    log.msg('Error parsing: %s' % response.url, log.INFO)
                else:
                    user_info_item['n_follows'] = int(td_list[0].find('strong').text.strip())
                    user_info_item['n_fans'] = int(td_list[1].find('strong').text.strip())
            elif domid.startswith('Pl_Core_UserInfo__'):
                # Nickname, category and intro.
                info_soup = beautiful_soup(data['html'])
                user_info_item['nickname'] = info_soup.find(
                    'p', attrs={'class': 'info'}).find('span').text.strip()
                ul_list = info_soup.find('ul', attrs={'class': 'ul_detail'})
                li_list = ul_list.find_all('li', attrs={'class': 'item S_line2 clearfix'})
                if len(li_list) == 0:
                    log.msg('Error parsing: %s' % response.url, log.INFO)
                else:
                    user_info_item['category'] = li_list[0].find(
                        'span', attrs={'class': 'item_text W_fl'}).text.strip()
                    if len(li_list) > 1:
                        user_info_item['intro'] = li_list[1].find(
                            'span', attrs={'class': 'item_text W_fl'}).text.strip()
    yield user_info_item
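# The 'FM.view' handling above strips the wrapper with chained str.replace
# calls; replace(';', '') also deletes any semicolons that appear inside the
# JSON payload itself. A minimal, hedged alternative that trims only the
# wrapper (extract_fm_view_payload is a hypothetical helper, not part of the
# original codebase):
import json
import re

_re_fm_view = re.compile(r'^FM\.view\((.*)\)\s*;?\s*$', re.S)

def extract_fm_view_payload(script_text):
    """Return the dict wrapped in FM.view(...), or None if it is absent."""
    m = _re_fm_view.match(script_text.strip())
    if m is None:
        return None
    return json.loads(m.group(1))

# Usage: data = extract_fm_view_payload(script.text)
#        if data is not None: domid = data['domid']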
def store_info(country=None):
    """Fetch metadata for all stores."""
    browse_page = utils.beautiful_soup('http://www.bricklink.com/browse.asp')
    country_links = (
        browse_page
        .find(text='Stores:').parent.parent.next_sibling
        .find_all('a', href=re.compile('countryID'))
    )
    result = []
    for country_link in country_links:
        country_name = country_link.text
        country_id = utils.get_params(country_link['href'])['countryID']
        # Skip this country link if we're only gathering data on one country.
        if country is not None and country_id != country:
            continue
        country_page = utils.beautiful_soup('http://www.bricklink.com' + country_link['href'])
        store_links = country_page.find_all('a', href=re.compile('store.asp'))
        for store_link in store_links:
            store_page = utils.beautiful_soup('http://www.bricklink.com/' + store_link['href'])
            params = utils.get_params(
                store_page.find('frame', src=re.compile('^storeTop.asp'))['src'])
            store_name = params['storeName']
            store_id = params['uID']
            country_name = params['cn']
            country_id = params['c']
            seller_name = params['p_seller']
            feedback = params['p_feedback']
            store_splash = utils.beautiful_soup(
                'http://www.bricklink.com/storeSplash.asp?uID=' + store_id)
            min_buy_elem = store_splash.find(text='Minimum Buy:')
            if min_buy_elem is not None:
                min_buy = (min_buy_elem.parent.parent.parent.parent
                           .next_sibling.find('font').text)
                try:
                    min_buy = re.search(r'US \$([0-9.]+)', min_buy).group(1)
                    min_buy = float(min_buy)
                except AttributeError:
                    # There's a minimum buy in a foreign currency :(
                    continue
            else:
                min_buy = 0.0
            ships_to_elem = store_splash.find(text='Store Ships To:')
            if ships_to_elem is not None:
                ships = (ships_to_elem.parent.parent.parent.parent
                         .next_sibling.find_all(text=True))
                ships = [unicode(x) for x in ships]
            else:
                ships = []
            entry = {
                'store_name': store_name,
                'store_id': int(store_id),
                'country_name': country_name,
                'country_id': country_id,
                'seller_name': seller_name,
                'feedback': int(feedback),
                'minimum_buy': min_buy,
                'ships': ships,
            }
            print entry
            result.append(entry)
    return result
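# store_info relies on utils.get_params to pull query parameters out of
# hrefs such as 'browse.asp?countryID=...'. The utils module is not shown
# here; a plausible sketch of such a helper (an assumption, using only the
# Python 2 stdlib):
import urlparse

def get_params(url):
    """Map each query parameter of `url` to its first value."""
    query = urlparse.urlparse(url).query
    return dict((k, v[0]) for k, v in urlparse.parse_qs(query).items())

# get_params('/browse.asp?countryID=US')  ->  {'countryID': 'US'}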
        self.body['pre_page'] = self.body['page']
        page_url = page_url + urllib.urlencode(self.body)
    elif page == 3:
        self.body['pagebar'] = '1'
        self.body['pre_page'] = self.body['page']
        page_url = page_url + urllib.urlencode(self.body)
    try:
        req = urllib2.Request(page_url)
        req.add_header('User-Agent', self.headers)
        res = urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        weiboMain.log.error('++++Open url [%s] failed. (Error code: %d) <@parser.get_weibo>++++'
                            % (page_url, e.code))
        return
    soup = beautiful_soup(res.read())
    if soup is None:
        return 0
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftProfileFeed__'):
                page_soup = beautiful_soup(data['html'])
                if page_soup is None:
                    weiboMain.log.error('++++Can not get weibo feed! <@parser.get_weibo>++++')
                    return 0
                weibo_list = page_soup.find('div', attrs={'class': 'WB_feed'})
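# The fragment above fetches one profile feed page in chunks: the initial
# load plus lazy-load requests distinguished by 'pagebar'. A hedged
# reconstruction of the URL building it appears to implement (the chunk
# numbering and 'pagebar' values come from the fragment; the rest is an
# assumption, not the original code):
import urllib

def build_chunk_urls(base_url, body):
    """Yield the three chunk URLs for one feed page (sketch)."""
    for chunk, pagebar in ((1, None), (2, '0'), (3, '1')):
        params = dict(body)
        if pagebar is not None:
            params['pagebar'] = pagebar
            params['pre_page'] = params['page']
        yield base_url + urllib.urlencode(params)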
def parse_user_info(self, response):  # default callback
    """Crawl an ordinary user's profile info."""
    # Queue the follow-up requests.
    request_list = self.make_request_list()
    for request in request_list:
        self.crawler.stats.inc_value('request_issued')
        yield request
    is_valid = True  # whether this item should be yielded
    log.msg('Parse url: %s' % response.url, level=log.INFO)
    # An organization account returns an error page here.
    if response.url.find('pagenotfound') > 0:
        is_valid = False
        log.msg('Page not found: %s' % response.url, log.ERROR)
        uid = response.meta['uid']
        new_url = 'http://weibo.com/u/%s' % uid
        # Visit the organization account's home page directly.
        request = Request(url=new_url, callback=self.parse_org_info,
                          meta={'uid': uid})
        self.crawler.stats.inc_value('request_issued')
        yield request
    # TODO: detect non-existent users: http://weibo.com/sorry?usernotexists&code=100001
    #       (although the user may actually exist).
    # TODO: detect blocked accounts: http://weibo.com/sorry?userblock&is_viewer&code=20003
    #       http://sass.weibo.com/accessdeny?uid=5445629123&ip=2682434316&location=1&callbackurl=http%3A%2F%2Fweibo.com%2Fu%2F2029154257
    # Extract user info from the profile page.
    user_info_item = UserInfoItem()
    user_info_item['is_org'] = False
    # Extract uid and page_id from the raw response body.
    m = re_UID.search(response.body)
    if m is not None:
        user_info_item['uid'] = m.group(1)
    else:
        log.msg('Error parsing uid: %s' % response.url, log.ERROR)
    m = re_PAGEID.search(response.body)
    if m is not None:
        user_info_item['page_id'] = m.group(1)
    else:
        log.msg('Error parsing page id: %s' % response.url, log.ERROR)
    # Parse with bs4.
    soup = beautiful_soup(response.body)
    new_style = False
    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftInfo__'):
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':    # basic info
                        profile_div = block_div
                    elif block_title == u'工作信息':  # career info
                        career_div = block_div
                    elif block_title == u'教育信息':  # education info
                        edu_div = block_div
                    elif block_title == u'标签信息':  # tag info
                        tags_div = block_div
            elif domid.startswith('Pl_Official_PersonalInfo__'):
                new_style = True
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                    block_title = block_div.find('h4', attrs={'class': 'obj_name'}).text.strip()
                    inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'})
                    if block_title == u'基本信息':
                        profile_div = inner_div
                    elif block_title == u'工作信息':
                        career_div = inner_div
                    elif block_title == u'教育信息':
                        edu_div = inner_div
                    elif block_title == u'标签信息':
                        tags_div = inner_div
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                user_info_item['avatar'] = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
                user_atten = header_soup.find('ul', attrs={'class': 'user_atten'})
                user_info_item['n_follows'] = int(
                    user_atten.find('strong', attrs={'node-type': 'follow'}).text)
                user_info_item['n_fans'] = int(
                    user_atten.find('strong', attrs={'node-type': 'fans'}).text)
            elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                # New-style follow/fan counters.
                header_soup = beautiful_soup(data['html'])
                tds = header_soup.find('table', attrs={'class': 'tb_counter'}).find_all('td')
                user_info_item['n_follows'] = int(tds[0].find('strong').text)
                user_info_item['n_fans'] = int(tds[1].find('strong').text)
            elif domid.startswith('Pl_Official_Headerv6__'):
                # New-style avatar info.
                header_soup = beautiful_soup(data['html'])
                user_info_item['avatar'] = header_soup.find(
                    'p', attrs='photo_wrap').find('img')['src']
                # Determine whether the user is verified.
                photo_div = header_soup.find_all(
                    'div', attrs={'class': 'pf_photo', 'node-type': 'photo'})
                if len(photo_div) > 0:
                    result = photo_div[0].find_all(
                        'a', attrs={'href': 'http://verified.weibo.com/verify'})
                    user_info_item['is_verified'] = len(result) > 0
                else:
                    log.msg('Can not find photo div: %s' % response.url, log.ERROR)
        elif 'STK' in text:
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                user_info_item['avatar'] = soup.find('img')['src']

    # Map visible profile labels to item fields, with optional converters.
    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'sex', 'func': lambda s: s == u'男'},  # True = male
        u'生日': {'field': 'birth'},
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'}
    }
    if profile_div is not None:
        if not new_style:
            divs = profile_div.find_all(attrs={'class': 'pf_item'})
        else:
            divs = profile_div.find_all('li', attrs={'class': 'li_1'})
        for div in divs:
            if not new_style:
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
            else:
                k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u':')
                d = div.find('span', attrs={'class': 'pt_detail'})
                if d:
                    v = d.text.strip()
                else:
                    v = div.find('a').text.strip()
            if k in profile_map:
                if k == u'个性域名' and '|' in v:  # vanity domain
                    v = v.split('|')[1].strip()
                func = profile_map[k].get('func', lambda s: s)
                user_info_item[profile_map[k]['field']] = func(v)

    user_info_item['work'] = []
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = dict()
                for p in div.find_all('p'):
                    a = p.find('a')
                    if a is not None:
                        work_info['name'] = a.text
                        text = p.text
                        if '(' in text:
                            work_info['date'] = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        if text.startswith(u'地区:'):    # location
                            work_info['location'] = text.split(u':', 1)[1]
                        elif text.startswith(u'职位:'):  # position
                            work_info['position'] = text.split(u':', 1)[1]
                        else:
                            work_info['detail'] = text
                user_info_item['work'].append(work_info)
        else:
            li = career_div.find('li', attrs={'class': 'li_1'})
            for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                work_info = dict()
                text = span.text
                a = span.find('a')
                if a is not None:
                    work_info['name'] = a.text
                if '(' in text:
                    work_info['date'] = text.strip().split('(')[1]\
                        .replace('\r', '').replace('\n', '').replace('\t', '')\
                        .split(')', 1)[0]
                for l in text.split('\r\n'):
                    l = l.strip()
                    if len(l) == 0:
                        continue
                    if l.startswith(u'地区:'):
                        work_info['location'] = l.split(u':', 1)[1]
                    elif l.startswith(u'职位:'):
                        work_info['position'] = l.split(u':', 1)[1]
                    else:
                        work_info['detail'] = text.replace('\r', '')\
                            .replace('\n', '').replace('\t', '').strip()
                user_info_item['work'].append(work_info)

    user_info_item['edu'] = []
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = dict()
                for p in div.find_all('p'):
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info['name'] = a.text
                        if '(' in text:
                            edu_info['date'] = text.strip().split('(')[1].strip().strip(')')
                    else:
                        edu_info['detail'] = text
                user_info_item['edu'].append(edu_info)
        else:
            span = edu_div.find('li', attrs={'class': 'li_1'})\
                .find('span', attrs={'class': 'pt_detail'})
            text = span.text
            names = [a.text for a in span.find_all('a')]
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]
                edu_info = dict()
                edu_info['name'] = name
                if '(' in text:
                    edu_info['date'] = t.strip().split('(')[1]\
                        .replace('\r', '').replace('\n', '').replace('\t', '')\
                        .split(')', 1)[0]
                    t = t[t.find(')') + 1:]
                text = text[end_pos:]
                edu_info['detail'] = t.replace('\r', '').replace('\n', '')\
                    .replace('\t', '').strip()
                user_info_item['edu'].append(edu_info)

    user_info_item['tags'] = []
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    user_info_item['tags'].append(a.text)
        else:
            for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'):
                user_info_item['tags'].append(a.text.strip())

    log.msg('parse %s finish' % response.url, log.INFO)
    # Check whether the spider has been banned; use .get() because
    # n_follows may never have been parsed.
    if not user_info_item.get('n_follows'):
        log.msg('The spider may have been banned.', log.ERROR)
    elif is_valid:
        yield user_info_item
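# The profile_map dispatch in parse_user_info maps a visible label to an
# item field plus an optional converter. A standalone illustration of the
# same pattern (apply_profile_map and the sample rows are made up for the
# example):
profile_map = {
    u'昵称': {'field': 'nickname'},
    u'性别': {'field': 'sex', 'func': lambda s: s == u'男'},
}

def apply_profile_map(rows, item):
    """rows: iterable of (label, raw_value) pairs scraped from a page."""
    for k, v in rows:
        if k in profile_map:
            func = profile_map[k].get('func', lambda s: s)
            item[profile_map[k]['field']] = func(v)
    return item

# apply_profile_map([(u'昵称', u'foo'), (u'性别', u'男')], {})
#   -> {'nickname': u'foo', 'sex': True}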
def parse_user_card_info_text(self, uid, html_data):
    """Parse user info from an HTML body.

    The extracted fields include: avatar, uid, nickname, desc, location.
    NOTE: Splitting this parsing into its own function serves two purposes:
    1) it can be tested independently; 2) it keeps parse_user_card_info readable.
    """
    user_info_item = UserInfoItem()
    # user_info_item['raw_html'] = html_data  # keep all the raw data
    user_info_item['uid'] = uid
    user_info_item['existed'] = True
    soup = beautiful_soup(html_data)
    nc_head = soup.find('div', attrs={'class': 'nc_head'})
    nc_content = soup.find('div', attrs={'class': 'nc_content'})
    pic_box = nc_head.find('div', attrs={'class': 'pic_box'})
    alist = pic_box.find_all('a')
    url = alist[0]['href']
    m = re_UserCardUrl.search(url)
    user_info_item['username'] = m.group(1)
    if user_info_item['username'].startswith('u/'):
        user_info_item['username'] = user_info_item['username'][2:]
    user_info_item['nickname'] = pic_box.a.img['title']
    user_info_item['avatar'] = pic_box.a.img['src']
    user_info_item['is_org'] = False
    if len(alist) > 1:
        user_info_item['is_verified'] = True
        # Decide whether this is an organization account or a verified person.
        if alist[1].i['class'][1] == 'icon_pf_approve_co':
            user_info_item['is_org'] = True
    else:
        user_info_item['is_verified'] = False
    mask = nc_head.find('div', attrs={'class': 'mask'})
    name = mask.find('div', attrs={'class': 'name'})
    user_info_item['nickname'] = name.a['title']
    user_info_item['sex'] = (name.em['title'] == u'男')  # True = male
    intro = mask.find('div', attrs={'class': 'intro W_autocut'})
    if intro.text.strip() != '':  # the personal intro may be absent
        user_info_item['intro'] = intro.span['title']

    # Follow and fan counts.
    def parse_number(text):
        # Extract the count; the u'万' suffix means ten thousand.
        num = int(re.search(r'\d+', text).group())
        if text.find(u'万') >= 0:
            num *= 10000
        return num

    follow_text = nc_content.find('span', attrs={'class': 'c_follow W_fb'}).text.strip()
    user_info_item['n_follows'] = parse_number(follow_text)
    fans_text = nc_content.find('span', attrs={'class': 'c_fans W_fb'}).text.strip()
    user_info_item['n_fans'] = parse_number(fans_text)
    weibo_text = nc_content.find('span', attrs={'class': 'c_weibo W_fb'}).text.strip()
    user_info_item['n_weibo'] = parse_number(weibo_text)
    user_info_list = nc_content.find_all('li', attrs={'class': 'info_li'})
    # The card may lack location info, e.g. http://weibo.com/1947597977/info
    if len(user_info_list) >= 1:
        user_info_item['location'] = user_info_list[0].a['title']
    if len(user_info_list) >= 2:
        if user_info_list[1].text.find(u'毕业于') >= 0:  # 'graduated from'
            user_info_item['edu'] = user_info_list[1].a['title']
            if len(user_info_list) >= 3:
                user_info_item['work'] = user_info_list[2].a['title']
        else:
            user_info_item['work'] = user_info_list[1].a['title']
    return user_info_item
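# parse_number above truncates at the decimal point: u'3.5万' becomes
# 3 * 10000 = 30000 because re.search(r'\d+', ...) stops at '.'. A hedged
# variant that keeps the fractional part (a sketch, not the original
# behavior):
import re

def parse_number(text):
    """Parse a Weibo counter, expanding the u'万' (ten-thousand) suffix."""
    num = float(re.search(r'\d+(?:\.\d+)?', text).group())
    if u'万' in text:
        num *= 10000
    return int(num)

# parse_number(u'粉丝 3.5万') -> 35000;  parse_number(u'关注 123') -> 123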