Code Example #1
File: parsers.py  Project: huangzhiyong/cola
    def parse(self, url=None):
        if not self.bundle.exists:
            return [], []

        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid == 'Pl_Official_LeftInfo__13':
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={'class': 'pf_item'}):
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(
                                ')')
                    else:
                        text = p.text
                        if text.startswith(u'地区:'):
                            work_info.location = text.split(u':', 1)[1]
                        elif text.startswith(u'职位:'):
                            work_info.position = text.split(u':', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip(
                                ')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)

        weibo_user.save()
        self.logger.debug('parse %s finish' % url)
        return [], []
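Every example on this page leans on the same trick: Weibo ships each pagelet as a <script>FM.view({...});</script> call whose argument is JSON carrying a domid and an html fragment. Below is a minimal, self-contained sketch of that extraction; the sample markup is fabricated, and plain bs4 stands in for the project's beautiful_soup wrapper.

import json
from bs4 import BeautifulSoup

SAMPLE = u'''
<script>FM.view({"domid": "Pl_Official_Header__1",
"html": "<div class=\\"pf_head_pic\\"><img src=\\"a.jpg\\"/></div>"});</script>
'''

def iter_fm_view_payloads(page_html):
    soup = BeautifulSoup(page_html, 'html.parser')
    for script in soup.find_all('script'):
        text = (script.string or '').strip()
        if not text.startswith('FM.view('):
            continue
        # peel off the "FM.view(" prefix and the trailing ");", leaving the JSON argument
        yield json.loads(text[len('FM.view('):].rstrip(';')[:-1])

for data in iter_fm_view_payloads(SAMPLE):
    print(data['domid'])  # -> Pl_Official_Header__1

Unlike the global replace(';', '') used in the examples, this slice-based unwrap leaves semicolons inside the embedded HTML intact.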
Code Example #2
File: parsers.py  Project: linVdcd/cola
    def parse(self, url=None):
        if not self.bundle.exists:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('Taking a 10-minute break!')
            time.sleep(60 * 10)
            br = self.opener.browse_open(url)  # retry once after backing off

        # self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find(
                            'h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs={'class': 'photo_wrap'})\
                                                .find('img')['src']
            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
        # self.logger.debug('parse %s finish' % url)

        # bump the counter for the processed profile url
        self.counter.inc('processed_profile_page', 1)
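The profile_map dispatch that both examples share (Chinese label → model field, plus an optional 'func' converter) is worth isolating. A stripped-down sketch, with a hypothetical UserInfo stand-in:

class UserInfo(object):
    pass

PROFILE_MAP = {
    u'昵称': {'field': 'nickname'},
    u'性别': {'field': 'sex', 'func': lambda s: s == u'男'},
}

def apply_profile(info, pairs):
    for k, v in pairs:
        if k not in PROFILE_MAP:
            continue
        entry = PROFILE_MAP[k]
        convert = entry.get('func', lambda s: s)  # identity when no converter is given
        setattr(info, entry['field'], convert(v))

info = UserInfo()
apply_profile(info, [(u'昵称', u'cola'), (u'性别', u'男')])
print(info.nickname, info.sex)  # cola True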
Code Example #3
File: parsers.py  Project: brightgems/cola
class UserHomePageParser(WeiboParser):
    def extract_user_info(self, soup, weibo_user):
        div_pi = soup.find('div', attrs={'class': 'PCD_person_info'})
        # verified corporate account
        bs_verify = div_pi.find('a', attrs={'class': 'icon_verify_co_v'})
        weibo_user.info.is_person = False if bs_verify else True

        # vip person
        bs_vip = div_pi.find('a', attrs={'class': 'icon_verify_v'})
        weibo_user.info.vip = True if bs_vip else False
        weibo_user.info.verified = True if bs_verify or bs_vip else False
        weibo_user.info.level = int(
            div_pi.find('a', attrs={
                'class': 'W_icon_level'
            }).text.split('.')[1])

    def extract_user_counter(self, soup, weibo_user):
        # msg counter
        tds = soup.find('table', attrs={'class': 'tb_counter'}).find_all('td')

        if tds:
            weibo_user.info.n_follows = int(tds[0].find('strong').text)
            weibo_user.info.n_fans = int(tds[1].find('strong').text)
            weibo_user.info.n_msgs = int(tds[2].find('strong').text)

    def parse(self, url=None):
        if self.bundle.exists is False:
            return
        url = url or self.url
        html = ''
        opener = None

        try:
            #if hasattr(self.opener,'nalbr'):
            #    opener = self.opener.nalbr # no account login browser
            #else:
            #    opener = MechanizeOpener(timeout=10,user_agent=user_config.conf.opener.user_agent)

            #    p_ = get_ip_proxy()
            #    self.logger.info(p_)
            #    opener.add_proxy(p_,'http')
            #    self.opener.nalbr = opener
            opener = self.opener
            opener.addheaders = [('User-Agent',
                                  user_config.conf.opener.user_agent)]
            html = to_unicode(opener.open(url, timeout=10))

            opener.browser.clear_history()  # resolve memory issue
        except Exception:
            if opener:
                opener.browser.close()
            raise Exception("got banned on user page")

        if not html:
            return

        soup = beautiful_soup(html)
        weibo_user = self.get_weibo_user()
        if weibo_user.info is None:
            weibo_user.info = UserInfo()

        # find page_id
        try:
            pid_ = re.findall(r"CONFIG\['page_id'\]='(.*)';", html)[0]
        except IndexError:
            if opener:
                opener.browser.close()
            if hasattr(self.opener, 'nalbr'):
                del self.opener.nalbr
            raise FetchBannedError("got banned on user page")

        domain_ = re.findall(r"CONFIG\['domain'\]='(.*)';", html)[0]
        nickname_ = ''  # filled in when the header pagelet is found

        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith("Pl_Core_UserInfo"):
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_info(header_soup, weibo_user)
                elif domid.startswith("Pl_Official_Header"):
                    header_soup = beautiful_soup(data['html'])
                    # nickname
                    nickname_ = header_soup.find(
                        'div', attrs={'class': 'pf_username'}).text
                elif domid.startswith("Pl_Core_T8CustomTriColumn"):
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_counter(header_soup, weibo_user)

        self.bundle.pid = pid_
        self.bundle.domain = domain_
        weibo_user.pid = pid_
        weibo_user.info.domain = domain_
        weibo_user.info.nickname = nickname_.strip()
        weibo_user.save()

        # counter add one for the processed user home list url
        self.counter.inc('processed_weibo_user_home_page', 1)
        time.sleep(1)
        if fetch_userprofile and weibo_user.info.is_person and not weibo_user.info.location:
            yield 'http://weibo.com/p/%s/info' % pid_
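The page_id/domain lookup works because the page embeds JavaScript assignments of the form CONFIG['key']='value'. A small sketch of that regex scrape over fabricated HTML; the character-class capture is slightly safer than the greedy (.*) used above:

import re

html = u"<script>CONFIG['page_id']='1005051234567890';CONFIG['domain']='100505';</script>"

def config_value(html, key):
    # stop at the closing quote instead of running to the last match on the line
    m = re.search(r"CONFIG\['%s'\]='([^']*)';" % re.escape(key), html)
    return m.group(1) if m else None

print(config_value(html, 'page_id'))  # 1005051234567890
print(config_value(html, 'domain'))   # 100505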
Code Example #4
    def save_blog_detail(self, div, mblog):
        
        content_div = div.find('p', attrs={'node-type': 'feed_list_content'})
        mblog.content = content_div.text
        blog_create_date = parse(div.find('a', attrs={'node-type': 'feed_list_item_date'})['title'])
        mblog.created = blog_create_date
        mblog.last_update = datetime.now()

        is_forward = div.get('isforward')
        if is_forward:
            # record the original user and message
            mblog.omid = div['omid']
            tbinfos = div['tbinfo'].split('&')
            mblog.ouid = tbinfos[0].split('=')[1]
            name_a = div.find('a', attrs={
                'class': 'WB_name', 
                'node-type': 'feed_list_originNick' 
            })
            text_a = div.find('div', attrs={
                'class': 'WB_text',
                'node-type': 'feed_list_reason'
            })
            if name_a is not None and text_a is not None:
                mblog.forward = '%s: %s' % (name_a.text, text_a.text)
        

        func_div = div.find_all('div', attrs={'class':'feed_action'})[-1]
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)
            
        likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).find('em')
        if likes:
            likes = likes.text.strip('(').strip(')').replace(',', '')
            mblog.n_likes = int(likes) if likes.isdigit() else 0
        forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).find('em')
        if forwards:
            forwards = forwards.text.strip('(').strip(')').replace(',', '')
            mblog.n_forwards = int(forwards) if forwards.isdigit() else 0
        comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).find('em')
        if comments:
            comments = comments.text.strip('(').strip(')').replace(',', '')
            mblog.n_comments = int(comments) if comments.isdigit() else 0
        # parse uid
        a = func_div.find('a', attrs={'action-type': 'feed_list_forward'})['action-data']
        u = urllib_parse.unquote(a[a.find('url='):])
        qs = urllib_parse.parse_qs(u)
        if 'uid' not in qs:
            print(qs)
        mblog.uid = qs['uid'][0]

        # save user
        weibo_user = self.get_weibo_user(mblog.uid)
        if not (weibo_user.info and weibo_user.info.nickname):
            if 'pid' in qs:
                weibo_user.pid = qs['pid'][0]
            if weibo_user.info is None:
                weibo_user.info = UserInfo()
                weibo_user.info.nickname = qs['name'][0]
            weibo_user.save()
    
        # has_video
        div_video = div.find('div', attrs={'node-type': 'fl_h5_video_disp'}) or \
            div.find('span', attrs={'class': 'icon_playvideo'})
        mblog.has_video = True if div_video else False
        mblog.save()
        return (weibo_user, mblog)
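The uid recovery at the end of save_blog_detail unquotes the forward link's action-data attribute and reads the tail as a query string. A sketch with a fabricated action-data value, mirroring the original's unquote-then-parse order (on Python 3, urllib_parse is urllib.parse; the project presumably aliases the Python 2 equivalents):

from urllib import parse as urllib_parse  # Python 3 name for the module aliased above

action_data = 'allowForward=1&url=http%3A%2F%2Fweibo.com%2F123&uid=123456&name=demo'
# take everything from 'url=' onward, decode it, then parse as a query string
u = urllib_parse.unquote(action_data[action_data.find('url='):])
qs = urllib_parse.parse_qs(u)
print(qs['uid'][0], qs['name'][0])  # 123456 demo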
Code Example #5
class UserHomePageParser(WeiboParser):
    def extract_user_info(self, soup, weibo_user):
        div_pi = soup.find('div', attrs={'class': 'PCD_person_info'})
        # verified corporate account
        bs_verify = div_pi.find('a', attrs={'class': 'icon_verify_co_v'})
        weibo_user.info.is_person = False if bs_verify else True
        
        # vip person
        bs_vip = div_pi.find('a', attrs={'class': 'icon_verify_v'})
        weibo_user.info.vip = True if bs_vip else False
        weibo_user.info.verified = True if bs_verify or bs_vip else False
        weibo_user.info.level = int(
            div_pi.find('a', attrs={'class': 'W_icon_level'}).text.split('.')[1])

    def extract_user_counter(self, soup, weibo_user):
        # msg counter
        tds = soup.find('table', attrs={'class': 'tb_counter'}).find_all('td')

        if tds:
            weibo_user.info.n_follows = int(tds[0].find('strong').text)
            weibo_user.info.n_fans = int(tds[1].find('strong').text)
            weibo_user.info.n_msgs = int(tds[2].find('strong').text)


    def parse(self, url=None):
        url = url or self.url
        html = ''
        opener = self.opener

        try:
            opener.addheaders = [('User-Agent', user_config.conf.opener.user_agent)]
            html = to_unicode(opener.open(url, timeout=10))

            opener.browser.clear_history()  # resolve memory issue
        except Exception:
            if opener:
                opener.browser.close()
            raise Exception("got banned on user page")
        
        try:
            uid = re.findall(r"CONFIG\['oid'\]='(.*)';", html)[0]
        except IndexError:
            raise FetchBannedError("got banned on blog page")

        soup = beautiful_soup(html)
        weibo_user = self.get_weibo_user(uid)
        if weibo_user.info is None:
            weibo_user.info = UserInfo()

        # find page_id
        try:
            pid_ = re.findall(r"CONFIG\['page_id'\]='(.*)';", html)[0]
        except IndexError:
            raise FetchBannedError("got banned on user page")

        domain_ = re.findall(r"CONFIG\['domain'\]='(.*)';", html)[0]
        nickname_ = ''  # filled in when the header pagelet is found

        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith("Pl_Core_UserInfo") and 'html' in data:
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_info(header_soup, weibo_user)
                elif domid.startswith("Pl_Official_Header"):
                    header_soup = beautiful_soup(data['html'])
                    # nickname
                    nickname_ = header_soup.find('div', attrs={'class': 'pf_username'}).text
                elif domid.startswith("Pl_Core_T8CustomTriColumn") and 'html' in data:
                    header_soup = beautiful_soup(data['html'])
                    self.extract_user_counter(header_soup, weibo_user)

        weibo_user.pid = pid_
        weibo_user.info.domain = domain_
        weibo_user.info.nickname = nickname_.strip()
        weibo_user.save()

        # counter add one for the processed user home list url
        self.counter.inc('processed_weibo_user_home_page', 1)
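extract_user_counter assumes the follows/fans/messages counts sit in the first three <td> cells of the tb_counter table. A tiny sketch against fabricated markup:

from bs4 import BeautifulSoup

SAMPLE = u'''
<table class="tb_counter"><tr>
  <td><strong>12</strong></td><td><strong>345</strong></td><td><strong>678</strong></td>
</tr></table>
'''
tds = BeautifulSoup(SAMPLE, 'html.parser').find(
    'table', attrs={'class': 'tb_counter'}).find_all('td')
n_follows, n_fans, n_msgs = (int(td.find('strong').text) for td in tds)
print(n_follows, n_fans, n_msgs)  # 12 345 678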
Code Example #6
    def parse(self, url=None):
        if not self.bundle.exists:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        soup = BeautifulSoup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = BeautifulSoup(data['html'])

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={'class': 'pf_item'}):
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
                if k in profile_map:
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        if text.startswith(u'地区:'):
                            work_info.location = text.split(u':', 1)[1]
                        elif text.startswith(u'职位:'):
                            work_info.position = text.split(u':', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)

        weibo_user.save()
        return [], []
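This older variant handles the pre-FM.view wrapper, STK.pageletM.view({...}), where the payload again carries a pid and an html fragment. Its unwrapping reduces to one replace and one slice, sketched here on a fabricated script body:

import json

text = 'STK && STK.pageletM && STK.pageletM.view({"pid": "pl_profile_infoBase", "html": "<div></div>"})'
if 'STK' in text:
    # drop the guard-and-call prefix, then the trailing ')'
    payload = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
    data = json.loads(payload)
    print(data['pid'])  # -> pl_profile_infoBase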
Code Example #7
File: parsers.py  Project: brightgems/cola
    def parse(self, url=None):

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
            html = br.response().read()

            if not self.check(url, br):
                return

            self.uid = re.findall(r"CONFIG\['oid'\]='(.*)';", html)[0]
        except Exception:
            raise FetchBannedError("got banned on blog page")

        weibo_user = self.get_weibo_user(self.uid)
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        soup = beautiful_soup(html)
        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find(
                            'h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1' and 'html' in data:
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']

                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__') and 'html' in data:
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs={'class': 'photo_wrap'})\
                                                .find('img')['src']
                    bs_verified = header_soup.find(
                        'a',
                        attrs={
                            "suda-data": "key=pc_apply_entry&value=feed_icon"
                        })
                    weibo_user.info.verified = True if bs_verified else False
                    bs_vip = header_soup.find(
                        'a',
                        attrs={
                            "suda-uatrack": "key=home_vip&value=home_feed_vip"
                        })
                    weibo_user.info.vip = True if bs_vip else False
                    weibo_user.info.pf_intro = header_soup.find(
                        'div', attrs={'class': 'pf_intro'}).text
                elif domid.startswith('Pl_Official_RightGrowNew'):
                    header_soup = beautiful_soup(data['html'])
                    level_spans = header_soup.find(
                        'p', attrs={'class': 'level_info'}).find_all(
                            'span', attrs={'class': 'S_txt1'})
                    weibo_user.info.level_score = int(level_spans[1].text.strip())
                    weibo_user.info.level = int(
                        level_spans[0].text.strip().split('.')[1])

            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'gender'
            },
            u'生日': {
                'field': 'birth',
                'func': lambda v: datetime.strptime(
                    v.replace(u'年', '/').replace(u'月', '/').replace(u'日', ''),
                    '%Y/%m/%d') if re.match(u'\d+年\d+月\d+日', v) else None
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            },
            u'注册时间': {
                'field': 'register_date'
            }
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()

                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    try:
                        v = func(v)
                    except Exception:
                        v = None
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()

        # counter add one for the profile url
        self.counter.inc('processed_profile_page', 1)
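The birthday converter in this example's profile_map accepts only full 年/月/日 dates and maps everything else to None. A standalone sketch of that behavior (Python 3 literals assumed):

import re
from datetime import datetime

def parse_birth(v):
    # only full dates like u'1990年10月1日' qualify; partial dates map to None
    if not re.match(u'\\d+年\\d+月\\d+日', v):
        return None
    return datetime.strptime(
        v.replace(u'年', '/').replace(u'月', '/').replace(u'日', ''), '%Y/%m/%d')

print(parse_birth(u'1990年10月1日'))  # 1990-10-01 00:00:00
print(parse_birth(u'10月1日'))        # None (year missing)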