Example #1
0
 def parse(self, url=None):
     if self.bundle.exists is False:
         return [], []
     
     url = url or self.url
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     soup = beautiful_soup(br.response().read())
     
     if not self.check(url, br):
         return [], []
     
     weibo_user = self.get_weibo_user()
     info = weibo_user.info
     if info is None:
         weibo_user.info = UserInfo()
         
     profile_div = None
     relation_div = None
     career_div = None
     edu_div = None
     tags_div = None
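     # Weibo delivers each page module as a JSON argument to FM.view(...)
     # inside <script> tags; strip that wrapper to recover the JSON payload.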
     for script in soup.find_all('script'):
         text = script.text
         if text.startswith('FM.view'):
             text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
             data = json.loads(text)
             if 'domid' not in data or 'html' not in data:
                 # self.logger.warn('domid or html is missing, url:%s' % url)
                 continue
             domid = data['domid']
             if domid.startswith('Pl_Official_PersonalInfo__'):
                 info_soup = beautiful_soup(data['html'])
Example #2
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return
        
        url = url or self.url

        br = self.opener.browse_open(url)
        # self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())
        
        if not self.check(url, br):
            return
        
        weibo_user = self.get_weibo_user()
        
        html = None
        decodes = urldecode(url)
        is_follow = True
        is_new_mode = False
        is_banned = True
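        # Assume the fetch was banned until at least one FM.view/STK pagelet
        # shows up among the page scripts.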
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                is_banned = False
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftHisRelation__') or \
                    domid.startswith('Pl_Official_HisRelation__'):
                    html = beautiful_soup(data['html'])
                if 'relate' in decodes and decodes['relate'] == 'fans':
                    is_follow = False
                is_new_mode = True
            elif 'STK' in text:
                is_banned = False
                text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                if data['pid'] == 'pl_relation_hisFollow' or \
                    data['pid'] == 'pl_relation_hisFans':
                    html = beautiful_soup(data['html'])
                if data['pid'] == 'pl_relation_hisFans':
                    is_follow = False

        if is_banned:
            raise FetchBannedError('fetch banned by weibo server')

        ul = None
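        # The user list <ul> is class 'cnfList' in the old layout and
        # 'follow_list' in the new one; html stays None when no relation
        # pagelet was found, which triggers the AttributeError below.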
        try:
            ul = html.find(attrs={'class': 'cnfList', 'node-type': 'userListBox'})
            if ul is None:
                ul = html.find(attrs={'class': 'follow_list', 'node-type': 'userListBox'})
        except AttributeError:
            if br.geturl().startswith('http://e.weibo.com'):
                return
            raise
Example #3
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return [], []
        
        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())
        
        if not self.check(url, br):
            return [], []
        
        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()
            
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
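        # Collect the JSON payload of every FM.view(...) script block up front.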
        texts = [script.text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                 for script in soup.find_all('script')
                 if script.text.startswith('FM.view')
        ]
        for text in texts:
            try:
                data = json.loads(text)
            except ValueError:
                return [], []

            domid = data['domid']
            if domid.startswith('Pl_Official_PersonalInfo__'):
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap S_bg2'}):
                    block_title = block_div.find('span', attrs={'class': 'main_title'}).text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div

            elif domid.startswith('Pl_Official_Header'):
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_photo'})\
                                            .find('img')['src']
            elif domid.startswith('Pl_Core_T8CustomTriColumn'):
                follow_soup = beautiful_soup(data['html'])
                follows = follow_soup.find_all('td', attrs={'class': 'S_line1'})

                weibo_user.info.n_follows = int(follows[0].find('strong').text)
                weibo_user.info.n_fans = int(follows[1].find('strong').text)
Example #4
0
    def _get_article(self, candidates, best_candidate):
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        
        sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
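        # A sibling qualifies if it scores at least 20% of the best
        # candidate's score, with a floor of 10.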
        output = beautiful_soup("<div/>")
        for sibling in best_candidate['elem'].parent.contents:
            if isinstance(sibling, NavigableString):
                continue
            append = False
            if sibling is best_candidate['elem']:
                append = True
            sibling_key = HashableElement(sibling)
            if sibling_key in candidates and \
                candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.name == "p":
                link_density = self._get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                output.div.append(sibling)
                
        return output
Example #5
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url

        br = self.opener.browse_open(url)
        # self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        html = None
        decodes = urldecode(url)
        is_follow = True
        is_new_mode = False
        for script in soup.find_all("script"):
            text = script.text
            if text.startswith("FM.view"):
                text = text.strip().replace(";", "").replace("FM.view(", "")[:-1]
                data = json.loads(text)
                domid = data["domid"]
                if domid.startswith("Pl_Official_LeftHisRelation__") or domid.startswith("Pl_Official_HisRelation__"):
                    html = beautiful_soup(data["html"])
                if "relate" in decodes and decodes["relate"] == "fans":
                    is_follow = False
                is_new_mode = True
            elif "STK" in text:
                text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1]
                data = json.loads(text)
                if data["pid"] == "pl_relation_hisFollow" or data["pid"] == "pl_relation_hisFans":
                    html = beautiful_soup(data["html"])
                if data["pid"] == "pl_relation_hisFans":
                    is_follow = False

        ul = None
        try:
            ul = html.find(attrs={"class": "cnfList", "node-type": "userListBox"})
            if ul is None:
                ul = html.find(attrs={"class": "follow_list", "node-type": "userListBox"})
        except AttributeError:
            if br.geturl().startswith("http://e.weibo.com"):
                return
            raise
Example #6
0
 def parse(self, url=None):
     if self.bundle.exists is False:
         return [], []
     
     url = url or self.url
     
     br, soup = None, None
     try:
         br = self.opener.browse_open(url)
         soup = beautiful_soup(br.response().read())
     except Exception as e:
         return self._error(url, e)
Example #7
0
 def parse(self, url=None):
     if self.bundle.exists is False or self.bundle.level >= MAX_LEVEL:
         return [], []
     
     url = url or self.url
     
     br, soup = None, None
     try:
         br = self.opener.browse_open(url)
         self.logger.debug('load %s finish' % url)
         soup = beautiful_soup(br.response().read())
     except Exception as e:
         return self._error(url, e)
Example #8
0
 def process(self, base_url=None):
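     # Pipeline: strip crufty markup, parse the soup, fix references against
     # the base URL, then extract the title and body.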
     self.html = self._remove_crufy_html(self.html)
     
     self.soup = beautiful_soup(self.html, self.logger)
     
     base_url = self.base_url or base_url
     if base_url is not None:
         self._fix_references(base_url)
         
     title = self.get_title(self.soup)
     body = self.get_body(self.soup)
     
     return title, body
Example #9
0
 def parse(self, url=None):
     if self.bundle.exists is False:
         return [], []
     
     url = url or self.url
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     soup = beautiful_soup(br.response().read())
     
     if not self.check(url, br):
         return [], []
     
     weibo_user = self.get_weibo_user()
     
     weibo_user.qids = []
     lis = soup.find_all('li', attrs={'action-type': 'click_link'})
     for li in lis:
         if li.has_attr('action-data'):
             li_data = li['action-data']
             if '/' in li_data:
                 weibo_user.qids.append(li_data.rsplit('/', 1)[1])
     
     weibo_user.save()
     return [], []
Example #10
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return [], []

        url = url or self.url
        br = None
        jsn = None
        try:
            br = self.opener.browse_open(url)
            self.logger.debug('load %s finish' % url)
            jsn = json.loads(br.response().read())
        except (ValueError, URLError) as e:
            return self._error(url, e)

        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']

        if not self.check(url, br):
            return [], []

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

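        # Fill avatar, creation time and text content from a <dl> entry;
        # nested <div>/<span> decorations are stripped before reading the text.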
        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        if url.startswith('http://weibo.com/aj/comment'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                uid = dl.find('a', usercard=True)['usercard'].split("id=",
                                                                    1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward_again_a = dl.find(
                    'a',
                    attrs={
                        'action-type': re.compile("^(feed_list|fl)_forward$")
                    })
                uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
                forward = Forward(uid=uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        try:
            mblog.save()
            self.logger.debug('parse %s finish' % url)
        except ValidationError as e:
            return self._error(url, e)
Example #11
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return [], []
        
        url = url or self.url
        br = None
        jsn = None
        try:
            br = self.opener.browse_open(url)
            self.logger.debug('load %s finish' % url)
            jsn = json.loads(br.response().read())
        except (ValueError, URLError) as e:
            return self._error(url, e)
        
        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']
        
        if not self.check(url, br):
            return [], []
        
        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))
        
        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date_source = dl.find('dd').find('span', attrs={'class': 'S_txt2'})
            if date_source is not None:
                date = date_source.text
            else:
                date_source = dl.find('dd').find('span', attrs={'class': 'fl'}).find('em', attrs={'class': 'S_txt2'})
                date = date_source.text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'): div.extract()
            for span in dl.find_all('span'): span.extract()
            instance.content = dl.text.strip()
        
        if url.startswith('http://weibo.com/aj/comment'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                if fetch_comment_limit > 0 and self.bundle.fetched_weibo_comment_num >= fetch_comment_limit:
                    self.bundle.fetched_weibo_comment_num = 0
                    try:
                        mblog.save()
                        self.logger.debug('parse %s finish' % url)
                    except ValidationError as e:
                        return self._error(url, e)
                    return [], []
                link = dl.find('a', attrs={'action-type': 'replycomment'})
                data = dict([l.split('=') for l in link['action-data'].split('&')])
                if fetch_comment_limit > 0 and self.bundle.fetched_last_comment_id != data['mid']:
                    self.bundle.fetched_weibo_comment_num = 0
                    
                comment = Comment(uid=data['ouid'], mid=data['mid'])
                set_instance(comment, dl)
                
                mblog.comments.append(comment)
                self.bundle.fetched_last_comment_id = data['mid']
                self.bundle.fetched_weibo_comment_num += 1
Example #12
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return
        
        url = url or self.url
        br = self.opener.browse_open(url)
        # self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())
        
        if not self.check(url, br):
            return
        
        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()
            
        new_style = False
        
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
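        # Two profile layouts exist: the old pagelet style (Pl_Official_LeftInfo)
        # and the new card style (Pl_Official_PersonalInfo); new_style records
        # which one was seen so the field extraction below can adapt.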
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find('h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
            elif 'STK' in text:
                text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']
        
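        # Map Chinese profile labels onto UserInfo fields; an optional 'func'
        # converts the raw text (e.g. sex to a boolean).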
        profile_map = {
            u'昵称': {'field': 'nickname'},
            u'所在地': {'field': 'location'},
            u'性别': {'field': 'sex',
                    'func': lambda s: s == u'男'},
            u'生日': {'field': 'birth'},
            u'博客': {'field': 'blog'},
            u'个性域名': {'field': 'site'},
            u'简介': {'field': 'intro'},
            u'邮箱': {'field': 'email'},
            u'QQ': {'field': 'qq'},
            u'MSN': {'field': 'msn'}
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)
                
        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split('(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()
                    
                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()
                    
                    weibo_user.info.work.append(work_info)
            
        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split('(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)
                
                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx+1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos: end_pos]
                    
                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')')+1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)
                    
        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
        # self.logger.debug('parse %s finish' % url)

        # count this profile page as processed
        self.counter.inc('processed_profile_page', 1)
Example #13
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return
        
        url = url or self.url
        br = self.opener.browse_open(url)
        try:
            jsn = json.loads(br.response().read())
        except ValueError:
            raise FetchBannedError('fetch banned by weibo server')

        # self.logger.debug('load %s finish' % url)

        try:
            soup = beautiful_soup(jsn['data']['html'])
            current_page = jsn['data']['page']['pagenum']
            n_pages = jsn['data']['page']['totalpage']
        except KeyError:
            raise FetchBannedError('fetch banned by weibo server')
        
        if not self.check(url, br):
            return
        
        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))
        
        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()
        
        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'): div.extract()
            for span in dl.find_all('span'): span.extract()
            instance.content = dl.text.strip()

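        # Dispatch on the AJAX endpoint to decide which collection to extend.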
        counter_type = None
        if url.startswith('http://weibo.com/aj/comment'):
            counter_type = 'comment'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)
                
                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            counter_type = 'forward'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward_again_a = dl.find('a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
                uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
                forward = Forward(uid=uid, mid=dl['mid'])
                set_instance(forward, dl)
                
                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            counter_type = 'like'
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']
                
                mblog.likes.append(like)

        mblog.save()
        # self.logger.debug('parse %s finish' % url)

        # count the processed forward, comment, or like list page
        if counter_type is not None:
            self.counter.inc('processed_%s_list_page' % counter_type, 1)

        if current_page >= n_pages:
            return
        
        params = urldecode(url)
        new_params = urldecode('?page=%s' % (current_page + 1))
        params.update(new_params)
        params['__rnd'] = int(time.time()*1000)
        next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
        yield next_page
Example #14
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        jsn = json.loads(br.response().read())

        # self.logger.debug('load %s finish' % url)

        soup = beautiful_soup(jsn["data"]["html"])
        current_page = jsn["data"]["page"]["pagenum"]
        n_pages = jsn["data"]["page"]["totalpage"]

        if not self.check(url, br):
            return

        decodes = urldecode(url)
        mid = decodes.get("id", decodes.get("mid"))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find("dt").find("img")["src"]
            date = dl.find("dd").find(attrs={"class": "S_txt2"}).text
            date = date.strip().strip("(").strip(")")
            instance.created = self.parse_datetime(date)
            for div in dl.find_all("div"):
                div.extract()
            for span in dl.find_all("span"):
                span.extract()
            instance.content = dl.text.strip()

        counter_type = None
        if url.startswith("http://weibo.com/aj/comment"):
            counter_type = "comment"
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                uid = dl.find("a", usercard=True)["usercard"].split("id=", 1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith("http://weibo.com/aj/mblog/info"):
            counter_type = "forward"
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                forward_again_a = dl.find("a", attrs={"action-type": re.compile("^(feed_list|fl)_forward$")})
                uid = urldecode("?%s" % forward_again_a["action-data"])["uid"]
                forward = Forward(uid=uid, mid=dl["mid"])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith("http://weibo.com/aj/like"):
            counter_type = "like"
            lis = soup.find_all("li", uid=True)
            for li in lis:
                like = Like(uid=li["uid"])
                like.avatar = li.find("img")["src"]

                mblog.likes.append(like)

        mblog.save()
        # self.logger.debug('parse %s finish' % url)

        # count the processed forward, comment, or like list page
        if counter_type is not None:
            self.counter.inc("processed_%s_list_page" % counter_type, 1)

        if current_page >= n_pages:
            return

        params = urldecode(url)
        new_params = urldecode("?page=%s" % (current_page + 1))
        params.update(new_params)
        params["__rnd"] = int(time.time() * 1000)
        next_page = "%s?%s" % (url.split("?")[0], urllib.urlencode(params))
        yield next_page
Example #15
0
 def parse(self, url=None):
     if self.bundle.exists is False:
         return [], []
     
     url = url or self.url
     params = urldecode(url)
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     
     if not self.check(url, br):
         return [], []
         
     weibo_user = self.get_weibo_user()
     
     params['_t'] = 0
     params['__rnd'] = str(int(time.time() * 1000))
     page = int(params.get('page', 1))
     pre_page = int(params.get('pre_page', 0))
     count = 15
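     # A feed page is delivered in three lazy-load chunks: pagebar=0,
     # pagebar=1, then the next page proper with count raised to 50.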
     if 'pagebar' not in params:
         params['pagebar'] = '0'
         pre_page += 1
     elif params['pagebar'] == '0':
         params['pagebar'] = '1'
     elif params['pagebar'] == '1':
         del params['pagebar']
         pre_page = page
         page += 1
         count = 50
     params['count'] = count
     params['page'] = page
     params['pre_page'] = pre_page
     
     data = json.loads(br.response().read())['data']
     soup = beautiful_soup(data)
     finished = False
     
     divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
     max_id = None
     next_urls = []
     for div in divs:
         mid = div['mid']
         if len(mid) == 0:
             continue
         max_id = mid
         
         if 'end_id' not in params:
             params['end_id'] = mid
         if mid in weibo_user.newest_mids:
             finished = True
             break
         if len(self.bundle.newest_mids) < 3:
             self.bundle.newest_mids.append(mid)
         
         try:
             mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
         except DoesNotExist:
             mblog = MicroBlog(mid=mid, uid=self.uid)
         content_div = div.find('div', attrs={
             'class': 'WB_text', 
             'node-type': 'feed_list_content'
         })
         for img in content_div.find_all("img", attrs={'type': 'face'}):
             img.replace_with(img['title'])
         mblog.content = content_div.text
         is_forward = div.get('isforward') == '1'
         if is_forward:
             name_a = div.find('a', attrs={
                 'class': 'WB_name', 
                 'node-type': 'feed_list_originNick'
             })
             text_a = div.find('div', attrs={
                 'class': 'WB_text',
                 'node-type': 'feed_list_reason'
             })
             if name_a is not None and text_a is not None:
                 mblog.forward = '%s: %s' % (
                     name_a.text,
                     text_a.text
                 )
         mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
         
         if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
             self.bundle.last_update = mblog.created
         if weibo_user.last_update is not None and \
             mblog.created <= weibo_user.last_update:
             finished = True
             break
         
         likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
         likes = likes.strip('(').strip(')')
         likes = 0 if len(likes) == 0 else int(likes)
         mblog.n_likes = likes
         forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
         if '(' not in forwards:
             mblog.n_forwards = 0
         else:
             mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
         comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
         if '(' not in comments:
             mblog.n_comments = 0
         else:
             mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
             
         # fetch geo info
         map_info = div.find("div", attrs={'class': 'map_data'})
         if map_info is not None:
             geo = Geo()
             geo.location = map_info.text.split('-')[0].strip()
             geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo']
             geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
             mblog.geo = geo
         
         # fetch forwards and comments
         if fetch_forward or fetch_comment or fetch_like:
             query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
             query_str = urllib.urlencode(query)
             if fetch_forward and mblog.n_forwards > 0:
                 forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                 next_urls.append(forward_url)
             if fetch_comment and mblog.n_comments > 0:
                 comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                 next_urls.append(comment_url)
             if fetch_like and mblog.n_likes > 0:
                 query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                 query_str = urllib.urlencode(query)
                 like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                 next_urls.append(like_url)
         
         mblog.save()
     
     if 'pagebar' in params:
         params['max_id'] = max_id
     else:
         del params['max_id']
     self.logger.debug('parse %s finish' % url)
             
     # if there is no next page
     if len(divs) == 0 or finished:
         weibo_user = self.get_weibo_user()
         for mid in self.bundle.newest_mids:
             if mid not in weibo_user.newest_mids:
                 weibo_user.newest_mids.append(mid)
         while len(weibo_user.newest_mids) > 3:
             weibo_user.newest_mids.pop()
         weibo_user.last_update = self.bundle.last_update
         weibo_user.save()
         return [], []
     
     next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
     return next_urls, []
Example #16
0
 def parse(self, url=None):
     if self.bundle.exists is False:
         return [], []
     
     url = url or self.url
     br = None
     jsn = None
     try:
         br = self.opener.browse_open(url)
         self.logger.debug('load %s finish' % url)
         jsn = json.loads(br.response().read())
     except (ValueError, URLError) as e:
         return self._error(url, e)
     
     soup = beautiful_soup(jsn['data']['html'])
     current_page = jsn['data']['page']['pagenum']
     n_pages = jsn['data']['page']['totalpage']
     
     if not self.check(url, br):
         return [], []
     
     decodes = urldecode(url)
     mid = decodes.get('id', decodes.get('mid'))
     
     mblog = self.bundle.current_mblog
     if mblog is None or mblog.mid != mid:
         try:
             mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
         except DoesNotExist:
             mblog = MicroBlog(mid=mid, uid=self.uid)
             mblog.save()
     
     def set_instance(instance, dl):
         instance.avatar = dl.find('dt').find('img')['src']
         date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
         date = date.strip().strip('(').strip(')')
         instance.created = self.parse_datetime(date)
         for div in dl.find_all('div'): div.extract()
         for span in dl.find_all('span'): span.extract()
         instance.content = dl.text.strip()
     
     if url.startswith('http://weibo.com/aj/comment'):
         dls = soup.find_all('dl', mid=True)
         for dl in dls:
             comment = Comment(uid=self.uid)
             set_instance(comment, dl)
             
             mblog.comments.append(comment)
     elif url.startswith('http://weibo.com/aj/mblog/info'):
         dls = soup.find_all('dl', mid=True)
         for dl in dls:
             forward = Forward(uid=self.uid, mid=dl['mid'])
             set_instance(forward, dl)
             
             mblog.forwards.append(forward)
     elif url.startswith('http://weibo.com/aj/like'):
         lis = soup.find_all('li', uid=True)
         for li in lis:
             like = Like(uid=li['uid'])
             like.avatar = li.find('img')['src']
             
             mblog.likes.append(like)
     
     try:
         mblog.save()
         self.logger.debug('parse %s finish' % url)
     except ValidationError as e:
         return self._error(url, e)
Example #17
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return [], []

        url = url or self.url
        br = None
        jsn = None
        try:
            br = self.opener.browse_open(url)
            self.logger.debug("load %s finish" % url)
            jsn = json.loads(br.response().read())
        except (ValueError, URLError) as e:
            return self._error(url, e)

        soup = beautiful_soup(jsn["data"]["html"])
        current_page = jsn["data"]["page"]["pagenum"]
        n_pages = jsn["data"]["page"]["totalpage"]

        if not self.check(url, br):
            return [], []

        decodes = urldecode(url)
        mid = decodes.get("id", decodes.get("mid"))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find("dt").find("img")["src"]
            date = dl.find("dd").find("span", attrs={"class": "S_txt2"}).text
            date = date.strip().strip("(").strip(")")
            instance.created = self.parse_datetime(date)
            for div in dl.find_all("div"):
                div.extract()
            for span in dl.find_all("span"):
                span.extract()
            instance.content = dl.text.strip()

        if url.startswith("http://weibo.com/aj/comment"):
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                comment = Comment(uid=self.uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith("http://weibo.com/aj/mblog/info"):
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                forward = Forward(uid=self.uid, mid=dl["mid"])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith("http://weibo.com/aj/like"):
            lis = soup.find_all("li", uid=True)
            for li in lis:
                like = Like(uid=li["uid"])
                like.avatar = li.find("img")["src"]

                mblog.likes.append(like)

        try:
            mblog.save()
            self.logger.debug("parse %s finish" % url)
        except ValidationError as e:
            return self._error(url, e)
Example #18
0
                    # info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                    for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                        block_title = block_div.find('div', attrs={'class': 'WB_cardtitle_b'}).text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'联系信息':
                            relation_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    header_soup = beautiful_soup(data['html'])
                    links = header_soup.find_all('a')
                    if len(links) == 3:
                        weibo_user.info.n_follows = int(links[0].find('strong').text)
                        weibo_user.info.n_fans = int(links[1].find('strong').text)
                        weibo_user.info.n_weibos = int(links[2].find('strong').text)
                elif domid.startswith('Pl_Official_RightGrowNew__'):
                    right_soup = beautiful_soup(data['html'])
                    level_div = right_soup.find('div', attrs={'class': 'level_box'})
                    if level_div is not None:
                        for info_span in level_div.find_all(attrs={'class': 'info'}):
Example #19
0
 def parse(self, url=None):
     if self.bundle.exists is False:
         return [], []
     
     url = url or self.url
     params = urldecode(url)
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     
     if not self.check(url, br):
         return [], []
         
     weibo_user = self.get_weibo_user()
     
     params['_t'] = 0
     params['__rnd'] = str(int(time.time() * 1000))
     page = int(params.get('page', 1))
     pre_page = int(params.get('pre_page', 0))
     count = 15
     if 'pagebar' not in params:
         params['pagebar'] = '0'
         pre_page += 1
     elif params['pagebar'] == '0':
         params['pagebar'] = '1'
     elif params['pagebar'] == '1':
         del params['pagebar']
         pre_page = page
         page += 1
         count = 50
     params['count'] = count
     params['page'] = page
     params['pre_page'] = pre_page
     
     data = json.loads(br.response().read())['data']
     soup = beautiful_soup(data)
     
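     # Each liked feed entry exposes its mid on a div of class WB_feed_type.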
     divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
     max_id = None
     next_urls = []
     for div in divs:
         mid = div['mid']
         if len(mid) == 0:
             continue
         max_id = mid
         
         if 'end_id' not in params:
             params['end_id'] = mid
         
         weibo_user.likes.append(mid)
         
     weibo_user.save()
         
     if 'pagebar' in params:
         params['max_id'] = max_id
     else:
         del params['max_id']
     self.logger.debug('parse %s finish' % url)
     
     # if there is no next page
     if len(divs) == 0:
         return [], []
     
     next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
     return next_urls, []
Example #20
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return [], []
        
        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        
        if not self.check(url, br):
            return [], []
            
        weibo_user = self.get_weibo_user()
        
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page
        
        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False
        
        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid
            
            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)
            
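            # Load the existing microblog for this mid, or create it lazily.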
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div', attrs={
                'class': 'WB_text', 
                'node-type': 'feed_list_content'
            })

            mblog.content = content_div.text
            
            # Links
            for content_a in content_div.find_all('a', 
                attrs={'action-type': 'feed_list_url'}):
                href = content_a['href']
                if href not in mblog.links:
                    mblog.links.append(href)
                    
            # tags
            tags_div = content_div.find('div', attrs={'class': 'wTablist2'})
            if tags_div is not None:
                for tag_a in tags_div.find_all('a'):
                    tag = tag_a.text.strip()
                    if len(tag) > 0 and tag not in mblog.tags:
                        mblog.tags.append(tag)
                    
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
            
            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                finished = True
                break
            
            likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
            comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
            
            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)
            
            mblog.save()
        
        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            del params['max_id']
        self.logger.debug('parse %s finish' % url)
                
        # if there is no next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []
        
        next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []
Example #21
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return [], []
        
        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())
        
        if not self.check(url, br):
            return [], []
        
        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()
            
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        weibo_ul = None
        rank_div = None
        credit_div = None
        head_pic_div = None
        user_atten_div = None
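        # Only the whitelisted pagelet dom-ids below are parsed; checking the
        # raw script text first avoids JSON-decoding every script on the page.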
        for script in soup.find_all('script'):
            text = script.text
            
            if text.startswith('FM.view') and \
               ("Pl_Official_LeftInfo__17" in text \
                or "Pl_Official_Header__1" in text \
                or "Pl_Official_RightGrow__17" in text \
                or "Pl_Official_LeftInfo__36" in text \
                or "Pl_Official_LeftInfo__41" in text \
                or "Pl_Core_Header__1" in text \
                ):
                text = text.replace('FM.view(', '')[:-1]
                if text.endswith(';'):
                    text = text[:-1]

                data = json.loads(text)
                domid = data['domid']
                if domid == 'Pl_Official_LeftInfo__17' or domid == 'Pl_Official_LeftInfo__36'\
                   or domid == 'Pl_Official_LeftInfo__41':
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid == 'Pl_Official_RightGrow__17':
                    right_soup = beautiful_soup(data['html'])
                    right_div = right_soup.find('div', attrs={'class': 'prm_app_pinfo'})
                    
                    for block_div in right_div.find_all('div', attrs={'class': 'info_block'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'等级信息':
                            rank_div = block_div
                            
                        elif block_title == u'信用信息':
                            credit_div = block_div
                           
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_ul = header_soup.find('ul', attrs={'class': 'user_atten clearfix user_atten_s'})

                elif domid == 'Pl_Core_Header__1':
                    core_header_soup = beautiful_soup(data['html'])
                    head_div = core_header_soup.find('div', attrs={'class': 'pf_head S_bg5 S_line1'})
                    head_pic_div = head_div.find('div',attrs={'class': 'pf_head_pic'})
                    user_atten_div = head_div.find('div',attrs={'class': 'user_atten'})
                   
            elif 'STK' in text:
                text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoGrow':
                    right_soup = beautiful_soup(data['html'])
                    right_div = right_soup.find('div', attrs={'class': 'prm_app_pinfo'})
                    for block_div in right_div.find_all('div', attrs={'class': 'info_block'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'等级信息':
                            rank_div = block_div
                        elif block_title == u'信用信息':
                            credit_div = block_div
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']
                    weibo_ul = soup.find('ul', attrs={'class': 'user_atten clearfix user_atten_m'})
                elif pid == 'pl_leftNav_profilePersonal':
                    if weibo_user.info.avatar is None:
                        soup = beautiful_soup(data['html'])
                        weibo_user.info.avatar = soup.find('div',attrs={'class': 'face_infor'}).find('img')['src']
                        weibo_user.info.nickname = soup.find('div',attrs={'class': 'face_infor'}).find('a',attrs={'class': 'logo_img'})['title']
                elif pid == 'pl_content_litePersonInfo':
                    soup = beautiful_soup(data['html'])
                    weibo_ul = soup.find('ul', attrs={'class': 'user_atten clearfix'})

        profile_map = {
            u'昵称': {'field': 'nickname'},
            u'真实姓名': {'field': 'realname'},
            u'所在地': {'field': 'location'},
            u'性别': {'field': 'sex'},
            u'性取向': {'field': 'sex_dir'},
            u'生日': {'field': 'birth'},
            u'感情状况': {'field': 'love'},
            u'血型': {'field': 'blood_type'},
            u'博客': {'field': 'blog'},
            u'个性域名': {'field': 'site'},
            u'简介': {'field': 'intro'},
            u'邮箱': {'field': 'email'},
            u'QQ': {'field': 'qq'},
            u'MSN': {'field': 'msn'}
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={'class': 'pf_item'}):
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        rank_map = {
            u'当前等级': {'field': 'rank'},
            u'活跃天数': {'field': 'active_day'},
        }
        if rank_div is not None:
            for div in rank_div.find_all(attrs={'class': 'info'}):
                k = div.text.strip()[:4]
                v = div.find(attrs={'class': 'S_txt1 point'}).text.strip('LV')
                if k in rank_map:
                    func = (lambda s: s) \
                            if 'func' not in rank_map[k] \
                            else rank_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, rank_map[k]['field'], v)

        credit_map = {
            u'信用等级': {'field': 'credit_rank'},
            u'当前信用积分': {'field': 'credit'},
        }
        if credit_div is not None:
            for div in credit_div.find_all(attrs={'class': 'info'}):
                if u'信用等级' in div.text.strip():
                    k = div.text.strip()[:4]
                    v = div.find(attrs={'class': 'S_txt1'}).text.strip()
                else:
                    k = div.text.strip()[:6]
                    v = div.find(attrs={'class': 'S_txt1 point'}).text.strip()
                if k in credit_map:
                    func = (lambda s: s) \
                            if 'func' not in credit_map[k] \
                            else credit_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, credit_map[k]['field'], v)

        weibo_map = {
            u'关注': {'field': 'follow_num'},
            u'粉丝': {'field': 'fans_num'},
            u'微博': {'field': 'weibo_num'},
        }
        if weibo_ul is not None:
            for li in weibo_ul.find_all('li'):
                k = li.find('span').text.strip()
                v = li.find('strong').text.strip()
                if k in weibo_map:
                    func = (lambda s: s) \
                            if 'func' not in weibo_map[k] \
                            else weibo_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, weibo_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        if text.startswith(u'地区:'):
                            work_info.location = text.split(u':', 1)[1]
                        elif text.startswith(u'职位:'):
                            work_info.position = text.split(u':', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)
            
        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)
                    
        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)

        if head_pic_div is not None and weibo_user.info.avatar is None:
            weibo_user.info.avatar = head_pic_div.find('img')['src']
            weibo_user.info.nickname = head_pic_div.find('img')['title']
            
        if weibo_ul is None and user_atten_div is not None:
            for td in user_atten_div.find_all('td'):
                k = td.find('span').text.strip()
                v = td.find('strong').text.strip()
                if k in weibo_map:
                    func = (lambda s: s) \
                            if 'func' not in weibo_map[k] \
                            else weibo_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, weibo_map[k]['field'], v)
                
        weibo_user.save()
        self.logger.debug('parse %s finish' % url)
        return [], []
Example #22
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []
        
        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        
        if not self.check(url, br):
            return [], []
            
        weibo_user = self.get_weibo_user()
        
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page
        
        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False
        
        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid
            
            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)
            
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div', attrs={
                'class': 'WB_text', 
                'node-type': 'feed_list_content'
            })
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a', attrs={
                    'class': 'WB_name', 
                    'node-type': 'feed_list_originNick'
                })
                text_a = div.find('div', attrs={
                    'class': 'WB_text',
                    'node-type': 'feed_list_reason'
                })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (
                        name_a.text,
                        text_a.text
                    )
            # Parse the post's creation time, then keep the post only if it
            # falls inside the [start, end] window read from timevalue.txt.
            created = parse(div.select('a.S_link2.WB_time')[0]['title'])
            created_ts = round(time.mktime(created.timetuple()))

            with open("D:\\09Limited_buffer\\earlywarningbyci\\cola"
                      "\\contrib\\weibo\\timevalue.txt", "r") as timevalue:
                start_ts, end_ts = [round(float(v))
                                    for v in timevalue.readline().split()]

            if start_ts <= created_ts <= end_ts:
                mblog.created = created
            elif created_ts < start_ts:
                # the feed is newest-first, so a post older than the window
                # means nothing later on this page can qualify
                time.sleep(5)
                return [], []
            # posts newer than the window fall through without setting
            # mblog.created
            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                finished = True
                break

            func_div = div.find_all('div', 'WB_func')[-1]
            action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)
            
            likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
            comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
                
            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo
            
            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)
            
            mblog.save()
        
        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            del params['max_id']
        self.logger.debug('parse %s finish' % url)
                
        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []
        
        next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []
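
The like/forward/comment counts above come from anchor text like u'转发(12)', which degrades to plain text when the count is zero; newer markup also abbreviates the action-type prefix from 'feed_list_' to 'fl_', hence the regex. A small sketch of both pieces (parse_count is a hypothetical name):

import re

def parse_count(anchor_text):
    # u'转发(12)' -> 12; u'评论' (no parentheses) -> 0
    if '(' not in anchor_text:
        return 0
    return int(anchor_text.strip().split('(', 1)[1].strip(')'))

action_type_re = lambda t: re.compile('^(feed_list|fl)_%s$' % t)

assert parse_count(u'转发(12)') == 12
assert parse_count(u'评论') == 0
assert action_type_re('like').match('fl_like') is not None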
Example #23
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        #         self.logger.debug('load %s finish' % url)

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid

            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a',
                                  attrs={
                                      'class': 'WB_name',
                                      'node-type': 'feed_list_originNick'
                                  })
                text_a = div.find('div',
                                  attrs={
                                      'class': 'WB_text',
                                      'node-type': 'feed_list_reason'
                                  })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (name_a.text, text_a.text)
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                finished = True
                break

            func_div = div.find_all('div', 'WB_func')[-1]
            action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

            likes = func_div.find('a',
                                  attrs={
                                      'action-type': action_type_re("like")
                                  }).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re("forward")
                                     }).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(',
                                                              1)[1].strip(')'))
            comments = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re('comment')
                                     }).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(',
                                                              1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" +
                                     map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple(
                    [float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    yield forward_url
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    yield comment_url
                if fetch_like and mblog.n_likes > 0:
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    yield like_url

            mblog.save()

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            del params['max_id']


        # self.logger.debug('parse %s finish' % url)

        # counter add one for the processed weibo list url
        self.counter.inc('processed_weibo_list_page', 1)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return

        yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
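
The three-way pagebar branching above is easier to follow as a standalone trace: each feed page is fetched in two lazy-load segments (pagebar '0', then '1') before jumping to the next full page with count=50. A sketch assuming params starts empty:

def advance(params):
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    else:  # pagebar == '1': move on to the next full page
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params['count'], params['page'], params['pre_page'] = count, page, pre_page

params = {}
for _ in range(3):
    advance(params)
    # 1st call: pagebar '0'; 2nd: pagebar '1'; 3rd: page 2, count 50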
Example #24
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        #         self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find(
                            'h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
        #         self.logger.debug('parse %s finish' % url)

        # counter add one for the profile url
        self.counter.inc('processed_profile_page', 1)
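
The new-style work and education blocks above carry their dates inline as parenthesised spans, e.g. u'XX大学 (2008 - 2012) 计算机系', which the parser digs out by splitting on the parentheses and scrubbing whitespace escapes. A minimal sketch (extract_date is a hypothetical name):

def extract_date(text):
    # u'XX University (2008 - 2012) CS' -> u'2008 - 2012'
    if '(' not in text:
        return None
    return text.strip().split('(')[1] \
               .replace('\r', '').replace('\n', '') \
               .replace('\t', '').split(')', 1)[0]

assert extract_date(u'XX University (2008 - 2012) CS') == u'2008 - 2012'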
Example #25
 def parse(self, url=None):
     if self.bundle.exists == False:
         return [], []
     
     url = url or self.url
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     soup = beautiful_soup(br.response().read())
     
     if not self.check(url, br):
         return [], []
     
     weibo_user = self.get_weibo_user()
     info = weibo_user.info
     if info is None:
         weibo_user.info = UserInfo()
         
     profile_div = None
     career_div = None
     edu_div = None
     tags_div = None
     for script in soup.find_all('script'):
         text = script.text
         if text.startswith('FM.view'):
             text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
             data = json.loads(text)
             domid = data['domid']
             if domid == 'Pl_Official_LeftInfo__13':
                 info_soup = beautiful_soup(data['html'])
                 info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                 for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                     block_title = block_div.find('form').text.strip()
                     if block_title == u'基本信息':
                         profile_div = block_div
                     elif block_title == u'工作信息':
                         career_div = block_div
                     elif block_title == u'教育信息':
                         edu_div = block_div
                     elif block_title == u'标签信息':
                         tags_div = block_div
             elif domid == 'Pl_Official_Header__1':
                 header_soup = beautiful_soup(data['html'])
                 weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                             .find('img')['src']
         elif 'STK' in text:
             text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
             data = json.loads(text)
             pid = data['pid']
             if pid == 'pl_profile_infoBase':
                 profile_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_infoCareer':
                 career_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_infoEdu':
                 edu_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_infoTag':
                 tags_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_photo':
                 soup = beautiful_soup(data['html'])
                 weibo_user.info.avatar = soup.find('img')['src']
     
     profile_map = {
         u'昵称': {'field': 'nickname'},
         u'所在地': {'field': 'location'},
         u'性别': {'field': 'sex', 
                 'func': lambda s: True if s == u'男' else False},
         u'生日': {'field': 'birth'},
         u'博客': {'field': 'blog'},
         u'个性域名': {'field': 'site'},
         u'简介': {'field': 'intro'},
         u'邮箱': {'field': 'email'},
         u'QQ': {'field': 'qq'},
         u'MSN': {'field': 'msn'}
     }
     if profile_div is not None:
         for div in profile_div.find_all(attrs={'class': 'pf_item'}):
             k = div.find(attrs={'class': 'label'}).text.strip()
             v = div.find(attrs={'class': 'con'}).text.strip()
             if k in profile_map:
                 if k == u'个性域名' and '|' in v:
                     v = v.split('|')[1].strip()
                 func = (lambda s: s) \
                         if 'func' not in profile_map[k] \
                         else profile_map[k]['func']
                 v = func(v)
                 setattr(weibo_user.info, profile_map[k]['field'], v)
             
     weibo_user.info.work = []
     if career_div is not None:
         for div in career_div.find_all(attrs={'class': 'con'}):
             work_info = WorkInfo()
             ps = div.find_all('p')
             for p in ps:
                 a = p.find('a')
                 if a is not None:
                     work_info.name = a.text
                     text = p.text
                     if '(' in text:
                         work_info.date = text.strip().split('(')[1].strip(')')
                 else:
                     text = p.text
                     if text.startswith(u'地区:'):
                         work_info.location = text.split(u':', 1)[1]
                     elif text.startswith(u'职位:'):
                         work_info.position = text.split(u':', 1)[1]
                     else:
                         work_info.detail = text
             weibo_user.info.work.append(work_info)
         
     weibo_user.info.edu = []
     if edu_div is not None:
         for div in edu_div.find_all(attrs={'class': 'con'}):
             edu_info = EduInfo()
             ps = div.find_all('p')
             for p in ps:
                 a = p.find('a')
                 text = p.text
                 if a is not None:
                     edu_info.name = a.text
                     if '(' in text:
                         edu_info.date = text.strip().split('(')[1].strip(')')
                 else:
                     edu_info.detail = text
             weibo_user.info.edu.append(edu_info)
                 
     weibo_user.info.tags = []
     if tags_div is not None:
         for div in tags_div.find_all(attrs={'class': 'con'}):
             for a in div.find_all('a'):
                 weibo_user.info.tags.append(a.text)
             
     weibo_user.save()
     self.logger.debug('parse %s finish' % url)
     return [], []
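
A pattern worth isolating from the profile parsers above: each map entry names a target model field plus an optional 'func' converter, so one loop can fan label/value pairs out onto the user object. A standalone sketch (Info stands in for the real UserInfo model):

class Info(object):
    pass

profile_map = {
    u'昵称': {'field': 'nickname'},
    u'性别': {'field': 'sex', 'func': lambda s: s == u'男'},
}

info = Info()
for k, v in [(u'昵称', u'Alice'), (u'性别', u'男')]:
    entry = profile_map.get(k)
    if entry is not None:
        # fall back to the identity function when no converter is given
        func = entry.get('func', lambda s: s)
        setattr(info, entry['field'], func(v))

assert info.nickname == u'Alice' and info.sex is True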
Example #26
 def get_body(self, soup):
     for elem in soup.find_all(['script', 'link', 'style']):
         elem.extract()
     raw_html = unicode(soup.body or soup)
     cleaned = self._clean_attributes(raw_html)
     return beautiful_soup(cleaned)
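
For get_body above, a minimal round-trip with BeautifulSoup directly, assuming beautiful_soup wraps BeautifulSoup construction and _clean_attributes strips unwanted attributes:

from bs4 import BeautifulSoup

html = ('<html><head><style>p {}</style></head>'
        '<body><p>hi</p></body></html>')
soup = BeautifulSoup(html, 'html.parser')
for elem in soup.find_all(['script', 'link', 'style']):
    elem.extract()  # drop non-content elements in place
assert 'style' not in str(soup.body or soup)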
Example #27
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug("load %s finish" % url)

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()

        params["_t"] = 0
        params["__rnd"] = str(int(time.time() * 1000))
        page = int(params.get("page", 1))
        pre_page = int(params.get("pre_page", 0))
        count = 15
        if "pagebar" not in params:
            params["pagebar"] = "0"
            pre_page += 1
        elif params["pagebar"] == "0":
            params["pagebar"] = "1"
        elif params["pagebar"] == "1":
            del params["pagebar"]
            pre_page = page
            page += 1
            count = 50
        params["count"] = count
        params["page"] = page
        params["pre_page"] = pre_page

        data = json.loads(br.response().read())["data"]
        soup = beautiful_soup(data)
        finished = False

        divs = soup.find_all("div", attrs={"class": "WB_feed_type"}, mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div["mid"]
            if len(mid) == 0:
                continue
            max_id = mid

            if "end_id" not in params:
                params["end_id"] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_content"})
            for img in content_div.find_all("img", attrs={"type": "face"}):
                img.replace_with(img["title"])
            mblog.content = content_div.text
            is_forward = div.get("isforward") == "1"
            if is_forward:
                name_a = div.find("a", attrs={"class": "WB_name", "node-type": "feed_list_originNick"})
                text_a = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_reason"})
                if name_a is not None and text_a is not None:
                    mblog.forward = "%s: %s" % (name_a.text, text_a.text)
            mblog.created = parse(div.select("a.S_link2.WB_time")[0]["title"])

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and mblog.created <= weibo_user.last_update:
                finished = True
                break

            likes = div.find("a", attrs={"action-type": "feed_list_like"}).text
            likes = likes.strip("(").strip(")")
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = div.find("a", attrs={"action-type": "feed_list_forward"}).text
            if "(" not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split("(", 1)[1].strip(")"))
            comments = div.find("a", attrs={"action-type": "feed_list_comment"}).text
            if "(" not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split("(", 1)[1].strip(")"))

            # fetch geo info
            map_info = div.find("div", attrs={"class": "map_data"})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split("-")[0].strip()
                geo_info = urldecode("?" + map_info.find("a")["action-data"])["geo"]
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(",", 1)])
                mblog.geo = geo

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {"id": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = "http://weibo.com/aj/mblog/info/big?%s" % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = "http://weibo.com/aj/comment/big?%s" % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {"mid": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                    query_str = urllib.urlencode(query)
                    like_url = "http://weibo.com/aj/like/big?%s" % query_str
                    next_urls.append(like_url)

            mblog.save()

        if "pagebar" in params:
            params["max_id"] = max_id
        else:
            del params["max_id"]
        self.logger.debug("parse %s finish" % url)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []

        next_urls.append("%s?%s" % (url.split("?")[0], urllib.urlencode(params)))
        return next_urls, []
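
The geo block above relies on the map link's 'action-data' attribute, a query string whose 'geo' value is 'longitude,latitude'. A hedged sketch of the decoding, assuming urldecode behaves like parse_qsl flattened into a dict:

try:
    from urlparse import parse_qsl        # Python 2
except ImportError:
    from urllib.parse import parse_qsl    # Python 3

def urldecode_demo(qs):
    return dict(parse_qsl(qs.lstrip('?')))

action_data = '?geo=116.397,39.916&titleValue=Beijing'
geo_info = urldecode_demo(action_data)['geo']
longitude, latitude = [float(itm) for itm in geo_info.split(',', 1)]
assert (longitude, latitude) == (116.397, 39.916)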
Example #28
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        #         self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all("script"):
            text = script.text
            if text.startswith("FM.view"):
                text = text.strip().replace(";", "").replace("FM.view(", "")[:-1]
                data = json.loads(text)
                domid = data["domid"]
                if domid.startswith("Pl_Official_LeftInfo__"):
                    info_soup = beautiful_soup(data["html"])
                    info_div = info_soup.find("div", attrs={"class": "profile_pinfo"})
                    for block_div in info_div.find_all("div", attrs={"class": "infoblock"}):
                        block_title = block_div.find("form").text.strip()
                        if block_title == u"基本信息":
                            profile_div = block_div
                        elif block_title == u"工作信息":
                            career_div = block_div
                        elif block_title == u"教育信息":
                            edu_div = block_div
                        elif block_title == u"标签信息":
                            tags_div = block_div
                elif domid.startswith("Pl_Official_PersonalInfo__"):
                    new_style = True
                    info_soup = beautiful_soup(data["html"])
                    for block_div in info_soup.find_all("div", attrs={"class": "WB_cardwrap"}):
                        block_title_div = block_div.find("h4", attrs={"class": "obj_name"})
                        if block_title_div is None:
                            block_title_div = block_div.find("div", attrs={"class": "obj_name"}).find("h2")
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find("div", attrs={"class": "WB_innerwrap"})
                        if block_title == u"基本信息":
                            profile_div = inner_div
                        elif block_title == u"工作信息":
                            career_div = inner_div
                        elif block_title == u"教育信息":
                            edu_div = inner_div
                        elif block_title == u"标签信息":
                            tags_div = inner_div
                elif domid == "Pl_Official_Header__1":
                    header_soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = header_soup.find("div", attrs={"class": "pf_head_pic"}).find("img")["src"]
                    weibo_user.info.n_follows = int(
                        header_soup.find("ul", attrs={"class": "user_atten"})
                        .find("strong", attrs={"node-type": "follow"})
                        .text
                    )
                    weibo_user.info.n_fans = int(
                        header_soup.find("ul", attrs={"class": "user_atten"})
                        .find("strong", attrs={"node-type": "fans"})
                        .text
                    )
                elif domid.startswith("Pl_Core_T8CustomTriColumn__"):
                    # new style friends info
                    header_soup = beautiful_soup(data["html"])
                    tds = header_soup.find("table", attrs={"class": "tb_counter"}).find_all("td")
                    weibo_user.info.n_follows = int(tds[0].find("strong").text)
                    weibo_user.info.n_fans = int(tds[1].find("strong").text)
                elif domid.startswith("Pl_Official_Headerv6__"):
                    # new style avatar info
                    header_soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = header_soup.find("p", attrs="photo_wrap").find("img")["src"]
            elif "STK" in text:
                text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1]
                data = json.loads(text)
                pid = data["pid"]
                if pid == "pl_profile_infoBase":
                    profile_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoCareer":
                    career_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoEdu":
                    edu_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoTag":
                    tags_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_photo":
                    soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = soup.find("img")["src"]

        profile_map = {
            u"昵称": {"field": "nickname"},
            u"所在地": {"field": "location"},
            u"性别": {"field": "sex", "func": lambda s: True if s == u"男" else False},
            u"生日": {"field": "birth"},
            u"博客": {"field": "blog"},
            u"个性域名": {"field": "site"},
            u"简介": {"field": "intro"},
            u"邮箱": {"field": "email"},
            u"QQ": {"field": "qq"},
            u"MSN": {"field": "msn"},
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={"class": "pf_item"})
            else:
                divs = profile_div.find_all("li", attrs={"class": "li_1"})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={"class": "label"}).text.strip()
                    v = div.find(attrs={"class": "con"}).text.strip()
                else:
                    k = div.find("span", attrs={"class": "pt_title"}).text.strip().strip(u":")
                    d = div.find("span", attrs={"class": "pt_detail"})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find("a").text.strip()
                if k in profile_map:
                    if k == u"个性域名" and "|" in v:
                        v = v.split("|")[1].strip()
                    func = profile_map[k].get("func", lambda s: s)
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]["field"], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={"class": "con"}):
                    work_info = WorkInfo()
                    ps = div.find_all("p")
                    for p in ps:
                        a = p.find("a")
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if "(" in text:
                                work_info.date = text.strip().split("(")[1].strip(")")
                        else:
                            text = p.text
                            if text.startswith(u"地区:"):
                                work_info.location = text.split(u":", 1)[1]
                            elif text.startswith(u"职位:"):
                                work_info.position = text.split(u":", 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find("li", attrs={"class": "li_1"})
                for span in li.find_all("span", attrs={"class": "pt_detail"}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find("a")
                    if a is not None:
                        work_info.name = a.text
                    if "(" in text:
                        work_info.date = (
                            text.strip()
                            .split("(")[1]
                            .replace("\r", "")
                            .replace("\n", "")
                            .replace("\t", "")
                            .split(")", 1)[0]
                        )

                    for line in text.split("\r\n"):
                        line = line.strip()
                        if len(line) == 0:
                            continue
                        if line.startswith(u"地区:"):
                            work_info.location = line.split(u":", 1)[1]
                        elif line.startswith(u"职位:"):
                            work_info.position = line.split(u":", 1)[1]
                        else:
                            work_info.detail = text.replace("\r", "").replace("\n", "").replace("\t", "").strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={"class": "con"}):
                    edu_info = EduInfo()
                    ps = div.find_all("p")
                    for p in ps:
                        a = p.find("a")
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if "(" in text:
                                edu_info.date = text.strip().split("(")[1].strip().strip(")")
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find("li", attrs={"class": "li_1"}).find("span", attrs={"class": "pt_detail"})
                text = span.text
                names = []
                for a in span.find_all("a"):
                    names.append(a.text)

                for idx, name in enumerate(names):
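                    # slice the flattened span text between this school name
                    # and the next to isolate this entry's own date/detail part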
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if "(" in t:
                        edu_info.date = (
                            t.strip()
                            .split("(")[1]
                            .replace("\r", "")
                            .replace("\n", "")
                            .replace("\t", "")
                            .split(")", 1)[0]
                        )
                        t = t[t.find(")") + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace("\r", "").replace("\n", "").replace("\t", "").strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={"class": "con"}):
                    for a in div.find_all("a"):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find("span", attrs={"class": "pt_detail"}).find_all("a"):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
        #         self.logger.debug('parse %s finish' % url)

        # bump the counter for the processed profile page
        self.counter.inc("processed_profile_page", 1)
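
These parsers all repeat the same unwrapping step: weibo.com delivers each pagelet as an FM.view({...}); call inside a <script> tag, and the JSON argument carries a "domid" naming the pagelet plus an "html" fragment to feed back into BeautifulSoup. A minimal standalone sketch of that step (the helper name extract_fm_view_payload is invented here for illustration):

import json

def extract_fm_view_payload(script_text):
    # peel off 'FM.view(' and the trailing ');' to leave bare JSON,
    # mirroring the strip/replace sequence the parsers above use
    text = script_text.strip().replace(';', '').replace('FM.view(', '')[:-1]
    data = json.loads(text)
    if 'domid' not in data or 'html' not in data:
        return None, None
    return data['domid'], data['html']
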
Example #29
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return [], []

        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug("load %s finish" % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all("script"):
            text = script.text
            if "FM.view" in text:
                text = text.replace("FM.view(", "")[:-1]
                data = json.loads(text)
                domid = data["domid"]
                if domid == "Pl_Official_LeftInfo__13":
                    info_soup = beautiful_soup(data["html"])
                    info_div = info_soup.find("div", attrs={"class": "profile_pinfo"})
                    for block_div in info_div.find_all("div", attrs={"class": "infoblock"}):
                        block_title = block_div.find("form").text.strip()
                        if block_title == u"基本信息":
                            profile_div = block_div
                        elif block_title == u"工作信息":
                            career_div = block_div
                        elif block_title == u"教育信息":
                            edu_div = block_div
                        elif block_title == u"标签信息":
                            tags_div = block_div
                elif domid == "Pl_Official_Header__1":
                    header_soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = header_soup.find("div", attrs={"class": "pf_head_pic"}).find("img")["src"]
            elif "STK" in text:
                text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1]
                data = json.loads(text)
                pid = data["pid"]
                if pid == "pl_profile_infoBase":
                    profile_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoCareer":
                    career_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoEdu":
                    edu_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoTag":
                    tags_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_photo":
                    soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = soup.find("img")["src"]

        profile_map = {
            u"昵称": {"field": "nickname"},
            u"所在地": {"field": "location"},
            u"性别": {"field": "sex", "func": lambda s: s == u"男"},
            u"生日": {"field": "birth"},
            u"博客": {"field": "blog"},
            u"个性域名": {"field": "site"},
            u"简介": {"field": "intro"},
            u"邮箱": {"field": "email"},
            u"QQ": {"field": "qq"},
            u"MSN": {"field": "msn"},
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={"class": "pf_item"}):
                k = div.find(attrs={"class": "label"}).text.strip()
                v = div.find(attrs={"class": "con"}).text.strip()
                if k in profile_map:
                    if k == u"个性域名" and "|" in v:
                        v = v.split("|")[1].strip()
                    func = profile_map[k].get("func", lambda s: s)
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]["field"], v)

        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={"class": "con"}):
                work_info = WorkInfo()
                ps = div.find_all("p")
                for p in ps:
                    a = p.find("a")
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if "(" in text:
                            work_info.date = text.strip().split("(")[1].strip(")")
                    else:
                        text = p.text
                        if text.startswith(u"地区:"):
                            work_info.location = text.split(u":", 1)[1]
                        elif text.startswith(u"职位:"):
                            work_info.position = text.split(u":", 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={"class": "con"}):
                edu_info = EduInfo()
                ps = div.find_all("p")
                for p in ps:
                    a = p.find("a")
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if "(" in text:
                            edu_info.date = text.strip().split("(")[1].strip(")")
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={"class": "con"}):
                for a in div.find_all("a"):
                    weibo_user.info.tags.append(a.text)

        weibo_user.save()
        self.logger.debug("parse %s finish" % url)
        return [], []
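
The profile_map dict used throughout these examples is a small dispatch table: each Chinese label scraped from the profile page maps onto a UserInfo attribute, with an optional "func" converter for values that need more than a straight copy (gender becomes a boolean, for instance). A stripped-down sketch of the pattern, using a bare object in place of UserInfo and a made-up apply_profile_field helper:

class Info(object):
    pass

profile_map = {
    u'昵称': {'field': 'nickname'},
    u'性别': {'field': 'sex', 'func': lambda s: s == u'男'},
}

def apply_profile_field(info, label, value):
    # unknown labels are ignored; known ones pass through the optional converter
    if label in profile_map:
        entry = profile_map[label]
        setattr(info, entry['field'], entry.get('func', lambda s: s)(value))

info = Info()
apply_profile_field(info, u'性别', u'男')
assert info.sex is True
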
Example #30
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        jsn = json.loads(br.response().read())

        #         self.logger.debug('load %s finish' % url)

        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']

        if not self.check(url, br):
            return

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = MicroBlog.objects.get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
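            # shared extraction for Comment/Forward entries: avatar from the
            # <dt> image, created time from the S_txt2 span, and whatever text
            # survives stripping the inner div/span chrome becomes the content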
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        counter_type = None
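        # record which ajax list this url serves: comment, forward or like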
        if url.startswith('http://weibo.com/aj/comment'):
            counter_type = 'comment'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                uid = dl.find('a', usercard=True)['usercard'].split("id=",
                                                                    1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            counter_type = 'forward'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward_again_a = dl.find(
                    'a',
                    attrs={
                        'action-type': re.compile("^(feed_list|fl)_forward$")
                    })
                uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
                forward = Forward(uid=uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            counter_type = 'like'
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        mblog.save()
        #       self.logger.debug('parse %s finish' % url)

        # bump the counter for the processed forward/comment/like list page
        if counter_type is not None:
            self.counter.inc('processed_%s_list_page' % counter_type, 1)

        if current_page >= n_pages:
            return

        params = urldecode(url)
        new_params = urldecode('?page=%s' % (current_page + 1))
        params.update(new_params)
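        # __rnd is a millisecond timestamp, presumably a cache buster for the ajax call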
        params['__rnd'] = int(time.time() * 1000)
        next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
        yield next_page
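
Note that this last parse() is a generator: rather than fetching page current_page + 1 itself, it builds the url and yields it, leaving the fetch to whatever drives the parser (cola's scheduler, in these examples, whose internals are not shown). A toy driver illustrating that contract:

def crawl(parser, start_url):
    # feed every url a parse() call yields back into the queue
    # until no page produces a successor
    pending = [start_url]
    while pending:
        url = pending.pop(0)
        for next_url in parser.parse(url) or []:
            pending.append(next_url)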