Ejemplo n.º 1
0
 def get_weibo(self, mid, keyword):
     """Return the MicroBlog stored for ``(mid, keyword)``, creating it if absent.

     :returns: a ``(weibo, existed)`` pair; ``existed`` is True when the
         document was found and False when it was just created and saved.
     """
     lookup = Q(mid=mid) & Q(keyword=keyword)
     try:
         existing = MicroBlog.objects.get(lookup)
     except DoesNotExist:
         fresh = MicroBlog(mid=mid, keyword=keyword)
         fresh.save()
         return fresh, False
     return existing, True
Ejemplo n.º 2
0
    def parse(self, url=None):
        """Parse one weibo list page, store unseen posts and yield follow-up URLs.

        The crawl for this user is considered finished as soon as a post is
        met whose ``mid`` is in ``weibo_user.newest_mids`` or whose creation
        time is not newer than ``weibo_user.last_update``. For every unseen
        post the module-level ``fetch_forward`` / ``fetch_comment`` /
        ``fetch_like`` switches decide which detail URLs are yielded.

        :param url: list page URL; defaults to ``self.url``.
        :returns: generator of URL strings (detail pages first, then the
            next list page unless the crawl is finished).
        :raises Exception: whatever ``browse_open`` or the JSON decoding
            raised, re-raised after the 10 minute back-off.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        params = urldecode(url)
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)
            # There is no response to work with; falling through would only
            # fail later with a NameError on ``br`` and hide the real cause.
            raise

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        # Compute the query of the *next* request: each page is loaded in
        # three steps (no pagebar -> pagebar=0 -> pagebar=1) before the page
        # number itself advances.
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        try:
            data = json.loads(br.response().read())['data']
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)
            # Same reasoning as above: ``data`` would be unbound below.
            raise
        soup = beautiful_soup(data)
        finished = False

        # Builds the pattern matching both the "feed_list_*" and "fl_*"
        # spellings of an action-type attribute.
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid

            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                MicroBlog.objects.get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            else:
                # Already stored: treat the post as crawled and skip it.
                continue

            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            # Replace emoticon images by their textual title.
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text

            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a',
                                  attrs={
                                      'class': 'WB_name',
                                      'node-type': 'feed_list_originNick'
                                  })
                text_a = div.find('div',
                                  attrs={
                                      'class': 'WB_text',
                                      'node-type': 'feed_list_reason'
                                  })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (name_a.text, text_a.text)
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                    mblog.created <= weibo_user.last_update:
                finished = True
                break

            # The last WB_func block carries the like/forward/comment counters.
            func_div = div.find_all('div', 'WB_func')[-1]

            likes = func_div.find('a',
                                  attrs={
                                      'action-type': action_type_re("like")
                                  }).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re("forward")
                                     }).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(
                    forwards.strip().split('(', 1)[1].strip(')'))
            comments = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re('comment')
                                     }).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(
                    comments.strip().split('(', 1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" +
                                     map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple(
                    [float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    yield forward_url
                # Only posts with more comments than the configured
                # threshold get their comment list fetched.
                if fetch_comment and mblog.n_comments > fetch_n_comments:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    yield comment_url
                if fetch_like and mblog.n_likes > 0:
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    yield like_url

            mblog.save()

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            # The incoming URL may not carry max_id at all; a plain ``del``
            # would raise KeyError in that case.
            params.pop('max_id', None)

        # counter add one for the processed weibo list url
        self.counter.inc('processed_weibo_list_page', 1)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return

        yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
Ejemplo n.º 3
0
    def parse(self, url=None):
        """Parse one page of a post's comment, forward or like list.

        The kind of list is decided by the URL prefix. Parsed entries are
        appended to the matching list of the MicroBlog identified by the
        ``id`` (or ``mid``) query parameter, which is then saved.

        :param url: list page URL; defaults to ``self.url``.
        :returns: generator yielding the URL of the next page, if any.
        :raises FetchBannedError: when the response is not valid JSON or
            lacks the expected ``data`` structure.
        :raises Exception: whatever ``browse_open`` raised, re-raised after
            the 10 minute back-off.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)
            # Without a response ``br`` is unbound; re-raise the real error
            # instead of failing with a NameError on the next statement.
            raise
        try:
            jsn = json.loads(br.response().read())
        except ValueError:
            print('休息10分钟!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')

        try:
            soup = beautiful_soup(jsn['data']['html'])
            current_page = jsn['data']['page']['pagenum']
            n_pages = jsn['data']['page']['totalpage']
        except KeyError:
            print('休息10分钟!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')

        if not self.check(url, br):
            return

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        # Reuse the bundle's current post when it is the one being parsed,
        # otherwise load it (or create and store it when unknown).
        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            """Fill avatar, creation time and text of ``instance`` from ``dl``."""
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            # Strip nested div/span elements so only the remaining text of
            # the entry is kept as content.
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        counter_type = None
        if url.startswith('http://weibo.com/aj/comment'):
            counter_type = 'comment'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                # The commenter's uid is embedded in the usercard attribute.
                uid = dl.find('a', usercard=True)['usercard'].split("id=",
                                                                    1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            counter_type = 'forward'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward_again_a = dl.find(
                    'a',
                    attrs={
                        'action-type': re.compile("^(feed_list|fl)_forward$")
                    })
                uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
                forward = Forward(uid=uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            counter_type = 'like'
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        mblog.save()

        # counter add one for the processed forward or comment or like list url
        if counter_type is not None:
            self.counter.inc('processed_%s_list_page' % counter_type, 1)

        if current_page >= n_pages:
            return

        # Build the next page URL from the current query with a bumped page
        # number and a fresh cache-busting timestamp.
        params = urldecode(url)
        new_params = urldecode('?page=%s' % (current_page + 1))
        params.update(new_params)
        params['__rnd'] = int(time.time() * 1000)
        next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
        yield next_page
Ejemplo n.º 4
0
    def parse(self, url=None):
        """Parse one page of a post's comment, forward or like list.

        The kind of list is decided by the URL prefix. Parsed entries are
        appended to the matching list of the MicroBlog identified by the
        ``id`` (or ``mid``) query parameter, which is then saved.

        :param url: list page URL; defaults to ``self.url``.
        :returns: ``([], [])`` on every normal exit, or the result of
            ``self._error(url, e)`` when loading or saving fails.
        """
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
            self.logger.debug('load %s finish' % url)
            jsn = json.loads(br.response().read())
        except (ValueError, URLError) as e:
            return self._error(url, e)

        soup = beautiful_soup(jsn['data']['html'])
        # NOTE(review): the paging fields are read but not used in this
        # block; the lookups are kept because they still fail loudly on a
        # response without paging data.
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']

        if not self.check(url, br):
            return [], []

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        # Reuse the bundle's current post when it is the one being parsed,
        # otherwise load it (or create and store it when unknown).
        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            """Fill avatar, creation time and text of ``instance`` from ``dl``."""
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            # Strip nested div/span elements so only the remaining text of
            # the entry is kept as content.
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        if url.startswith('http://weibo.com/aj/comment'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                comment = Comment(uid=self.uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward = Forward(uid=self.uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        try:
            mblog.save()
            self.logger.debug('parse %s finish' % url)
        except ValidationError as e:
            return self._error(url, e)

        # Keep the return shape consistent with the other exits instead of
        # falling off the end with an implicit None.
        return [], []
Ejemplo n.º 5
0
    def parse(self, url=None):
        """Parse one weibo list page, keeping posts inside a time window.

        The window (start and end, as epoch seconds) is read from the first
        line of a ``timevalue.txt`` file for every post. A post older than
        the window aborts the whole page; a post inside it gets its
        ``created`` time set.

        :param url: list page URL; defaults to ``self.url``.
        :returns: ``(next_urls, [])`` where ``next_urls`` holds the detail
            URLs selected by the module-level ``fetch_forward`` /
            ``fetch_comment`` / ``fetch_like`` switches plus the next list
            page, or ``([], [])`` when nothing more is to be fetched.
        """
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()

        # Compute the query of the *next* request: each page is loaded in
        # three steps (no pagebar -> pagebar=0 -> pagebar=1) before the page
        # number itself advances.
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False

        # Builds the pattern matching both the "feed_list_*" and "fl_*"
        # spellings of an action-type attribute.
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid

            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div', attrs={
                'class': 'WB_text',
                'node-type': 'feed_list_content'
            })
            # Replace emoticon images by their textual title.
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a', attrs={
                    'class': 'WB_name',
                    'node-type': 'feed_list_originNick'
                })
                text_a = div.find('div', attrs={
                    'class': 'WB_text',
                    'node-type': 'feed_list_reason'
                })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (
                        name_a.text,
                        text_a.text
                    )

            created = parse(div.select('a.S_link2.WB_time')[0]['title'])
            # Post time as local-time epoch seconds, at whole-second
            # precision and ignoring any tzinfo on the parsed value.
            temptime = time.mktime(
                created.replace(microsecond=0, tzinfo=None).timetuple())
            print(temptime)

            # NOTE(review): hard-coded, machine-specific path that is
            # re-read for every post; consider making it configurable.
            with open("D:\\09Limited_buffer\\earlywarningbyci\\cola\\contrib\\weibo\\timevalue.txt", "r") as window_file:
                window_fields = window_file.readline().split()
            starttime = window_fields[0]
            endtime = window_fields[1]
            print(starttime)
            temptime = round(float(temptime))
            starttime = round(float(starttime))
            endtime = round(float(endtime))
            if starttime <= temptime <= endtime:
                mblog.created = created
                print("------OKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOK-----")
            elif temptime < starttime:
                # Older than the window: give up on this page entirely,
                # dropping any URLs collected so far.
                print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                time.sleep(5)
                return [], []
            # NOTE(review): a post newer than the window falls through with
            # ``mblog.created`` left as it was - confirm this is intended.

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                    mblog.created <= weibo_user.last_update:
                finished = True
                break

            # The last WB_func block carries the like/forward/comment counters.
            func_div = div.find_all('div', 'WB_func')[-1]

            likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
            comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" + map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {'mid': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)

            mblog.save()

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            # The incoming URL may not carry max_id at all; a plain ``del``
            # would raise KeyError in that case.
            params.pop('max_id', None)
        self.logger.debug('parse %s finish' % url)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []

        next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []
Ejemplo n.º 6
0
    def parse(self, url=None):
        """Parse one weibo list page, store its posts and yield follow-up URLs.

        Posts are processed until one is found that was created before the
        module-level ``effective_start_date``; that ends the crawl for this
        user. Detail URLs (forwards, comments, likes) are only yielded when
        ``self.uid`` is in the module-level ``starts`` collection and the
        matching ``fetch_*`` switch is on.

        :param url: list page URL; defaults to ``self.url``.
        :returns: generator of URL strings (detail pages first, then the
            next list page unless the crawl is finished).
        :raises FetchBannedError: when the page cannot be opened or the
            response is not the expected JSON.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        params = urldecode(url)
        try:
            br = self.opener.browse_open(url)
        except URLError:
            raise FetchBannedError()

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        # Compute the query of the *next* request: each page is loaded in
        # three steps (no pagebar -> pagebar=0 -> pagebar=1) before the page
        # number itself advances.
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        try:
            data = json.loads(br.response().read())['data']
        except (ValueError, KeyError):
            raise FetchBannedError('fetch banned by weibo server')
        soup = beautiful_soup(data)
        finished = False

        # Builds the pattern matching both the "feed_list_*" and "fl_*"
        # spellings of an action-type attribute.
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid
            blog_create_date = parse(
                div.select('a.S_link2.WB_time')[0]['title'])
            # skip all following blogs if create date less than effective start date
            if (blog_create_date - effective_start_date).days < 0:
                self.logger.info(
                    "%s: blog has sync up after %s" %
                    (self.uid, effective_start_date.strftime("%Y%m%d")))
                finished = True
                break

            if 'end_id' not in params:
                params['end_id'] = mid
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            # Replace emoticon images by their textual title.
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text
            is_forward = div.get('isforward')
            if is_forward:
                # record the original post id, original user id and message
                mblog.omid = div['omid']
                tbinfos = div['tbinfo'].split('&')
                mblog.ouid = tbinfos[0].split('=')[1]
                name_a = div.find('a',
                                  attrs={
                                      'class': 'WB_name',
                                      'node-type': 'feed_list_originNick'
                                  })
                text_a = div.find('div',
                                  attrs={
                                      'class': 'WB_text',
                                      'node-type': 'feed_list_reason'
                                  })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (name_a.text, text_a.text)
            mblog.created = blog_create_date
            mblog.last_update = datetime.now()

            # The last WB_func block carries the like/forward/comment counters.
            func_div = div.find_all('div', 'WB_func')[-1]

            likes = func_div.find('a',
                                  attrs={
                                      'action-type': action_type_re("like")
                                  }).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re("forward")
                                     }).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(
                    forwards.strip().split('(', 1)[1].strip(')'))
            comments = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re('comment')
                                     }).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(
                    comments.strip().split('(', 1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" +
                                     map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple(
                    [float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo
            # has_video
            div_video = div.find(
                'div', attrs={'node-type': 'fl_h5_video_disp'}) or div.find(
                    'span', attrs={'class': 'icon_playvideo'})
            mblog.has_video = bool(div_video)
            mblog.save()
            self.counter.inc('processed_weibo_posts', 1)

            # fetch forwards and comments
            if self.uid in starts:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    yield forward_url
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    yield comment_url
                if fetch_like and mblog.n_likes > 0:
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    yield like_url

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            params.pop('max_id', None)

        # counter add one for the processed weibo list url
        self.counter.inc('processed_weibo_list_page', 1)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return

        # Yield the next list page exactly once, after max_id has been
        # settled and only when the crawl is not finished. Yielding it from
        # inside the per-post loop emitted it once per post, without max_id,
        # and even when a later post ended the crawl.
        yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
Ejemplo n.º 7
0
    def parse(self, url=None):
        """Parse one page of a user's weibo list, save each post, and
        return the URLs to crawl next.

        The page at ``url`` (default: ``self.url``) is fetched through
        ``self.opener``; its JSON body's ``'data'`` field is parsed as HTML
        and every ``div.WB_feed_type`` carrying a ``mid`` attribute is
        turned into a ``MicroBlog`` record (links, tags, forward origin,
        creation time, like/forward/comment counters).

        Iteration stops early once a post is reached whose ``mid`` is
        already in ``weibo_user.newest_mids`` or whose creation time is not
        newer than ``weibo_user.last_update``.

        :param url: list-page URL to parse; falls back to ``self.url``.
        :return: a 2-tuple ``(next_urls, [])``. ``next_urls`` holds the
            forward/comment/like detail URLs selected by the module-level
            ``fetch_forward`` / ``fetch_comment`` / ``fetch_like`` flags,
            followed by the URL of the next list page. ``([], [])`` is
            returned when the bundle does not exist, when ``self.check``
            rejects the response, or when there is no further page.
        """
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()

        # Build the query parameters of the *next* request. The 'pagebar'
        # value cycles: absent -> '0' -> '1' -> absent (and page advances).
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False

        def count_in_parens(text):
            # Counter links read like "label(12)"; no "(" means zero.
            if '(' not in text:
                return 0
            return int(text.strip().split('(', 1)[1].strip(')'))

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if not mid:
                continue
            max_id = mid

            # The first mid seen becomes end_id unless the URL already had one.
            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                # Reached a post recorded by a previous crawl.
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })

            # Links
            for content_a in content_div.find_all(
                    'a', attrs={'action-type': 'feed_list_url'}):
                href = content_a['href']
                if href not in mblog.links:
                    mblog.links.append(href)

            # Tags
            tags_div = content_div.find('div', attrs={'class': 'wTablist2'})
            if tags_div is not None:
                for tag_a in tags_div.find_all('a'):
                    tag = tag_a.text.strip()
                    if tag and tag not in mblog.tags:
                        mblog.tags.append(tag)

            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
            # NOTE(review): `parse` here resolves to the module-level name,
            # not this method -- presumably a date-string parser; confirm
            # against the file's imports.
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                # Not newer than the last crawl: stop without saving it.
                finished = True
                break

            likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
            likes = likes.strip('(').strip(')')
            mblog.n_likes = int(likes) if likes else 0
            mblog.n_forwards = count_in_parens(
                div.find('a', attrs={'action-type': 'feed_list_forward'}).text)
            mblog.n_comments = count_in_parens(
                div.find('a', attrs={'action-type': 'feed_list_comment'}).text)

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    # The like endpoint takes the id under 'mid', not 'id'.
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)

            mblog.save()

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            # A URL with pagebar=1 may arrive without max_id; a plain `del`
            # raised KeyError in that case, so drop it only if present.
            params.pop('max_id', None)
        self.logger.debug('parse %s finish' % url)

        # No next page: record the crawl state on the user and stop.
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            # Keep at most three remembered mids.
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []

        next_urls.append('%s?%s' %
                         (url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []
Ejemplo n.º 8
0
    def save_blog_detail(self, div, mid):
        """Fill a ``MicroBlog`` record from one feed ``div`` and save it.

        The record for ``(mid, self.uid)`` is loaded if it exists and
        created otherwise. Content text, forward information, creation
        time, like/forward/comment counters, optional geo data and the
        video flag are read from ``div``.

        :param div: parsed HTML element of a single feed item.
        :param mid: id of the post the element describes.
        :return: the saved ``MicroBlog`` instance.
        """
        try:
            mblog = MicroBlog.objects.get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)

        content_div = div.find(
            'div',
            attrs={'class': 'WB_text', 'node-type': 'feed_list_content'})
        date_anchor = div.find('a', attrs={'node-type': 'feed_list_item_date'})
        blog_create_date = parse(date_anchor['title'])

        # Replace every <img type="face"> by its title text so that it
        # survives in the plain-text content (presumably emoticons).
        for face_img in content_div.find_all("img", attrs={'type': 'face'}):
            face_img.replace_with(face_img['title'])
        mblog.content = content_div.text

        if div.get('isforward'):
            # Record the original post id, its author id and the
            # "nick: reason" text.
            mblog.omid = div['omid']
            mblog.ouid = div['tbinfo'].split('&')[0].split('=')[1]
            origin_nick = div.find(
                'a',
                attrs={'class': 'WB_name', 'node-type': 'feed_list_originNick'})
            reason = div.find(
                'div',
                attrs={'class': 'WB_text', 'node-type': 'feed_list_reason'})
            if not (origin_nick is None or reason is None):
                mblog.forward = '%s: %s' % (origin_nick.text, reason.text)
        mblog.created = blog_create_date
        mblog.last_update = datetime.now()

        # Counters are read from the last options bar inside the item.
        option_bars = div.find_all('div',
                                   attrs={'node-type': 'feed_list_options'})
        func_div = option_bars[-1]

        def read_count(action):
            # The action link may be named feed_list_<action> or fl_<action>;
            # the number sits in the link's second <em>.
            pattern = re.compile("^(feed_list|fl)_%s$" % action)
            anchor = func_div.find('a', attrs={'action-type': pattern})
            raw = anchor.find_all('em')[1].text
            raw = raw.strip('(').strip(')').replace(',', '')
            if raw and unicode.isdigit(raw):
                return int(raw)
            return 0

        mblog.n_likes = read_count("like")
        mblog.n_forwards = read_count("forward")
        mblog.n_comments = read_count('comment')

        # Geo info, when the item carries a map block.
        map_info = div.find("div", attrs={'class': 'map_data'})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split('-')[0].strip()
            action_data = map_info.find('a')['action-data']
            geo_info = urldecode("?" + action_data)['geo']
            first, second = [float(part) for part in geo_info.split(',', 1)]
            # 'longtitude' is the attribute name used on Geo; kept as is.
            geo.longtitude = first
            geo.latitude = second
            mblog.geo = geo

        # Video flag: either marker element counts.
        video_marker = div.find('div', attrs={'node-type': 'fl_h5_video_disp'})
        if not video_marker:
            video_marker = div.find('span', attrs={'class': 'icon_playvideo'})
        mblog.has_video = bool(video_marker)

        mblog.save()
        return mblog