Example #1
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('Sleeping for 10 minutes!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')
        try:
            jsn = json.loads(br.response().read())
        except ValueError:
            print('Sleeping for 10 minutes!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')


#         self.logger.debug('load %s finish' % url)

        try:
            soup = beautiful_soup(jsn['data']['html'])
            current_page = jsn['data']['page']['pagenum']
            n_pages = jsn['data']['page']['totalpage']
        except KeyError:
            print('Sleeping for 10 minutes!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')

        if not self.check(url, br):
            return

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        counter_type = None
        #print(u'Comments on weibo: ' + mblog.content)
        if url.startswith('http://weibo.com/aj/comment'):
            counter_type = 'comment'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                uid = dl.find('a', usercard=True)['usercard'].split("id=",
                                                                    1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)
                #print(u'Weibo comment: ' + comment.content)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            counter_type = 'forward'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward_again_a = dl.find(
                    'a',
                    attrs={
                        'action-type': re.compile("^(feed_list|fl)_forward$")
                    })
                uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
                forward = Forward(uid=uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            counter_type = 'like'
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        mblog.save()
        #       self.logger.debug('parse %s finish' % url)

        # increment the counter for the processed comment/forward/like list URL
        if counter_type is not None:
            self.counter.inc('processed_%s_list_page' % counter_type, 1)

        if current_page >= n_pages:
            return

        params = urldecode(url)
        new_params = urldecode('?page=%s' % (current_page + 1))
        params.update(new_params)
        params['__rnd'] = int(time.time() * 1000)
        next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
        yield next_page
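These examples lean on a small urldecode helper that is not shown. A minimal sketch, assuming it just maps a URL's (or a bare '?key=value' string's) query parameters to a flat dict; the name and behavior are inferred from usage, not taken from the crawler:

    # Sketch of the assumed urldecode helper.
    try:
        from urllib.parse import urlparse, parse_qsl  # Python 3
    except ImportError:
        from urlparse import urlparse, parse_qsl  # Python 2

    def urldecode(url):
        # Works for full URLs and bare '?key=value' strings alike.
        return dict(parse_qsl(urlparse(url).query))

    # urldecode('http://weibo.com/aj/comment?id=123&page=2')
    # -> {'id': '123', 'page': '2'}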
Example #2
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        br = None
        jsn = None
        try:
            br = self.opener.browse_open(url)
            self.logger.debug('load %s finish' % url)
            jsn = json.loads(br.response().read())
        except (ValueError, URLError) as e:
            return self._error(url, e)

        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']

        if not self.check(url, br):
            return [], []

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        if url.startswith('http://weibo.com/aj/comment'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                comment = Comment(uid=self.uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward = Forward(uid=self.uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        try:
            mblog.save()
            self.logger.debug('parse %s finish' % url)
        except ValidationError as e:
            return self._error(url, e)
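Examples #2 through #4 funnel failures into a _error helper and use a ([urls], [bundles]) return convention. The helper itself is not shown; a hedged sketch of a plausible shape, in which the failed URL is handed back for retry:

    # Hypothetical _error helper; the real method lives elsewhere in the
    # crawler and may also track repeated failures per bundle.
    def _error(self, url, e):
        self.logger.error('error when fetching %s: %s' % (url, e))
        return [url], []  # re-queue the failed URL, no new bundles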
Example #3
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        br = None
        jsn = None
        try:
            br = self.opener.browse_open(url)
            self.logger.debug("load %s finish" % url)
            jsn = json.loads(br.response().read())
        except (ValueError, URLError) as e:
            return self._error(url, e)

        soup = beautiful_soup(jsn["data"]["html"])
        current_page = jsn["data"]["page"]["pagenum"]
        n_pages = jsn["data"]["page"]["totalpage"]

        if not self.check(url, br):
            return [], []

        decodes = urldecode(url)
        mid = decodes.get("id", decodes.get("mid"))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find("dt").find("img")["src"]
            date = dl.find("dd").find("span", attrs={"class": "S_txt2"}).text
            date = date.strip().strip("(").strip(")")
            instance.created = self.parse_datetime(date)
            for div in dl.find_all("div"):
                div.extract()
            for span in dl.find_all("span"):
                span.extract()
            instance.content = dl.text.strip()

        if url.startswith("http://weibo.com/aj/comment"):
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                comment = Comment(uid=self.uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith("http://weibo.com/aj/mblog/info"):
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                forward = Forward(uid=self.uid, mid=dl["mid"])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith("http://weibo.com/aj/like"):
            lis = soup.find_all("li", uid=True)
            for li in lis:
                like = Like(uid=li["uid"])
                like.avatar = li.find("img")["src"]

                mblog.likes.append(like)

        try:
            mblog.save()
            self.logger.debug("parse %s finish" % url)
        except ValidationError as e:
            return self._error(url, e)
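A note on getattr(MicroBlog, 'objects') in these examples: the indirection is usually there to quiet linters that cannot see MongoEngine's dynamically attached manager, and it behaves exactly like MicroBlog.objects. A rough sketch of what the document model might look like; the field set is an assumption based on how mblog is used above:

    # Assumed MongoEngine schema; only fields exercised above are shown.
    from mongoengine import (Document, EmbeddedDocument, StringField,
                             DateTimeField, ListField, EmbeddedDocumentField)

    class Comment(EmbeddedDocument):
        uid = StringField()
        avatar = StringField()
        content = StringField()
        created = DateTimeField()

    class MicroBlog(Document):
        mid = StringField(required=True)
        uid = StringField(required=True)
        comments = ListField(EmbeddedDocumentField(Comment))
        # forwards and likes would be analogous embedded lists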
Example #4
 def parse(self, url=None):
     if self.bundle.exists == False:
         return [], []
     
     url = url or self.url
     br = None
     jsn = None
     try:
         br = self.opener.browse_open(url)
         self.logger.debug('load %s finish' % url)
         jsn = json.loads(br.response().read())
     except (ValueError, URLError) as e:
         return self._error(url, e)
     
     soup = beautiful_soup(jsn['data']['html'])
     current_page = jsn['data']['page']['pagenum']
     n_pages = jsn['data']['page']['totalpage']
     
     if not self.check(url, br):
         return [], []
     
     decodes = urldecode(url)
     mid = decodes.get('id', decodes.get('mid'))
     
     mblog = self.bundle.current_mblog
     if mblog is None or mblog.mid != mid:
         try:
             mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
         except DoesNotExist:
             mblog = MicroBlog(mid=mid, uid=self.uid)
             mblog.save()
     
     def set_instance(instance, dl):
         instance.avatar = dl.find('dt').find('img')['src']
         date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
         date = date.strip().strip('(').strip(')')
         instance.created = self.parse_datetime(date)
         for div in dl.find_all('div'):
             div.extract()
         for span in dl.find_all('span'):
             span.extract()
         instance.content = dl.text.strip()
     
     if url.startswith('http://weibo.com/aj/comment'):
         dls = soup.find_all('dl', mid=True)
         for dl in dls:
             comment = Comment(uid=self.uid)
             set_instance(comment, dl)
             
             mblog.comments.append(comment)
     elif url.startswith('http://weibo.com/aj/mblog/info'):
         dls = soup.find_all('dl', mid=True)
         for dl in dls:
             forward = Forward(uid=self.uid, mid=dl['mid'])
             set_instance(forward, dl)
             
             mblog.forwards.append(forward)
     elif url.startswith('http://weibo.com/aj/like'):
         lis = soup.find_all('li', uid=True)
         for li in lis:
             like = Like(uid=li['uid'])
             like.avatar = li.find('img')['src']
             
             mblog.likes.append(like)
     
     try:
         mblog.save()
         self.logger.debug('parse %s finish' % url)
     except ValidationError as e:
         return self._error(url, e)
Example #5
    def parse(self, url=None):
        if self.bundle.exists is False:
            return
        
        url = url or self.url
        br = self.opener.browse_open(url)
        try:
            jsn = json.loads(br.response().read())
        except ValueError:
            raise FetchBannedError('fetch banned by weibo server')

#         self.logger.debug('load %s finish' % url)

        try:
            soup = beautiful_soup(jsn['data']['html'])
            current_page = jsn['data']['page']['pagenum']
            n_pages = jsn['data']['page']['totalpage']
        except KeyError:
            raise FetchBannedError('fetch banned by weibo server')
        
        if not self.check(url, br):
            return
        
        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))
        
        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()
        
        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        counter_type = None
        if url.startswith('http://weibo.com/aj/comment'):
            counter_type = 'comment'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)
                
                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            counter_type = 'forward'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward_again_a = dl.find('a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
                uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
                forward = Forward(uid=uid, mid=dl['mid'])
                set_instance(forward, dl)
                
                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            counter_type = 'like'
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']
                
                mblog.likes.append(like)

        mblog.save()
#       self.logger.debug('parse %s finish' % url)

        # increment the counter for the processed comment/forward/like list URL
        if counter_type is not None:
            self.counter.inc('processed_%s_list_page' % counter_type, 1)

        if current_page >= n_pages:
            return
        
        params = urldecode(url)
        new_params = urldecode('?page=%s' % (current_page + 1))
        params.update(new_params)
        params['__rnd'] = int(time.time() * 1000)
        next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
        yield next_page
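Examples #1, #5, and #7 are generators: rather than returning URL lists they yield the next page's URL. A sketch of how a caller might drive that style; crawl_pages and the parser argument are illustrative names, not the crawler's actual scheduler:

    # Illustrative driver for the generator-style parse().
    def crawl_pages(parser, start_url):
        pending = [start_url]
        while pending:
            url = pending.pop()
            for next_url in parser.parse(url):
                pending.append(next_url)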
Example #6
 def parse(self, url=None):
     if self.bundle.exists == False:
         return [], []
     
     url = url or self.url
     br = self.opener.browse_open(url)
     jsn = json.loads(br.response().read())
     soup = BeautifulSoup(jsn['data']['html'])
     current_page = jsn['data']['page']['pagenum']
     n_pages = jsn['data']['page']['totalpage']
     
     if not self.check(url, br):
         return [], []
     
     weibo_user = self.get_weibo_user()
     decodes = urldecode(url)
     mid = decodes.get('id', decodes.get('mid'))
     
     mblogs = weibo_user.statuses
     mblog = None
     for m in mblogs:
         if m.mid == mid:
             mblog = m
             break
     if mblog is None:
         mblog = MicroBlog(mid=mid)
         weibo_user.statuses.append(mblog)
     
     def set_instance(instance, dl):
         instance.avatar = dl.find('dt').find('img')['src']
         date = dl.find('dd').find('span', attrs={'class': 'S_txt2'}).text
         date = date.strip().strip('(').strip(')')
         instance.created = self.parse_datetime(date)
         for div in dl.find_all('div'):
             div.extract()
         for span in dl.find_all('span'):
             span.extract()
         instance.content = dl.text.strip()
     
     if url.startswith('http://weibo.com/aj/comment'):
         dls = soup.find_all('dl', mid=True)
         for dl in dls:
             comment = Comment(uid=self.uid)
             set_instance(comment, dl)
             
             mblog.comments.append(comment)
     elif url.startswith('http://weibo.com/aj/mblog/info'):
         dls = soup.find_all('dl', mid=True)
         for dl in dls:
             forward = Forward(uid=self.uid, mid=dl['mid'])
             set_instance(forward, dl)
             
             mblog.forwards.append(forward)
     elif url.startswith('http://weibo.com/aj/like'):
         lis = soup.find_all('li', uid=True)
         for li in lis:
             like = Like(uid=li['uid'])
             like.avatar = li.find('img')['src']
             
             mblog.likes.append(like)
     
     weibo_user.save()
     
     if current_page >= n_pages:
         return [], []
     
     params = urldecode(url)
     next_page = soup.find('a', attrs={'class': 'btn_page_next'})
     if next_page is not None:
         try:
             next_page_str = next_page['action-data']
         except KeyError:
             next_page_str = next_page.find('span')['action-data']
         new_params = urldecode('?%s' % next_page_str)
         params.update(new_params)
         params['__rnd'] = int(time.time() * 1000)
         next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
         return [next_page], []
 
     return [], []
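set_instance hands the cleaned-up date string to self.parse_datetime, which is not shown. A guess at what it has to handle, given weibo's mix of relative and absolute timestamps; the exact patterns are assumptions:

    # Assumed parse_datetime sketch for strings like u'5分钟前'
    # ("5 minutes ago") or u'今天 10:24' ("today 10:24").
    import re
    from datetime import datetime, timedelta

    def parse_datetime(s):
        s = s.strip()
        m = re.match(u'(\d+)分钟前', s)        # "N minutes ago"
        if m:
            return datetime.now() - timedelta(minutes=int(m.group(1)))
        m = re.match(u'今天 (\d+):(\d+)', s)   # "today HH:MM"
        if m:
            return datetime.now().replace(hour=int(m.group(1)),
                                          minute=int(m.group(2)))
        return datetime.strptime(s, '%Y-%m-%d %H:%M')  # absolute fallback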
Example #7
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        jsn = json.loads(br.response().read())

        #         self.logger.debug('load %s finish' % url)

        soup = beautiful_soup(jsn["data"]["html"])
        current_page = jsn["data"]["page"]["pagenum"]
        n_pages = jsn["data"]["page"]["totalpage"]

        if not self.check(url, br):
            return

        decodes = urldecode(url)
        mid = decodes.get("id", decodes.get("mid"))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find("dt").find("img")["src"]
            date = dl.find("dd").find(attrs={"class": "S_txt2"}).text
            date = date.strip().strip("(").strip(")")
            instance.created = self.parse_datetime(date)
            for div in dl.find_all("div"):
                div.extract()
            for span in dl.find_all("span"):
                span.extract()
            instance.content = dl.text.strip()

        counter_type = None
        if url.startswith("http://weibo.com/aj/comment"):
            counter_type = "comment"
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                uid = dl.find("a", usercard=True)["usercard"].split("id=", 1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith("http://weibo.com/aj/mblog/info"):
            counter_type = "forward"
            dls = soup.find_all("dl", mid=True)
            for dl in dls:
                forward_again_a = dl.find("a", attrs={"action-type": re.compile("^(feed_list|fl)_forward$")})
                uid = urldecode("?%s" % forward_again_a["action-data"])["uid"]
                forward = Forward(uid=uid, mid=dl["mid"])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith("http://weibo.com/aj/like"):
            counter_type = "like"
            lis = soup.find_all("li", uid=True)
            for li in lis:
                like = Like(uid=li["uid"])
                like.avatar = li.find("img")["src"]

                mblog.likes.append(like)

        mblog.save()
        #       self.logger.debug('parse %s finish' % url)

        # increment the counter for the processed comment/forward/like list URL
        if counter_type is not None:
            self.counter.inc("processed_%s_list_page" % counter_type, 1)

        if current_page >= n_pages:
            return

        params = urldecode(url)
        new_params = urldecode("?page=%s" % (current_page + 1))
        params.update(new_params)
        params["__rnd"] = int(time.time() * 1000)
        next_page = "%s?%s" % (url.split("?")[0], urllib.urlencode(params))
        yield next_page
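Examples #1, #5, and #7 also bump self.counter once per processed comment, forward, or like list page. The counter object is external; a minimal stand-in matching the inc(key, n) call, noting that a real crawler would likely back it with shared, process-safe storage:

    # Minimal stand-in for the assumed counter interface.
    from collections import defaultdict

    class Counter(object):
        def __init__(self):
            self._counts = defaultdict(int)

        def inc(self, key, n=1):
            self._counts[key] += n

        def get(self, key):
            return self._counts[key]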
Example #8
                mblog.forwards.append(forward)
                self.bundle.fetched_last_forward_id = mblog.mid
                self.bundle.fetched_weibo_forward_num += 1
        elif url.startswith('http://weibo.com/aj/like'):
            lis = soup.find_all('li', uid=True)
            for li in lis:
                if fetch_like_limit > 0 and self.bundle.fetched_weibo_like_num >= fetch_like_limit:
                    self.bundle.fetched_weibo_like_num = 0
                    try:
                        mblog.save()
                        self.logger.debug('parse %s finish' % url)
                    except ValidationError as e:
                        return self._error(url, e)
                    return [], []
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']
                if fetch_like_limit > 0 and self.bundle.fetched_last_like_id != mblog.mid:
                    self.bundle.fetched_weibo_like_num = 0
                
                mblog.likes.append(like)
                self.bundle.fetched_last_like_id = mblog.mid
                self.bundle.fetched_weibo_like_num += 1

        try:
            mblog.save()
            self.logger.debug('parse %s finish' % url)
        except ValidationError as e:
            return self._error(url, e)
        
        if current_page >= n_pages:
            return [], []
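Example #8 is a fragment: it interleaves the forward and like handling with per-bundle fetch limits (fetch_like_limit, fetched_weibo_like_num) so a crawl can stop after a fixed number of items per weibo. The repeated check-reset-save dance could be factored into one helper; a sketch with assumed names:

    # Illustrative limit helper; not part of the original crawler.
    def _limit_reached(self, kind, limit):
        fetched = getattr(self.bundle, 'fetched_weibo_%s_num' % kind)
        if limit > 0 and fetched >= limit:
            setattr(self.bundle, 'fetched_weibo_%s_num' % kind, 0)
            return True
        return False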