Beispiel #1
0
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug("load %s finish" % url)

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()

        params["_t"] = 0
        params["__rnd"] = str(int(time.time() * 1000))
        page = int(params.get("page", 1))
        pre_page = int(params.get("pre_page", 0))
        count = 15
        if "pagebar" not in params:
            params["pagebar"] = "0"
            pre_page += 1
        elif params["pagebar"] == "0":
            params["pagebar"] = "1"
        elif params["pagebar"] == "1":
            del params["pagebar"]
            pre_page = page
            page += 1
            count = 50
        params["count"] = count
        params["page"] = page
        params["pre_page"] = pre_page

        data = json.loads(br.response().read())["data"]
        soup = beautiful_soup(data)
        finished = False

        divs = soup.find_all("div", attrs={"class": "WB_feed_type"}, mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div["mid"]
            if len(mid) == 0:
                continue
            max_id = mid

            if "end_id" not in params:
                params["end_id"] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_content"})
            for img in content_div.find_all("img", attrs={"type": "face"}):
                img.replace_with(img["title"])
            mblog.content = content_div.text
            is_forward = div.get("isforward") == "1"
            if is_forward:
                name_a = div.find("a", attrs={"class": "WB_name", "node-type": "feed_list_originNick"})
                text_a = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_reason"})
                if name_a is not None and text_a is not None:
                    mblog.forward = "%s: %s" % (name_a.text, text_a.text)
            mblog.created = parse(div.select("a.S_link2.WB_time")[0]["title"])

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and mblog.created <= weibo_user.last_update:
                finished = True
                break

            likes = div.find("a", attrs={"action-type": "feed_list_like"}).text
            likes = likes.strip("(").strip(")")
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = div.find("a", attrs={"action-type": "feed_list_forward"}).text
            if "(" not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split("(", 1)[1].strip(")"))
            comments = div.find("a", attrs={"action-type": "feed_list_comment"}).text
            if "(" not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split("(", 1)[1].strip(")"))

            # fetch geo info
            map_info = div.find("div", attrs={"class": "map_data"})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split("-")[0].strip()
                geo_info = urldecode("?" + map_info.find("a")["action-data"])["geo"]
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(",", 1)])
                mblog.geo = geo

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {"id": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = "http://weibo.com/aj/comment/big?%s" % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = "http://weibo.com/aj/mblog/info/big?%s" % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {"mid": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                    query_str = urllib.urlencode(query)
                    like_url = "http://weibo.com/aj/like/big?%s" % query_str
                    next_urls.append(like_url)

            mblog.save()

        if "pagebar" in params:
            params["max_id"] = max_id
        else:
            del params["max_id"]
        self.logger.debug("parse %s finish" % url)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in self.bundle.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []

        next_urls.append("%s?%s" % (url.split("?")[0], urllib.urlencode(params)))
        return next_urls, []
Beispiel #2
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        params = urldecode(url)
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 *
                       10)  #         self.logger.debug('load %s finish' % url)

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        try:
            data = json.loads(br.response().read())['data']
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)  # self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(data)
        finished = False

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid

            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
                continue  #认为已经爬过了
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text
            #print(u'微博内容:'+mblog.content)
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a',
                                  attrs={
                                      'class': 'WB_name',
                                      'node-type': 'feed_list_originNick'
                                  })
                text_a = div.find('div',
                                  attrs={
                                      'class': 'WB_text',
                                      'node-type': 'feed_list_reason'
                                  })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (name_a.text, text_a.text)
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                finished = True
                break

            func_div = div.find_all('div', 'WB_func')[-1]
            action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

            likes = func_div.find('a',
                                  attrs={
                                      'action-type': action_type_re("like")
                                  }).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re("forward")
                                     }).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(',
                                                              1)[1].strip(')'))
            comments = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re('comment')
                                     }).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(',
                                                              1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" +
                                     map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple(
                    [float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    yield forward_url
                if fetch_comment and mblog.n_comments > fetch_n_comments:  #只抓取评论数多于规定条数的微博
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    yield comment_url
                if fetch_like and mblog.n_likes > 0:
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    yield like_url

            mblog.save()

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            del params['max_id']


#         self.logger.debug('parse %s finish' % url)

# counter add one for the processed weibo list url
        self.counter.inc('processed_weibo_list_page', 1)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return

        yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []
        
        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        
        if not self.check(url, br):
            return [], []
            
        weibo_user = self.get_weibo_user()
        
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page
        
        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False
        
        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid
            
            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)
            
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div', attrs={
                'class': 'WB_text', 
                'node-type': 'feed_list_content'
            })
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title']);
            mblog.content = content_div.text
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a', attrs={
                    'class': 'WB_name', 
                    'node-type': 'feed_list_originNick'
                })
                text_a = div.find('div', attrs={
                    'class': 'WB_text',
                    'node-type': 'feed_list_reason'
                })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (
                        name_a.text,
                        text_a.text
                    )
            #mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
            #ci
            #
            temp = parse(div.select('a.S_link2.WB_time')[0]['title'])
            tempstring = temp.strftime("%Y-%m-%d-%H-%M-%S")
            list=tempstring.split('-')
            tempyear=list[0]
            tempmonth=list[1]
            tempday=list[2]
            temphour=list[3]
            tempmin=list[4]
            tempsec=list[5]
            temptime=time.mktime(datetime(int(tempyear),int(tempmonth),int(tempday),int(temphour),int(tempmin),int(tempsec)).timetuple())
            print temptime
            
            timevalue=open("D:\\09Limited_buffer\\earlywarningbyci\\cola\\contrib\\weibo\\timevalue.txt","r")
            time_re=timevalue.readline()
            timevalue.close()
            list=time_re.split()
            starttime=list[0]
            endtime=list[1]
            print starttime
            temptime=round(float(temptime))
            starttime=round(float(starttime))
            endtime=round(float(endtime))
            if temptime>=starttime and temptime<=endtime:
                mblog.created = temp
                #timeok = True
                print "------OKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOK-----"
            else:
                if temptime<starttime:
                    print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
                    time.sleep(5)
                    return [], []
                #continue
            #
            # 
            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                finished = True
                break

            func_div = div.find_all('div', 'WB_func')[-1]
            action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)
            
            likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
            comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
                
            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo
            
            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)
            
            mblog.save()
        
        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            del params['max_id']
        self.logger.debug('parse %s finish' % url)
                
        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []
        
        next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []
Beispiel #4
0
 def parse(self, url=None):
     if self.bundle.exists == False:
         return [], []
     
     url = url or self.url
     params = urldecode(url)
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     
     if not self.check(url, br):
         return [], []
         
     weibo_user = self.get_weibo_user()
     
     params['_t'] = 0
     params['__rnd'] = str(int(time.time() * 1000))
     page = int(params.get('page', 1))
     pre_page = int(params.get('pre_page', 0))
     count = 15
     if 'pagebar' not in params:
         params['pagebar'] = '0'
         pre_page += 1
     elif params['pagebar'] == '0':
         params['pagebar'] = '1'
     elif params['pagebar'] == '1':
         del params['pagebar']
         pre_page = page
         page += 1
         count = 50
     params['count'] = count
     params['page'] = page
     params['pre_page'] = pre_page
     
     data = json.loads(br.response().read())['data']
     soup = beautiful_soup(data)
     finished = False
     
     divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
     max_id = None
     next_urls = []
     for div in divs:
         mid = div['mid']
         if len(mid) == 0:
             continue
         max_id = mid
         
         if 'end_id' not in params:
             params['end_id'] = mid
         if mid in weibo_user.newest_mids:
             finished = True
             break
         if len(self.bundle.newest_mids) < 3:
             self.bundle.newest_mids.append(mid)
         
         try:
             mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
         except DoesNotExist:
             mblog = MicroBlog(mid=mid, uid=self.uid)
         content_div = div.find('div', attrs={
             'class': 'WB_text', 
             'node-type': 'feed_list_content'
         })
         for img in content_div.find_all("img", attrs={'type': 'face'}):
             img.replace_with(img['title']);
         mblog.content = content_div.text
         is_forward = div.get('isforward') == '1'
         if is_forward:
             name_a = div.find('a', attrs={
                 'class': 'WB_name', 
                 'node-type': 'feed_list_originNick'
             })
             text_a = div.find('div', attrs={
                 'class': 'WB_text',
                 'node-type': 'feed_list_reason'
             })
             if name_a is not None and text_a is not None:
                 mblog.forward = '%s: %s' % (
                     name_a.text,
                     text_a.text
                 )
         mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
         
         if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
             self.bundle.last_update = mblog.created
         if weibo_user.last_update is not None and \
             mblog.created <= weibo_user.last_update:
             finished = True
             break
         
         likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
         likes = likes.strip('(').strip(')')
         likes = 0 if len(likes) == 0 else int(likes)
         mblog.n_likes = likes
         forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
         if '(' not in forwards:
             mblog.n_forwards = 0
         else:
             mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
         comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
         if '(' not in comments:
             mblog.n_comments = 0
         else:
             mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
             
         # fetch geo info
         map_info = div.find("div", attrs={'class': 'map_data'})
         if map_info is not None:
             geo = Geo()
             geo.location = map_info.text.split('-')[0].strip()
             geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo']
             geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
             mblog.geo = geo
         
         # fetch forwards and comments
         if fetch_forward or fetch_comment or fetch_like:
             query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
             query_str = urllib.urlencode(query)
             if fetch_forward and mblog.n_forwards > 0:
                 forward_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                 next_urls.append(forward_url)
             if fetch_comment and mblog.n_comments > 0:
                 comment_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                 next_urls.append(comment_url)
             if fetch_like and mblog.n_likes > 0:
                 query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                 query_str = urllib.urlencode(query)
                 like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                 next_urls.append(like_url)
         
         mblog.save()
     
     if 'pagebar' in params:
         params['max_id'] = max_id
     else:
         del params['max_id']
     self.logger.debug('parse %s finish' % url)
             
     # if not has next page
     if len(divs) == 0 or finished:
         weibo_user = self.get_weibo_user()
         for mid in self.bundle.newest_mids:
             if mid not in self.bundle.newest_mids:
                 weibo_user.newest_mids.append(mid)
         while len(weibo_user.newest_mids) > 3:
             weibo_user.newest_mids.pop()
         weibo_user.last_update = self.bundle.last_update
         weibo_user.save()
         return [], []
     
     next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params)))
     return next_urls, []
Beispiel #5
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        params = urldecode(url)
        try:
            br = self.opener.browse_open(url)
        except URLError:
            raise FetchBannedError()

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        try:
            data = json.loads(br.response().read())['data']
        except (ValueError, KeyError):
            raise FetchBannedError('fetch banned by weibo server')
        soup = beautiful_soup(data)
        finished = False

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid
            blog_create_date = parse(
                div.select('a.S_link2.WB_time')[0]['title'])
            # skip all following blogs if create date less than effective start date
            if (blog_create_date - effective_start_date).days < 0:
                self.logger.info(
                    "%s: blog has sync up after %s" %
                    (self.uid, effective_start_date.strftime("%Y%m%d")))
                finished = True
                break

            if 'end_id' not in params:
                params['end_id'] = mid
            # skip
            #if weibo_user.newest_mids and not mid in weibo_user.newest_mids:
            #    self.logger.info("%s: reach earliest blog %s" % (self.uid,mid))
            #    finished = True
            #    break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text
            is_forward = div.get('isforward')
            if is_forward:
                # write origional user, msg
                mblog.omid = div['omid']
                tbinfos = div['tbinfo'].split('&')
                mblog.ouid = tbinfos[0].split('=')[1]
                name_a = div.find('a',
                                  attrs={
                                      'class': 'WB_name',
                                      'node-type': 'feed_list_originNick'
                                  })
                text_a = div.find('div',
                                  attrs={
                                      'class': 'WB_text',
                                      'node-type': 'feed_list_reason'
                                  })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (name_a.text, text_a.text)
            mblog.created = blog_create_date
            mblog.last_update = datetime.now()

            func_div = div.find_all('div', 'WB_func')[-1]
            action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

            likes = func_div.find('a',
                                  attrs={
                                      'action-type': action_type_re("like")
                                  }).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re("forward")
                                     }).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(',
                                                              1)[1].strip(')'))
            comments = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re('comment')
                                     }).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(',
                                                              1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" +
                                     map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple(
                    [float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo
            # has_video
            div_video = div.find(
                'div', attrs={'node-type': 'fl_h5_video_disp'}) or div.find(
                    'span', attrs={'class': 'icon_playvideo'})
            mblog.has_video = True if div_video else False
            mblog.save()
            self.counter.inc('processed_weibo_posts', 1)

            # fetch forwards and comments
            if self.uid in starts:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    yield forward_url
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    yield comment_url
                if fetch_like and mblog.n_likes > 0:
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    yield like_url

            yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))

        if params.has_key('pagebar'):
            params['max_id'] = max_id
        elif params.has_key('max_id'):
            del params['max_id']


#         self.logger.debug('parse %s finish' % url)

# counter add one for the processed weibo list url
        self.counter.inc('processed_weibo_list_page', 1)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return
Beispiel #6
0
 def parse(self, url=None):
     if self.bundle.exists == False:
         return
     
     url = url or self.url
     params = urldecode(url)
     br = self.opener.browse_open(url)
     
     if not self.check(url, br):
         return
         
     weibo_user = self.get_weibo_user()
     
     params['_t'] = 0
     params['__rnd'] = str(int(time.time() * 1000))
     page = int(params.get('page', 1))
     pre_page = params.get('pre_page', 1)
     if 'pagebar' not in params:
         params['pagebar'] = '0'
     elif params['pagebar'] == '0':
         params['pagebar'] = '1'
     elif params['pagebar'] == '1':
         del params['pagebar']
         pre_page = page
         page += 1
     count = 15
     params['count'] = count
     params['page'] = page
     params['pre_page'] = pre_page
     
     data = json.loads(br.response().read())['data']
     soup = BeautifulSoup(data)
     
     divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
     max_id = None
     for div in divs:
         mid = div['mid']
         if len(mid) == 0:
             continue
         max_id = mid
         
         if 'end_id' not in params:
             params['end_id'] = mid
         if weibo_user.newest_mid is not None and \
             weibo_user.newest_mid == mid:
             break
         
         mblog = MicroBlog(mid=mid)
         mblog.content = div.find('div', attrs={
             'class': 'WB_text', 
             'node-type': 'feed_list_content'
         }).text
         is_forward = div.get('isforward') == '1'
         if is_forward:
             name_a = div.find('a', attrs={
                 'class': 'WB_name', 
                 'node-type': 'feed_list_originNick'
             })
             text_a = div.find('div', attrs={
                 'class': 'WB_text',
                 'node-type': 'feed_list_reason'
             })
             if name_a is not None and text_a is not None:
                 mblog.forward = '%s: %s' % (
                     name_a.text,
                     text_a.text
                 )
         mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
         likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
         likes = likes.strip('(').strip(')')
         likes = 0 if len(likes) == 0 else int(likes)
         mblog.likes = likes
         forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
         if '(' not in forwards:
             mblog.forwards = 0
         else:
             mblog.forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
         comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
         if '(' not in comments:
             mblog.comments = 0
         else:
             mblog.comments = int(comments.strip().split('(', 1)[1].strip(')'))
         
         weibo_user.statuses.append(mblog)
             
     params['max_id'] = max_id
             
     # if not has next page
     if len(divs) < count:
         weibo_user.newest_mid = params['end_id']
         weibo_user.save()
         return [], []
     
     weibo_user.save()
     return ['%s?%s'%(url.split('?')[0], urllib.urlencode(params)), ], []
Beispiel #7
0
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []
        
        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        
        if not self.check(url, br):
            return [], []
            
        weibo_user = self.get_weibo_user()
        
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page
        
        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False
        
        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid
            
            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)
            
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div', attrs={
                'class': 'WB_text', 
                'node-type': 'feed_list_content'
            })

            mblog.content = content_div.text
            
            # Links
            for content_a in content_div.find_all('a', 
                attrs={'action-type': 'feed_list_url'}):
                href = content_a['href']
                if href not in mblog.links:
                    mblog.links.append(href)
                    
            # tags
            tags_div = content_div.find('div', attrs={'class': 'wTablist2'})
            if tags_div is not None:
                for tag_a in tags_div.find_all('a'):
                    tag = tag_a.text.strip()
                    if len(tag) > 0 and tag not in mblog.tags:
                        mblog.tags.append(tag)
                    
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
            
            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                finished = True
                break
            
            likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
            comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
            
            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)
            
            mblog.save()
        
        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            del params['max_id']
        self.logger.debug('parse %s finish' % url)
                
        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []
        
        next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []
Beispiel #8
0
    def parse(self, url=None):
        if self.bundle.exists == False:
            return

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        statuses = weibo_user.statuses

        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = params.get('pre_page', 1)
        if 'pagebar' not in params:
            params['pagebar'] = '0'
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
        count = 15
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        data = json.loads(br.response().read())['data']
        soup = BeautifulSoup(data)

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid

            if 'end_id' not in params:
                params['end_id'] = mid
            if weibo_user.newest_mid is not None and \
                weibo_user.newest_mid == mid:
                break

            mblog = MicroBlog(mid=mid)
            mblog.content = div.find('div',
                                     attrs={
                                         'class': 'WB_text',
                                         'node-type': 'feed_list_content'
                                     }).text
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.forward = '%s: %s' % (
                    div.find('a',
                             attrs={
                                 'class': 'WB_name',
                                 'node-type': 'feed_list_originNick'
                             }).text,
                    div.find('div',
                             attrs={
                                 'class': 'WB_text',
                                 'node-type': 'feed_list_reason'
                             }).text)
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
            likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.likes = likes
            forwards = div.find('a',
                                attrs={
                                    'action-type': 'feed_list_forward'
                                }).text
            if '(' not in forwards:
                mblog.forwards = 0
            else:
                mblog.forwards = int(forwards.strip().split('(',
                                                            1)[1].strip(')'))
            comments = div.find('a',
                                attrs={
                                    'action-type': 'feed_list_comment'
                                }).text
            if '(' not in comments:
                mblog.comments = 0
            else:
                mblog.comments = int(comments.strip().split('(',
                                                            1)[1].strip(')'))

            statuses.append(mblog)

        params['max_id'] = max_id

        # if not has next page
        if len(divs) < count:
            weibo_user.newest_mid = params['end_id']
            weibo_user.save()
            return [], []

        weibo_user.save()
        return [
            '%s?%s' % (url.split('?')[0], urllib.urlencode(params)),
        ], []
Beispiel #9
0
    def save_blog_detail(self, div, mid):
        try:
            mblog = getattr(MicroBlog,
                            'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find('div',
                               attrs={
                                   'class': 'WB_text',
                                   'node-type': 'feed_list_content'
                               })
        blog_create_date = parse(
            div.find('a', attrs={'node-type': 'feed_list_item_date'})['title'])

        for img in content_div.find_all("img", attrs={'type': 'face'}):
            img.replace_with(img['title'])
        mblog.content = content_div.text
        is_forward = div.get('isforward')
        if is_forward:
            # write origional user, msg
            mblog.omid = div['omid']
            tbinfos = div['tbinfo'].split('&')
            mblog.ouid = tbinfos[0].split('=')[1]
            name_a = div.find('a',
                              attrs={
                                  'class': 'WB_name',
                                  'node-type': 'feed_list_originNick'
                              })
            text_a = div.find('div',
                              attrs={
                                  'class': 'WB_text',
                                  'node-type': 'feed_list_reason'
                              })
            if name_a is not None and text_a is not None:
                mblog.forward = '%s: %s' % (name_a.text, text_a.text)
        mblog.created = blog_create_date
        mblog.last_update = datetime.now()

        func_div = div.find_all('div',
                                attrs={'node-type': 'feed_list_options'})[-1]
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        likes = func_div.find('a',
                              attrs={
                                  'action-type': action_type_re("like")
                              }).find_all('em')[1].text
        likes = likes.strip('(').strip(')').replace(',', '')
        likes = int(likes) if likes and unicode.isdigit(likes) else 0
        mblog.n_likes = likes
        forwards = func_div.find('a',
                                 attrs={
                                     'action-type': action_type_re("forward")
                                 }).find_all('em')[1].text
        forwards = forwards.strip('(').strip(')').replace(',', '')
        mblog.n_forwards = int(
            forwards) if forwards and unicode.isdigit(forwards) else 0
        comments = func_div.find('a',
                                 attrs={
                                     'action-type': action_type_re('comment')
                                 }).find_all('em')[1].text
        comments = comments.strip('(').strip(')').replace(',', '')
        mblog.n_comments = int(
            comments) if comments and unicode.isdigit(comments) else 0

        # fetch geo info
        map_info = div.find("div", attrs={'class': 'map_data'})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split('-')[0].strip()
            geo_info = urldecode("?" +
                                 map_info.find('a')['action-data'])['geo']
            geo.longtitude, geo.latitude = tuple(
                [float(itm) for itm in geo_info.split(',', 1)])
            mblog.geo = geo
        # has_video
        div_video = div.find('div', attrs={
            'node-type': 'fl_h5_video_disp'
        }) or div.find('span', attrs={'class': 'icon_playvideo'})
        mblog.has_video = True if div_video else False
        mblog.save()
        return mblog