コード例 #1
0
ファイル: parsers.py プロジェクト: Chenxofhit/cola
 def parse(self, url=None):
     if self.bundle.exists == False:
         return [], []
     
     url = url or self.url
     params = urldecode(url)
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     
     if not self.check(url, br):
         return [], []
         
     weibo_user = self.get_weibo_user()
     
     params['_t'] = 0
     params['__rnd'] = str(int(time.time() * 1000))
     page = int(params.get('page', 1))
     pre_page = int(params.get('pre_page', 0))
     count = 15
     if 'pagebar' not in params:
         params['pagebar'] = '0'
         pre_page += 1
     elif params['pagebar'] == '0':
         params['pagebar'] = '1'
     elif params['pagebar'] == '1':
         del params['pagebar']
         pre_page = page
         page += 1
         count = 50
     params['count'] = count
     params['page'] = page
     params['pre_page'] = pre_page
     
     data = json.loads(br.response().read())['data']
     soup = beautiful_soup(data)
     finished = False
     
     divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
     max_id = None
     next_urls = []
     for div in divs:
         mid = div['mid']
         if len(mid) == 0:
             continue
         max_id = mid
         
         if 'end_id' not in params:
             params['end_id'] = mid
         if mid in weibo_user.newest_mids:
             finished = True
             break
         if len(self.bundle.newest_mids) < 3:
             self.bundle.newest_mids.append(mid)
         
         try:
             mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
         except DoesNotExist:
             mblog = MicroBlog(mid=mid, uid=self.uid)
         content_div = div.find('div', attrs={
             'class': 'WB_text', 
             'node-type': 'feed_list_content'
         })
         for img in content_div.find_all("img", attrs={'type': 'face'}):
             img.replace_with(img['title']);
         mblog.content = content_div.text
         is_forward = div.get('isforward') == '1'
         if is_forward:
             name_a = div.find('a', attrs={
                 'class': 'WB_name', 
                 'node-type': 'feed_list_originNick'
             })
             text_a = div.find('div', attrs={
                 'class': 'WB_text',
                 'node-type': 'feed_list_reason'
             })
             if name_a is not None and text_a is not None:
                 mblog.forward = '%s: %s' % (
                     name_a.text,
                     text_a.text
                 )
         mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
         
         if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
             self.bundle.last_update = mblog.created
         if weibo_user.last_update is not None and \
             mblog.created <= weibo_user.last_update:
             finished = True
             break
         
         likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
         likes = likes.strip('(').strip(')')
         likes = 0 if len(likes) == 0 else int(likes)
         mblog.n_likes = likes
         forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
         if '(' not in forwards:
             mblog.n_forwards = 0
         else:
             mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
         comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
         if '(' not in comments:
             mblog.n_comments = 0
         else:
             mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
             
         # fetch geo info
         map_info = div.find("div", attrs={'class': 'map_data'})
         if map_info is not None:
             geo = Geo()
             geo.location = map_info.text.split('-')[0].strip()
             geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo']
             geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
             mblog.geo = geo
         
         # fetch forwards and comments
         if fetch_forward or fetch_comment or fetch_like:
             query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
             query_str = urllib.urlencode(query)
             if fetch_forward and mblog.n_forwards > 0:
                 forward_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                 next_urls.append(forward_url)
             if fetch_comment and mblog.n_comments > 0:
                 comment_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                 next_urls.append(comment_url)
             if fetch_like and mblog.n_likes > 0:
                 query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                 query_str = urllib.urlencode(query)
                 like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                 next_urls.append(like_url)
         
         mblog.save()
     
     if 'pagebar' in params:
         params['max_id'] = max_id
     else:
         del params['max_id']
     self.logger.debug('parse %s finish' % url)
             
     # if not has next page
     if len(divs) == 0 or finished:
         weibo_user = self.get_weibo_user()
         for mid in self.bundle.newest_mids:
             if mid not in self.bundle.newest_mids:
                 weibo_user.newest_mids.append(mid)
         while len(weibo_user.newest_mids) > 3:
             weibo_user.newest_mids.pop()
         weibo_user.last_update = self.bundle.last_update
         weibo_user.save()
         return [], []
     
     next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params)))
     return next_urls, []
コード例 #2
0
ファイル: parsers.py プロジェクト: renchaorevee/cola
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug("load %s finish" % url)

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()

        params["_t"] = 0
        params["__rnd"] = str(int(time.time() * 1000))
        page = int(params.get("page", 1))
        pre_page = int(params.get("pre_page", 0))
        count = 15
        if "pagebar" not in params:
            params["pagebar"] = "0"
            pre_page += 1
        elif params["pagebar"] == "0":
            params["pagebar"] = "1"
        elif params["pagebar"] == "1":
            del params["pagebar"]
            pre_page = page
            page += 1
            count = 50
        params["count"] = count
        params["page"] = page
        params["pre_page"] = pre_page

        data = json.loads(br.response().read())["data"]
        soup = beautiful_soup(data)
        finished = False

        divs = soup.find_all("div", attrs={"class": "WB_feed_type"}, mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div["mid"]
            if len(mid) == 0:
                continue
            max_id = mid

            if "end_id" not in params:
                params["end_id"] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_content"})
            for img in content_div.find_all("img", attrs={"type": "face"}):
                img.replace_with(img["title"])
            mblog.content = content_div.text
            is_forward = div.get("isforward") == "1"
            if is_forward:
                name_a = div.find("a", attrs={"class": "WB_name", "node-type": "feed_list_originNick"})
                text_a = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_reason"})
                if name_a is not None and text_a is not None:
                    mblog.forward = "%s: %s" % (name_a.text, text_a.text)
            mblog.created = parse(div.select("a.S_link2.WB_time")[0]["title"])

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and mblog.created <= weibo_user.last_update:
                finished = True
                break

            likes = div.find("a", attrs={"action-type": "feed_list_like"}).text
            likes = likes.strip("(").strip(")")
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = div.find("a", attrs={"action-type": "feed_list_forward"}).text
            if "(" not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split("(", 1)[1].strip(")"))
            comments = div.find("a", attrs={"action-type": "feed_list_comment"}).text
            if "(" not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split("(", 1)[1].strip(")"))

            # fetch geo info
            map_info = div.find("div", attrs={"class": "map_data"})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split("-")[0].strip()
                geo_info = urldecode("?" + map_info.find("a")["action-data"])["geo"]
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(",", 1)])
                mblog.geo = geo

            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {"id": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = "http://weibo.com/aj/comment/big?%s" % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = "http://weibo.com/aj/mblog/info/big?%s" % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {"mid": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                    query_str = urllib.urlencode(query)
                    like_url = "http://weibo.com/aj/like/big?%s" % query_str
                    next_urls.append(like_url)

            mblog.save()

        if "pagebar" in params:
            params["max_id"] = max_id
        else:
            del params["max_id"]
        self.logger.debug("parse %s finish" % url)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in self.bundle.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []

        next_urls.append("%s?%s" % (url.split("?")[0], urllib.urlencode(params)))
        return next_urls, []
コード例 #3
0
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []
        
        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        
        if not self.check(url, br):
            return [], []
            
        weibo_user = self.get_weibo_user()
        
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page
        
        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False
        
        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if len(mid) == 0:
                continue
            max_id = mid
            
            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)
            
            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div', attrs={
                'class': 'WB_text', 
                'node-type': 'feed_list_content'
            })
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title']);
            mblog.content = content_div.text
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a', attrs={
                    'class': 'WB_name', 
                    'node-type': 'feed_list_originNick'
                })
                text_a = div.find('div', attrs={
                    'class': 'WB_text',
                    'node-type': 'feed_list_reason'
                })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (
                        name_a.text,
                        text_a.text
                    )
            #mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
            #ci
            #
            temp = parse(div.select('a.S_link2.WB_time')[0]['title'])
            tempstring = temp.strftime("%Y-%m-%d-%H-%M-%S")
            list=tempstring.split('-')
            tempyear=list[0]
            tempmonth=list[1]
            tempday=list[2]
            temphour=list[3]
            tempmin=list[4]
            tempsec=list[5]
            temptime=time.mktime(datetime(int(tempyear),int(tempmonth),int(tempday),int(temphour),int(tempmin),int(tempsec)).timetuple())
            print temptime
            
            timevalue=open("D:\\09Limited_buffer\\earlywarningbyci\\cola\\contrib\\weibo\\timevalue.txt","r")
            time_re=timevalue.readline()
            timevalue.close()
            list=time_re.split()
            starttime=list[0]
            endtime=list[1]
            print starttime
            temptime=round(float(temptime))
            starttime=round(float(starttime))
            endtime=round(float(endtime))
            if temptime>=starttime and temptime<=endtime:
                mblog.created = temp
                #timeok = True
                print "------OKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOK-----"
            else:
                if temptime<starttime:
                    print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
                    time.sleep(5)
                    return [], []
                #continue
            #
            # 
            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
                finished = True
                break

            func_div = div.find_all('div', 'WB_func')[-1]
            action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)
            
            likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).text
            likes = likes.strip('(').strip(')')
            likes = 0 if len(likes) == 0 else int(likes)
            mblog.n_likes = likes
            forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
            comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))
                
            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo
            
            # fetch forwards and comments
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)
            
            mblog.save()
        
        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            del params['max_id']
        self.logger.debug('parse %s finish' % url)
                
        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []
        
        next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []