# Stdlib imports needed by the parse() versions below (module-level in the
# real file); beautiful_soup, urldecode, FetchBannedError, MicroBlog,
# Comment, Forward, Like, Q, DoesNotExist, URLError and ValidationError come
# from the surrounding project and its ORM layer.
import json
import re
import time
import urllib


def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    try:
        br = self.opener.browse_open(url)
    except Exception as e:
        print(e)
        print('Sleeping for 10 minutes!')
        time.sleep(60 * 10)
        # br is unbound when browse_open fails, so surface the ban here
        raise FetchBannedError('fetch banned by weibo server')
    try:
        jsn = json.loads(br.response().read())
    except ValueError:
        # a non-JSON body usually means weibo is throttling this client
        print('Sleeping for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)
    try:
        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']
    except KeyError:
        print('Sleeping for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    if not self.check(url, br):
        return
    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))
    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(
                Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        # drop toolbars and timestamps so only the body text remains
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    # print(u'comments on microblog: ' + mblog.content)
    if url.startswith('http://weibo.com/aj/comment'):
        counter_type = 'comment'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split('id=', 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            # print(u'microblog comment: ' + comment.content)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        counter_type = 'forward'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                'a',
                attrs={'action-type': re.compile('^(feed_list|fl)_forward$')})
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        counter_type = 'like'
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)
    mblog.save()
    # self.logger.debug('parse %s finish' % url)
    # bump the counter for the comment/forward/like list page just processed
    if counter_type is not None:
        self.counter.inc('processed_%s_list_page' % counter_type, 1)
    if current_page >= n_pages:
        return
    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    yield next_page
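# --- A minimal, self-contained sketch of the next-page URL construction at
# the end of parse() above. next_page_url() is a hypothetical helper written
# only for illustration; urlparse.parse_qsl stands in for the project's
# urldecode(), which is assumed to return the query string as a dict.
import time
import urllib
import urlparse


def next_page_url(url, current_page):
    params = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    params['page'] = str(current_page + 1)
    params['__rnd'] = int(time.time() * 1000)  # cache-busting timestamp
    return '%s?%s' % (url.split('?')[0], urllib.urlencode(params))

# e.g. next_page_url('http://weibo.com/aj/comment?id=123&page=1', 1)
# yields 'http://weibo.com/aj/comment?id=123&page=2&__rnd=...'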
def parse(self, url=None):
    if self.bundle.exists is False:
        return [], []
    url = url or self.url
    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)
    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']
    if not self.check(url, br):
        return [], []
    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))
    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(
                Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            comment = Comment(uid=self.uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward = Forward(uid=self.uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)
    try:
        mblog.save()
        self.logger.debug('parse %s finish' % url)
    except ValidationError as e:
        return self._error(url, e)
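# --- A tiny, runnable illustration of the <dl>-scraping pattern shared by
# both set_instance() helpers above. The HTML is a made-up fixture, not a
# real weibo.com response, and the project's beautiful_soup() is assumed to
# be a thin wrapper around bs4's BeautifulSoup.
from bs4 import BeautifulSoup

html = ('<dl mid="42"><dt><img src="http://example.com/a.jpg"/></dt>'
        '<dd>hello world<span class="S_txt2">(May 1st 12:00)</span>'
        '<div>toolbar</div></dd></dl>')
dl = BeautifulSoup(html, 'html.parser').find('dl')
avatar = dl.find('dt').find('img')['src']
date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
date = date.strip().strip('(').strip(')')   # -> 'May 1st 12:00'
for tag in dl.find_all(['div', 'span']):    # drop decorations so only
    tag.extract()                           # the body text remains
content = dl.text.strip()                   # -> 'hello world'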