def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    try:
        br = self.opener.browse_open(url)
    except Exception as e:
        print(e)
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        # after the pause, re-raise: `br` would be unbound below otherwise
        raise
    try:
        jsn = json.loads(br.response().read())
    except ValueError:
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)
    try:
        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']
    except KeyError:
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))
    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        # drop decorative divs/spans so only the plain text remains
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    # print(u'comments on weibo: ' + mblog.content)
    if url.startswith('http://weibo.com/aj/comment'):
        counter_type = 'comment'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            # print(u'weibo comment: ' + comment.content)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        counter_type = 'forward'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                'a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        counter_type = 'like'
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)
    mblog.save()
    # self.logger.debug('parse %s finish' % url)

    # count one processed comment/forward/like list page
    if counter_type is not None:
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

    if current_page >= n_pages:
        return
    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    yield next_page

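# The parse() variants in this section all lean on a urldecode() helper that turns
# the query string of a URL (or a bare '?key=value' string) into a dict, which is
# then merged and re-encoded with urllib.urlencode() to build the next-page URL.
# The helper itself is not shown in this section; the sketch below is a minimal
# stand-in written against that assumed behaviour, not the project's actual code.
import urlparse


def urldecode(link):
    # keep only the part after '?' (if any) and parse it into {key: value},
    # taking the first value when a parameter repeats
    query = link.split('?', 1)[-1]
    return dict((k, v[0]) for k, v in urlparse.parse_qs(query).items())
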
def parse(self, url=None):
    if self.bundle.exists is False:
        return [], []
    url = url or self.url
    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)
    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']
    if not self.check(url, br):
        return [], []

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))
    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            comment = Comment(uid=self.uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward = Forward(uid=self.uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    try:
        mblog.save()
        self.logger.debug('parse %s finish' % url)
    except ValidationError as e:
        return self._error(url, e)

def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url br = None jsn = None try: br = self.opener.browse_open(url) self.logger.debug("load %s finish" % url) jsn = json.loads(br.response().read()) except (ValueError, URLError) as e: return self._error(url, e) soup = beautiful_soup(jsn["data"]["html"]) current_page = jsn["data"]["page"]["pagenum"] n_pages = jsn["data"]["page"]["totalpage"] if not self.check(url, br): return [], [] decodes = urldecode(url) mid = decodes.get("id", decodes.get("mid")) mblog = self.bundle.current_mblog if mblog is None or mblog.mid != mid: try: mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) mblog.save() def set_instance(instance, dl): instance.avatar = dl.find("dt").find("img")["src"] date = dl.find("dd").find("span", attrs={"class": "S_txt2"}).text date = date.strip().strip("(").strip(")") instance.created = self.parse_datetime(date) for div in dl.find_all("div"): div.extract() for span in dl.find_all("span"): span.extract() instance.content = dl.text.strip() if url.startswith("http://weibo.com/aj/comment"): dls = soup.find_all("dl", mid=True) for dl in dls: comment = Comment(uid=self.uid) set_instance(comment, dl) mblog.comments.append(comment) elif url.startswith("http://weibo.com/aj/mblog/info"): dls = soup.find_all("dl", mid=True) for dl in dls: forward = Forward(uid=self.uid, mid=dl["mid"]) set_instance(forward, dl) mblog.forwards.append(forward) elif url.startswith("http://weibo.com/aj/like"): lis = soup.find_all("li", uid=True) for li in lis: like = Like(uid=li["uid"]) like.avatar = li.find("img")["src"] mblog.likes.append(like) try: mblog.save() self.logger.debug("parse %s finish" % url) except ValidationError, e: return self._error(url, e)
def parse(self, url=None):
    if self.bundle.exists is False:
        return [], []
    url = url or self.url
    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)
    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']
    if not self.check(url, br):
        return [], []

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))
    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            comment = Comment(uid=self.uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward = Forward(uid=self.uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    try:
        mblog.save()
        self.logger.debug('parse %s finish' % url)
    except ValidationError as e:
        return self._error(url, e)

def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    br = self.opener.browse_open(url)
    try:
        jsn = json.loads(br.response().read())
    except ValueError:
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)
    try:
        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']
    except KeyError:
        raise FetchBannedError('fetch banned by weibo server')
    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))
    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    if url.startswith('http://weibo.com/aj/comment'):
        counter_type = 'comment'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        counter_type = 'forward'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                'a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        counter_type = 'like'
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)
    mblog.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed forward or comment or like list url
    if counter_type is not None:
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

    if current_page >= n_pages:
        return
    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    yield next_page

def parse(self, url=None):
    if self.bundle.exists is False:
        return [], []
    url = url or self.url
    br = self.opener.browse_open(url)
    jsn = json.loads(br.response().read())
    soup = BeautifulSoup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))
    # find the status this comment/forward/like list belongs to
    mblogs = weibo_user.statuses
    mblog = None
    for m in mblogs:
        if m.mid == mid:
            mblog = m
            break
    if mblog is None:
        mblog = MicroBlog(mid=mid)
        weibo_user.statuses.append(mblog)

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find('span', attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            comment = Comment(uid=self.uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward = Forward(uid=self.uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)
    weibo_user.save()

    if current_page >= n_pages:
        return [], []
    # follow the "next page" button if it is present
    params = urldecode(url)
    next_page = soup.find('a', attrs={'class': 'btn_page_next'})
    if next_page is not None:
        try:
            next_page_str = next_page['action-data']
        except KeyError:
            next_page_str = next_page.find('span')['action-data']
        new_params = urldecode('?%s' % next_page_str)
        params.update(new_params)
        params['__rnd'] = int(time.time() * 1000)
        next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
        return [next_page, ], []
    return [], []

def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    br = self.opener.browse_open(url)
    jsn = json.loads(br.response().read())
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(jsn["data"]["html"])
    current_page = jsn["data"]["page"]["pagenum"]
    n_pages = jsn["data"]["page"]["totalpage"]
    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get("id", decodes.get("mid"))
    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find("dt").find("img")["src"]
        date = dl.find("dd").find(attrs={"class": "S_txt2"}).text
        date = date.strip().strip("(").strip(")")
        instance.created = self.parse_datetime(date)
        for div in dl.find_all("div"):
            div.extract()
        for span in dl.find_all("span"):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    if url.startswith("http://weibo.com/aj/comment"):
        counter_type = "comment"
        dls = soup.find_all("dl", mid=True)
        for dl in dls:
            uid = dl.find("a", usercard=True)["usercard"].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith("http://weibo.com/aj/mblog/info"):
        counter_type = "forward"
        dls = soup.find_all("dl", mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                "a", attrs={"action-type": re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode("?%s" % forward_again_a["action-data"])["uid"]
            forward = Forward(uid=uid, mid=dl["mid"])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith("http://weibo.com/aj/like"):
        counter_type = "like"
        lis = soup.find_all("li", uid=True)
        for li in lis:
            like = Like(uid=li["uid"])
            like.avatar = li.find("img")["src"]
            mblog.likes.append(like)
    mblog.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed forward or comment or like list url
    if counter_type is not None:
        self.counter.inc("processed_%s_list_page" % counter_type, 1)

    if current_page >= n_pages:
        return
    params = urldecode(url)
    new_params = urldecode("?page=%s" % (current_page + 1))
    params.update(new_params)
    params["__rnd"] = int(time.time() * 1000)
    next_page = "%s?%s" % (url.split("?")[0], urllib.urlencode(params))
    yield next_page

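# -*- coding: utf-8 -*-
# Every variant above feeds the stripped "(...)" date text into self.parse_datetime(),
# which is outside this section. The standalone sketch below is only a rough stand-in:
# it assumes the usual relative formats seen in these lists (u'N分钟前', u'今天 HH:MM',
# u'M月D日 HH:MM' and full u'YYYY-MM-DD HH:MM' stamps) and is not the project's real
# implementation.
import re
from datetime import datetime, timedelta


def parse_datetime(dt_str):
    dt_str = dt_str.strip()
    now = datetime.now()
    m = re.match(ur'(\d+)分钟前', dt_str)
    if m:  # "N minutes ago"
        return now - timedelta(minutes=int(m.group(1)))
    m = re.match(ur'今天\s*(\d{1,2}):(\d{2})', dt_str)
    if m:  # "today HH:MM"
        return now.replace(hour=int(m.group(1)), minute=int(m.group(2)),
                           second=0, microsecond=0)
    m = re.match(ur'(\d{1,2})月(\d{1,2})日\s*(\d{1,2}):(\d{2})', dt_str)
    if m:  # "M月D日 HH:MM", assumed to be within the current year
        return datetime(now.year, int(m.group(1)), int(m.group(2)),
                        int(m.group(3)), int(m.group(4)))
    # fall back to a full timestamp
    return datetime.strptime(dt_str, '%Y-%m-%d %H:%M')
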
# fragment of another parse() variant: per-weibo forward/like bookkeeping on the
# bundle, with a fetch_like_limit cap that stops fetching likes once it is reached
            mblog.forwards.append(forward)
            self.bundle.fetched_last_forward_id = mblog.mid
            self.bundle.fetched_weibo_forward_num += 1
    elif url.startswith('http://weibo.com/aj/like'):
        lis = soup.find_all('li', uid=True)
        for li in lis:
            if fetch_like_limit > 0 and self.bundle.fetched_weibo_like_num >= fetch_like_limit:
                # limit reached: reset the counter, persist what we have and stop
                self.bundle.fetched_weibo_like_num = 0
                try:
                    mblog.save()
                    self.logger.debug('parse %s finish' % url)
                except ValidationError as e:
                    return self._error(url, e)
                return [], []
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            if fetch_like_limit > 0 and self.bundle.fetched_last_like_id != mblog.mid:
                # first like of a new weibo: restart the per-weibo counter
                self.bundle.fetched_weibo_like_num = 0
            mblog.likes.append(like)
            self.bundle.fetched_last_like_id = mblog.mid
            self.bundle.fetched_weibo_like_num += 1
    try:
        mblog.save()
        self.logger.debug('parse %s finish' % url)
    except ValidationError as e:
        return self._error(url, e)
    if current_page >= n_pages:
        return [], []