def get_weibo(self, mid, keyword):
    try:
        weibo = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(keyword=keyword))
        return weibo, True
    except DoesNotExist:
        weibo = MicroBlog(mid=mid, keyword=keyword)
        weibo.save()
        return weibo, False
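# The method above hand-rolls MongoEngine's get-or-create pattern. A
# minimal standalone sketch of the same idea, assuming a configured
# MongoEngine connection; `get_or_create` is a hypothetical helper, not
# part of this module:
from mongoengine import DoesNotExist

def get_or_create(document_cls, **query):
    """Return (document, existed): fetch a match or save a fresh one."""
    try:
        return document_cls.objects.get(**query), True
    except DoesNotExist:
        doc = document_cls(**query)
        doc.save()
        return doc, False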
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    self.logger.debug("load %s finish" % url)
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    params["_t"] = 0
    params["__rnd"] = str(int(time.time() * 1000))
    page = int(params.get("page", 1))
    pre_page = int(params.get("pre_page", 0))
    count = 15
    if "pagebar" not in params:
        params["pagebar"] = "0"
        pre_page += 1
    elif params["pagebar"] == "0":
        params["pagebar"] = "1"
    elif params["pagebar"] == "1":
        del params["pagebar"]
        pre_page = page
        page += 1
        count = 50
    params["count"] = count
    params["page"] = page
    params["pre_page"] = pre_page

    data = json.loads(br.response().read())["data"]
    soup = beautiful_soup(data)

    finished = False
    divs = soup.find_all("div", attrs={"class": "WB_feed_type"}, mid=True)
    max_id = None
    next_urls = []
    for div in divs:
        mid = div["mid"]
        if len(mid) == 0:
            continue
        max_id = mid

        if "end_id" not in params:
            params["end_id"] = mid
        if mid in weibo_user.newest_mids:
            finished = True
            break
        if len(self.bundle.newest_mids) < 3:
            self.bundle.newest_mids.append(mid)

        try:
            mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find("div", attrs={
            "class": "WB_text",
            "node-type": "feed_list_content"
        })
        # replace emoticon images with their title text
        for img in content_div.find_all("img", attrs={"type": "face"}):
            img.replace_with(img["title"])
        mblog.content = content_div.text

        is_forward = div.get("isforward") == "1"
        if is_forward:
            name_a = div.find("a", attrs={
                "class": "WB_name",
                "node-type": "feed_list_originNick"
            })
            text_a = div.find("div", attrs={
                "class": "WB_text",
                "node-type": "feed_list_reason"
            })
            if name_a is not None and text_a is not None:
                mblog.forward = "%s: %s" % (name_a.text, text_a.text)
        mblog.created = parse(div.select("a.S_link2.WB_time")[0]["title"])

        if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
            self.bundle.last_update = mblog.created
        if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
            finished = True
            break

        likes = div.find("a", attrs={"action-type": "feed_list_like"}).text
        likes = likes.strip("(").strip(")")
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.n_likes = likes
        forwards = div.find("a", attrs={"action-type": "feed_list_forward"}).text
        if "(" not in forwards:
            mblog.n_forwards = 0
        else:
            mblog.n_forwards = int(forwards.strip().split("(", 1)[1].strip(")"))
        comments = div.find("a", attrs={"action-type": "feed_list_comment"}).text
        if "(" not in comments:
            mblog.n_comments = 0
        else:
            mblog.n_comments = int(comments.strip().split("(", 1)[1].strip(")"))

        # fetch geo info
        map_info = div.find("div", attrs={"class": "map_data"})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split("-")[0].strip()
            geo_info = urldecode("?" + map_info.find("a")["action-data"])["geo"]
            geo.longtitude, geo.latitude = tuple(
                [float(itm) for itm in geo_info.split(",", 1)])
            mblog.geo = geo

        # fetch forwards, comments and likes
        if fetch_forward or fetch_comment or fetch_like:
            query = {"id": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
            query_str = urllib.urlencode(query)
            if fetch_forward and mblog.n_forwards > 0:
                forward_url = "http://weibo.com/aj/mblog/info/big?%s" % query_str
                next_urls.append(forward_url)
            if fetch_comment and mblog.n_comments > 0:
                comment_url = "http://weibo.com/aj/comment/big?%s" % query_str
                next_urls.append(comment_url)
            if fetch_like and mblog.n_likes > 0:
                query = {"mid": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                like_url = "http://weibo.com/aj/like/big?%s" % query_str
                next_urls.append(like_url)

        mblog.save()

    if "pagebar" in params:
        params["max_id"] = max_id
    else:
        params.pop("max_id", None)
    self.logger.debug("parse %s finish" % url)

    # if there is no next page
    if len(divs) == 0 or finished:
        weibo_user = self.get_weibo_user()
        for mid in self.bundle.newest_mids:
            if mid not in weibo_user.newest_mids:
                weibo_user.newest_mids.append(mid)
        while len(weibo_user.newest_mids) > 3:
            weibo_user.newest_mids.pop()
        weibo_user.last_update = self.bundle.last_update
        weibo_user.save()
        return [], []

    next_urls.append("%s?%s" % (url.split("?")[0], urllib.urlencode(params)))
    return next_urls, []
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug("load %s finish" % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)

    soup = beautiful_soup(jsn["data"]["html"])
    current_page = jsn["data"]["page"]["pagenum"]
    n_pages = jsn["data"]["page"]["totalpage"]
    if not self.check(url, br):
        return [], []

    decodes = urldecode(url)
    mid = decodes.get("id", decodes.get("mid"))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find("dt").find("img")["src"]
        date = dl.find("dd").find("span", attrs={"class": "S_txt2"}).text
        date = date.strip().strip("(").strip(")")
        instance.created = self.parse_datetime(date)
        for div in dl.find_all("div"):
            div.extract()
        for span in dl.find_all("span"):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith("http://weibo.com/aj/comment"):
        dls = soup.find_all("dl", mid=True)
        for dl in dls:
            comment = Comment(uid=self.uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith("http://weibo.com/aj/mblog/info"):
        dls = soup.find_all("dl", mid=True)
        for dl in dls:
            forward = Forward(uid=self.uid, mid=dl["mid"])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith("http://weibo.com/aj/like"):
        lis = soup.find_all("li", uid=True)
        for li in lis:
            like = Like(uid=li["uid"])
            like.avatar = li.find("img")["src"]
            mblog.likes.append(like)

    try:
        mblog.save()
        self.logger.debug("parse %s finish" % url)
    except ValidationError as e:
        return self._error(url, e)

    if current_page >= n_pages:
        return [], []

    params = urldecode(url)
    new_params = urldecode("?page=%s" % (current_page + 1))
    params.update(new_params)
    params["__rnd"] = int(time.time() * 1000)
    next_page = "%s?%s" % (url.split("?")[0], urllib.urlencode(params))
    return [next_page, ], []
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)

    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']
    if not self.check(url, br):
        return [], []

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            comment = Comment(uid=self.uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward = Forward(uid=self.uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    try:
        mblog.save()
        self.logger.debug('parse %s finish' % url)
    except ValidationError as e:
        return self._error(url, e)

    if current_page >= n_pages:
        return [], []

    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    return [next_page, ], []
def save_blog_detail(self, div, mid):
    try:
        mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
    except DoesNotExist:
        mblog = MicroBlog(mid=mid, uid=self.uid)

    content_div = div.find('div', attrs={
        'class': 'WB_text',
        'node-type': 'feed_list_content'
    })
    blog_create_date = parse(
        div.find('a', attrs={'node-type': 'feed_list_item_date'})['title'])
    # replace emoticon images with their title text
    for img in content_div.find_all("img", attrs={'type': 'face'}):
        img.replace_with(img['title'])
    mblog.content = content_div.text

    is_forward = div.get('isforward')
    if is_forward:
        # record the original user and message
        mblog.omid = div['omid']
        tbinfos = div['tbinfo'].split('&')
        mblog.ouid = tbinfos[0].split('=')[1]
        name_a = div.find('a', attrs={
            'class': 'WB_name',
            'node-type': 'feed_list_originNick'
        })
        text_a = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_reason'
        })
        if name_a is not None and text_a is not None:
            mblog.forward = '%s: %s' % (name_a.text, text_a.text)

    mblog.created = blog_create_date
    mblog.last_update = datetime.now()

    func_div = div.find_all('div', attrs={'node-type': 'feed_list_options'})[-1]
    action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

    likes = func_div.find('a', attrs={
        'action-type': action_type_re("like")
    }).find_all('em')[1].text
    likes = likes.strip('(').strip(')').replace(',', '')
    likes = int(likes) if likes and likes.isdigit() else 0
    mblog.n_likes = likes

    forwards = func_div.find('a', attrs={
        'action-type': action_type_re("forward")
    }).find_all('em')[1].text
    forwards = forwards.strip('(').strip(')').replace(',', '')
    mblog.n_forwards = int(forwards) if forwards and forwards.isdigit() else 0

    comments = func_div.find('a', attrs={
        'action-type': action_type_re('comment')
    }).find_all('em')[1].text
    comments = comments.strip('(').strip(')').replace(',', '')
    mblog.n_comments = int(comments) if comments and comments.isdigit() else 0

    # fetch geo info
    map_info = div.find("div", attrs={'class': 'map_data'})
    if map_info is not None:
        geo = Geo()
        geo.location = map_info.text.split('-')[0].strip()
        geo_info = urldecode("?" + map_info.find('a')['action-data'])['geo']
        geo.longtitude, geo.latitude = tuple(
            [float(itm) for itm in geo_info.split(',', 1)])
        mblog.geo = geo

    # has_video
    div_video = div.find('div', attrs={
        'node-type': 'fl_h5_video_disp'
    }) or div.find('span', attrs={'class': 'icon_playvideo'})
    mblog.has_video = True if div_video else False

    mblog.save()
    return mblog
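# save_blog_detail matches action links with a small regex so that both
# the older 'feed_list_*' markup and the newer 'fl_*' markup are
# accepted. A quick standalone check of that pattern:
import re

action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

assert action_type_re('like').match('feed_list_like') is not None
assert action_type_re('like').match('fl_like') is not None
assert action_type_re('like').match('fl_unlike') is None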
def parse(self, url=None):
    if self.bundle.exists is False:
        return

    url = url or self.url
    br = self.opener.browse_open(url)
    try:
        jsn = json.loads(br.response().read())
    except ValueError:
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)
    try:
        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']
    except KeyError:
        raise FetchBannedError('fetch banned by weibo server')
    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    if url.startswith('http://weibo.com/aj/comment'):
        counter_type = 'comment'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        counter_type = 'forward'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                'a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        counter_type = 'like'
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    mblog.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed forward or comment or like list url
    if counter_type is not None:
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

    if current_page >= n_pages:
        return

    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    yield next_page
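# The paging tail above builds the next-page URL by decoding the current
# query string, bumping 'page', and re-encoding. A standalone sketch
# under Python 2, with a made-up mid; this local urldecode stands in for
# the project's helper of the same name:
import time
import urllib
import urlparse

def urldecode(link):
    return dict(urlparse.parse_qsl(urlparse.urlparse(link).query))

url = 'http://weibo.com/aj/comment/big?id=3581638392011743&page=1'
params = urldecode(url)
params.update(urldecode('?page=%s' % (int(params['page']) + 1)))
params['__rnd'] = int(time.time() * 1000)
next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
# next_page is now .../big?id=...&page=2&__rnd=<timestamp>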
def parse(self, url=None):
    if self.bundle.exists is False:
        return

    url = url or self.url
    params = urldecode(url)
    try:
        br = self.opener.browse_open(url)
    except URLError:
        raise FetchBannedError()
    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    try:
        data = json.loads(br.response().read())['data']
    except (ValueError, KeyError):
        raise FetchBannedError('fetch banned by weibo server')
    soup = beautiful_soup(data)

    finished = False
    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid

        blog_create_date = parse(div.select('a.S_link2.WB_time')[0]['title'])
        # skip all following blogs if the create date is earlier than the
        # effective start date
        if (blog_create_date - effective_start_date).days < 0:
            self.logger.info(
                '%s: blogs synced up after %s' %
                (self.uid, effective_start_date.strftime('%Y%m%d')))
            finished = True
            break

        if 'end_id' not in params:
            params['end_id'] = mid
        # skip
        #if weibo_user.newest_mids and not mid in weibo_user.newest_mids:
        #    self.logger.info("%s: reach earliest blog %s" % (self.uid, mid))
        #    finished = True
        #    break
        if len(self.bundle.newest_mids) < 3:
            self.bundle.newest_mids.append(mid)

        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_content'
        })
        for img in content_div.find_all("img", attrs={'type': 'face'}):
            img.replace_with(img['title'])
        mblog.content = content_div.text

        is_forward = div.get('isforward')
        if is_forward:
            # record the original user and message
            mblog.omid = div['omid']
            tbinfos = div['tbinfo'].split('&')
            mblog.ouid = tbinfos[0].split('=')[1]
            name_a = div.find('a', attrs={
                'class': 'WB_name',
                'node-type': 'feed_list_originNick'
            })
            text_a = div.find('div', attrs={
                'class': 'WB_text',
                'node-type': 'feed_list_reason'
            })
            if name_a is not None and text_a is not None:
                mblog.forward = '%s: %s' % (name_a.text, text_a.text)

        mblog.created = blog_create_date
        mblog.last_update = datetime.now()

        func_div = div.find_all('div', 'WB_func')[-1]
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        likes = func_div.find('a', attrs={
            'action-type': action_type_re("like")
        }).text
        likes = likes.strip('(').strip(')')
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.n_likes = likes

        forwards = func_div.find('a', attrs={
            'action-type': action_type_re("forward")
        }).text
        if '(' not in forwards:
            mblog.n_forwards = 0
        else:
            mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))

        comments = func_div.find('a', attrs={
            'action-type': action_type_re('comment')
        }).text
        if '(' not in comments:
            mblog.n_comments = 0
        else:
            mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))

        # fetch geo info
        map_info = div.find("div", attrs={'class': 'map_data'})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split('-')[0].strip()
            geo_info = urldecode("?" + map_info.find('a')['action-data'])['geo']
            geo.longtitude, geo.latitude = tuple(
                [float(itm) for itm in geo_info.split(',', 1)])
            mblog.geo = geo

        # has_video
        div_video = div.find(
            'div', attrs={'node-type': 'fl_h5_video_disp'}) or div.find(
                'span', attrs={'class': 'icon_playvideo'})
        mblog.has_video = True if div_video else False

        mblog.save()
        self.counter.inc('processed_weibo_posts', 1)

        # fetch forwards and comments
        if self.uid in starts:
            query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
            query_str = urllib.urlencode(query)
            if fetch_forward and mblog.n_forwards > 0:
                forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                yield forward_url
            if fetch_comment and mblog.n_comments > 0:
                comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                yield comment_url
            if fetch_like and mblog.n_likes > 0:
                query = {'mid': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                yield like_url

    if 'pagebar' in params:
        params['max_id'] = max_id
    elif 'max_id' in params:
        del params['max_id']
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed weibo list url
    self.counter.inc('processed_weibo_list_page', 1)

    # if there is no next page
    if len(divs) == 0 or finished:
        weibo_user = self.get_weibo_user()
        for mid in self.bundle.newest_mids:
            if mid not in weibo_user.newest_mids:
                weibo_user.newest_mids.append(mid)
        while len(weibo_user.newest_mids) > 3:
            weibo_user.newest_mids.pop()
        weibo_user.last_update = self.bundle.last_update
        weibo_user.save()
        return

    yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
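# The list parsers above request each visual page of the profile feed in
# three AJAX calls: first with pagebar=0, then pagebar=1, then with
# pagebar dropped and the page number advanced (count jumps from 15 to
# 50 on the final call). A minimal standalone sketch of that parameter
# cycle, extracted from the logic above:
def next_feed_params(params):
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params.update({'count': count, 'page': page, 'pre_page': pre_page})
    return params

p = {}
for _ in range(3):
    p = next_feed_params(p)
assert p == {'count': 50, 'page': 2, 'pre_page': 1}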
def parse(self, url=None):
    if self.bundle.exists == False:
        return

    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = params.get('pre_page', 1)
    if 'pagebar' not in params:
        params['pagebar'] = '0'
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
    count = 15
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    data = json.loads(br.response().read())['data']
    soup = BeautifulSoup(data)

    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid

        if 'end_id' not in params:
            params['end_id'] = mid
        if weibo_user.newest_mid is not None and \
                weibo_user.newest_mid == mid:
            break

        mblog = MicroBlog(mid=mid)
        mblog.content = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_content'
        }).text

        is_forward = div.get('isforward') == '1'
        if is_forward:
            name_a = div.find('a', attrs={
                'class': 'WB_name',
                'node-type': 'feed_list_originNick'
            })
            text_a = div.find('div', attrs={
                'class': 'WB_text',
                'node-type': 'feed_list_reason'
            })
            if name_a is not None and text_a is not None:
                mblog.forward = '%s: %s' % (name_a.text, text_a.text)
        mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

        likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
        likes = likes.strip('(').strip(')')
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.likes = likes
        forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
        if '(' not in forwards:
            mblog.forwards = 0
        else:
            mblog.forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
        comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
        if '(' not in comments:
            mblog.comments = 0
        else:
            mblog.comments = int(comments.strip().split('(', 1)[1].strip(')'))

        weibo_user.statuses.append(mblog)

    params['max_id'] = max_id

    # if there is no next page
    if len(divs) < count:
        weibo_user.newest_mid = params['end_id']
        weibo_user.save()
        return [], []

    weibo_user.save()
    return ['%s?%s' % (url.split('?')[0], urllib.urlencode(params)), ], []
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    data = json.loads(br.response().read())['data']
    soup = beautiful_soup(data)

    finished = False
    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    next_urls = []
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid

        if 'end_id' not in params:
            params['end_id'] = mid
        if mid in weibo_user.newest_mids:
            finished = True
            break
        if len(self.bundle.newest_mids) < 3:
            self.bundle.newest_mids.append(mid)

        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_content'
        })
        mblog.content = content_div.text
        # Links
        for content_a in content_div.find_all(
                'a', attrs={'action-type': 'feed_list_url'}):
            href = content_a['href']
            if href not in mblog.links:
                mblog.links.append(href)
        # tags
        tags_div = content_div.find('div', attrs={'class': 'wTablist2'})
        if tags_div is not None:
            for tag_a in tags_div.find_all('a'):
                tag = tag_a.text.strip()
                if len(tag) > 0 and tag not in mblog.tags:
                    mblog.tags.append(tag)

        is_forward = div.get('isforward') == '1'
        if is_forward:
            mblog.omid = div['omid']
        mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

        if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
            self.bundle.last_update = mblog.created
        if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
            finished = True
            break

        likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
        likes = likes.strip('(').strip(')')
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.n_likes = likes
        forwards = div.find('a', attrs={
            'action-type': 'feed_list_forward'
        }).text
        if '(' not in forwards:
            mblog.n_forwards = 0
        else:
            mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
        comments = div.find('a', attrs={
            'action-type': 'feed_list_comment'
        }).text
        if '(' not in comments:
            mblog.n_comments = 0
        else:
            mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))

        # fetch forwards and comments
        if fetch_forward or fetch_comment or fetch_like:
            query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
            query_str = urllib.urlencode(query)
            if fetch_forward and mblog.n_forwards > 0:
                forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                next_urls.append(forward_url)
            if fetch_comment and mblog.n_comments > 0:
                comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                next_urls.append(comment_url)
            if fetch_like and mblog.n_likes > 0:
                query = {'mid': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                next_urls.append(like_url)

        mblog.save()

    if 'pagebar' in params:
        params['max_id'] = max_id
    else:
        params.pop('max_id', None)
    self.logger.debug('parse %s finish' % url)

    # if there is no next page
    if len(divs) == 0 or finished:
        weibo_user = self.get_weibo_user()
        for mid in self.bundle.newest_mids:
            if mid not in weibo_user.newest_mids:
                weibo_user.newest_mids.append(mid)
        while len(weibo_user.newest_mids) > 3:
            weibo_user.newest_mids.pop()
        weibo_user.last_update = self.bundle.last_update
        weibo_user.save()
        return [], []

    next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
    return next_urls, []
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    data = json.loads(br.response().read())['data']
    soup = beautiful_soup(data)

    finished = False
    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    next_urls = []
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid

        if 'end_id' not in params:
            params['end_id'] = mid
        if mid in weibo_user.newest_mids:
            finished = True
            break
        if len(self.bundle.newest_mids) < 3:
            self.bundle.newest_mids.append(mid)

        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_content'
        })
        mblog.content = content_div.text
        # Links
        for content_a in content_div.find_all(
                'a', attrs={'action-type': 'feed_list_url'}):
            href = content_a['href']
            if href not in mblog.links:
                mblog.links.append(href)
        # tags
        tags_div = content_div.find('div', attrs={'class': 'wTablist2'})
        if tags_div is not None:
            for tag_a in tags_div.find_all('a'):
                tag = tag_a.text.strip()
                if len(tag) > 0 and tag not in mblog.tags:
                    mblog.tags.append(tag)

        is_forward = div.get('isforward') == '1'
        if is_forward:
            mblog.omid = div['omid']
        mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

        if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
            self.bundle.last_update = mblog.created
        if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
            finished = True
            break

        likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
        likes = likes.strip('(').strip(')')
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.n_likes = likes
        forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text
        if '(' not in forwards:
            mblog.n_forwards = 0
        else:
            mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
        comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text
        if '(' not in comments:
            mblog.n_comments = 0
        else:
            mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))

        # fetch forwards and comments
        if fetch_forward or fetch_comment or fetch_like:
            query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
            query_str = urllib.urlencode(query)
            if fetch_forward and mblog.n_forwards > 0:
                forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                next_urls.append(forward_url)
            if fetch_comment and mblog.n_comments > 0:
                comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                next_urls.append(comment_url)
            if fetch_like and mblog.n_likes > 0:
                query = {'mid': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                next_urls.append(like_url)

        mblog.save()

    if 'pagebar' in params:
        params['max_id'] = max_id
    else:
        params.pop('max_id', None)
    self.logger.debug('parse %s finish' % url)

    # if there is no next page
    if len(divs) == 0 or finished:
        weibo_user = self.get_weibo_user()
        for mid in self.bundle.newest_mids:
            if mid not in weibo_user.newest_mids:
                weibo_user.newest_mids.append(mid)
        while len(weibo_user.newest_mids) > 3:
            weibo_user.newest_mids.pop()
        weibo_user.last_update = self.bundle.last_update
        weibo_user.save()
        return [], []

    next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
    return next_urls, []
def parse(self, url=None):
    if self.bundle.exists == False:
        return

    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()
    statuses = weibo_user.statuses

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = params.get('pre_page', 1)
    if 'pagebar' not in params:
        params['pagebar'] = '0'
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
    count = 15
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    data = json.loads(br.response().read())['data']
    soup = BeautifulSoup(data)

    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid

        if 'end_id' not in params:
            params['end_id'] = mid
        if weibo_user.newest_mid is not None and \
                weibo_user.newest_mid == mid:
            break

        mblog = MicroBlog(mid=mid)
        mblog.content = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_content'
        }).text

        is_forward = div.get('isforward') == '1'
        if is_forward:
            mblog.forward = '%s: %s' % (
                div.find('a', attrs={
                    'class': 'WB_name',
                    'node-type': 'feed_list_originNick'
                }).text,
                div.find('div', attrs={
                    'class': 'WB_text',
                    'node-type': 'feed_list_reason'
                }).text)
        mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

        likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text
        likes = likes.strip('(').strip(')')
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.likes = likes
        forwards = div.find('a', attrs={
            'action-type': 'feed_list_forward'
        }).text
        if '(' not in forwards:
            mblog.forwards = 0
        else:
            mblog.forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
        comments = div.find('a', attrs={
            'action-type': 'feed_list_comment'
        }).text
        if '(' not in comments:
            mblog.comments = 0
        else:
            mblog.comments = int(comments.strip().split('(', 1)[1].strip(')'))

        statuses.append(mblog)

    params['max_id'] = max_id

    # if there is no next page
    if len(divs) < count:
        weibo_user.newest_mid = params['end_id']
        weibo_user.save()
        return [], []

    weibo_user.save()
    return ['%s?%s' % (url.split('?')[0], urllib.urlencode(params)), ], []
def parse(self, url=None):
    if self.bundle.exists is False:
        return

    url = url or self.url
    br = self.opener.browse_open(url)
    jsn = json.loads(br.response().read())
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(jsn["data"]["html"])
    current_page = jsn["data"]["page"]["pagenum"]
    n_pages = jsn["data"]["page"]["totalpage"]
    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get("id", decodes.get("mid"))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find("dt").find("img")["src"]
        date = dl.find("dd").find(attrs={"class": "S_txt2"}).text
        date = date.strip().strip("(").strip(")")
        instance.created = self.parse_datetime(date)
        for div in dl.find_all("div"):
            div.extract()
        for span in dl.find_all("span"):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    if url.startswith("http://weibo.com/aj/comment"):
        counter_type = "comment"
        dls = soup.find_all("dl", mid=True)
        for dl in dls:
            uid = dl.find("a", usercard=True)["usercard"].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith("http://weibo.com/aj/mblog/info"):
        counter_type = "forward"
        dls = soup.find_all("dl", mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                "a", attrs={"action-type": re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode("?%s" % forward_again_a["action-data"])["uid"]
            forward = Forward(uid=uid, mid=dl["mid"])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith("http://weibo.com/aj/like"):
        counter_type = "like"
        lis = soup.find_all("li", uid=True)
        for li in lis:
            like = Like(uid=li["uid"])
            like.avatar = li.find("img")["src"]
            mblog.likes.append(like)

    mblog.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed forward or comment or like list url
    if counter_type is not None:
        self.counter.inc("processed_%s_list_page" % counter_type, 1)

    if current_page >= n_pages:
        return

    params = urldecode(url)
    new_params = urldecode("?page=%s" % (current_page + 1))
    params.update(new_params)
    params["__rnd"] = int(time.time() * 1000)
    next_page = "%s?%s" % (url.split("?")[0], urllib.urlencode(params))
    yield next_page
def parse(self, url=None):
    if self.bundle.exists is False:
        return

    url = url or self.url
    try:
        br = self.opener.browse_open(url)
    except Exception as e:
        print(e)
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    try:
        jsn = json.loads(br.response().read())
    except ValueError:
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)
    try:
        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']
    except KeyError:
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    #print(u'comments of weibo: ' + mblog.content)
    if url.startswith('http://weibo.com/aj/comment'):
        counter_type = 'comment'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            #print(u'weibo comment: ' + comment.content)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        counter_type = 'forward'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                'a', attrs={
                    'action-type': re.compile("^(feed_list|fl)_forward$")
                })
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        counter_type = 'like'
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    mblog.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed forward or comment or like list url
    if counter_type is not None:
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

    if current_page >= n_pages:
        return

    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    yield next_page
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    data = json.loads(br.response().read())['data']
    soup = beautiful_soup(data)

    finished = False
    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    next_urls = []
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid

        if 'end_id' not in params:
            params['end_id'] = mid
        if mid in weibo_user.newest_mids:
            finished = True
            break
        if len(self.bundle.newest_mids) < 3:
            self.bundle.newest_mids.append(mid)

        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_content'
        })
        for img in content_div.find_all("img", attrs={'type': 'face'}):
            img.replace_with(img['title'])
        mblog.content = content_div.text

        is_forward = div.get('isforward') == '1'
        if is_forward:
            mblog.omid = div['omid']
            name_a = div.find('a', attrs={
                'class': 'WB_name',
                'node-type': 'feed_list_originNick'
            })
            text_a = div.find('div', attrs={
                'class': 'WB_text',
                'node-type': 'feed_list_reason'
            })
            if name_a is not None and text_a is not None:
                mblog.forward = '%s: %s' % (name_a.text, text_a.text)

        #mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])
        # only keep blogs whose create time falls inside the window
        # configured in timevalue.txt (start and end as epoch seconds)
        temp = parse(div.select('a.S_link2.WB_time')[0]['title'])
        tempstring = temp.strftime("%Y-%m-%d-%H-%M-%S")
        parts = tempstring.split('-')
        temptime = time.mktime(datetime(
            int(parts[0]), int(parts[1]), int(parts[2]),
            int(parts[3]), int(parts[4]), int(parts[5])).timetuple())
        print temptime
        timevalue = open("D:\\09Limited_buffer\\earlywarningbyci\\cola\\contrib\\weibo\\timevalue.txt", "r")
        time_re = timevalue.readline()
        timevalue.close()
        tokens = time_re.split()
        starttime = tokens[0]
        endtime = tokens[1]
        print starttime
        temptime = round(float(temptime))
        starttime = round(float(starttime))
        endtime = round(float(endtime))
        if temptime >= starttime and temptime <= endtime:
            mblog.created = temp
            #timeok = True
            print "------blog create time is inside the window, keep it------"
        else:
            if temptime < starttime:
                print "!!!! blog was created before the start time, stop here !!!!"
                time.sleep(5)
                return [], []
            #continue

        if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
            self.bundle.last_update = mblog.created
        if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
            finished = True
            break

        func_div = div.find_all('div', 'WB_func')[-1]
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).text
        likes = likes.strip('(').strip(')')
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.n_likes = likes
        forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).text
        if '(' not in forwards:
            mblog.n_forwards = 0
        else:
            mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
        comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).text
        if '(' not in comments:
            mblog.n_comments = 0
        else:
            mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))

        # fetch geo info
        map_info = div.find("div", attrs={'class': 'map_data'})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split('-')[0].strip()
            geo_info = urldecode("?" + map_info.find('a')['action-data'])['geo']
            geo.longtitude, geo.latitude = tuple(
                [float(itm) for itm in geo_info.split(',', 1)])
            mblog.geo = geo

        # fetch forwards and comments
        if fetch_forward or fetch_comment or fetch_like:
            query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
            query_str = urllib.urlencode(query)
            if fetch_forward and mblog.n_forwards > 0:
                forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                next_urls.append(forward_url)
            if fetch_comment and mblog.n_comments > 0:
                comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                next_urls.append(comment_url)
            if fetch_like and mblog.n_likes > 0:
                query = {'mid': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                next_urls.append(like_url)

        mblog.save()

    if 'pagebar' in params:
        params['max_id'] = max_id
    else:
        params.pop('max_id', None)
    self.logger.debug('parse %s finish' % url)

    # if there is no next page
    if len(divs) == 0 or finished:
        weibo_user = self.get_weibo_user()
        for mid in self.bundle.newest_mids:
            if mid not in weibo_user.newest_mids:
                weibo_user.newest_mids.append(mid)
        while len(weibo_user.newest_mids) > 3:
            weibo_user.newest_mids.pop()
        weibo_user.last_update = self.bundle.last_update
        weibo_user.save()
        return [], []

    next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
    return next_urls, []
def parse(self, url=None):
    if self.bundle.exists is False:
        return

    url = url or self.url
    params = urldecode(url)
    try:
        br = self.opener.browse_open(url)
    except Exception as e:
        print(e)
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)
    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    try:
        data = json.loads(br.response().read())['data']
    except Exception as e:
        print(e)
        print('Resting for 10 minutes!')
        time.sleep(60 * 10)
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(data)

    finished = False
    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid

        if 'end_id' not in params:
            params['end_id'] = mid
        if mid in weibo_user.newest_mids:
            finished = True
            break
        if len(self.bundle.newest_mids) < 3:
            self.bundle.newest_mids.append(mid)

        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
            continue  # assume this weibo has already been crawled
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find('div', attrs={
            'class': 'WB_text',
            'node-type': 'feed_list_content'
        })
        for img in content_div.find_all("img", attrs={'type': 'face'}):
            img.replace_with(img['title'])
        mblog.content = content_div.text
        #print(u'weibo content: ' + mblog.content)

        is_forward = div.get('isforward') == '1'
        if is_forward:
            mblog.omid = div['omid']
            name_a = div.find('a', attrs={
                'class': 'WB_name',
                'node-type': 'feed_list_originNick'
            })
            text_a = div.find('div', attrs={
                'class': 'WB_text',
                'node-type': 'feed_list_reason'
            })
            if name_a is not None and text_a is not None:
                mblog.forward = '%s: %s' % (name_a.text, text_a.text)
        mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

        if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
            self.bundle.last_update = mblog.created
        if weibo_user.last_update is not None and \
                mblog.created <= weibo_user.last_update:
            finished = True
            break

        func_div = div.find_all('div', 'WB_func')[-1]
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        likes = func_div.find('a', attrs={
            'action-type': action_type_re("like")
        }).text
        likes = likes.strip('(').strip(')')
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.n_likes = likes

        forwards = func_div.find('a', attrs={
            'action-type': action_type_re("forward")
        }).text
        if '(' not in forwards:
            mblog.n_forwards = 0
        else:
            mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))

        comments = func_div.find('a', attrs={
            'action-type': action_type_re('comment')
        }).text
        if '(' not in comments:
            mblog.n_comments = 0
        else:
            mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))

        # fetch geo info
        map_info = div.find("div", attrs={'class': 'map_data'})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split('-')[0].strip()
            geo_info = urldecode("?" + map_info.find('a')['action-data'])['geo']
            geo.longtitude, geo.latitude = tuple(
                [float(itm) for itm in geo_info.split(',', 1)])
            mblog.geo = geo

        # fetch forwards and comments
        if fetch_forward or fetch_comment or fetch_like:
            query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
            query_str = urllib.urlencode(query)
            if fetch_forward and mblog.n_forwards > 0:
                forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                yield forward_url
            # only fetch comments for weibos with more than
            # fetch_n_comments comments
            if fetch_comment and mblog.n_comments > fetch_n_comments:
                comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                yield comment_url
            if fetch_like and mblog.n_likes > 0:
                query = {'mid': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                yield like_url

        mblog.save()

    if 'pagebar' in params:
        params['max_id'] = max_id
    else:
        params.pop('max_id', None)
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed weibo list url
    self.counter.inc('processed_weibo_list_page', 1)

    # if there is no next page
    if len(divs) == 0 or finished:
        weibo_user = self.get_weibo_user()
        for mid in self.bundle.newest_mids:
            if mid not in weibo_user.newest_mids:
                weibo_user.newest_mids.append(mid)
        while len(weibo_user.newest_mids) > 3:
            weibo_user.newest_mids.pop()
        weibo_user.last_update = self.bundle.last_update
        weibo_user.save()
        return

    yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)

    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']
    if not self.check(url, br):
        return [], []

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date_source = dl.find('dd').find('span', attrs={'class': 'S_txt2'})
        if date_source is not None:
            date = date_source.text
        else:
            date_source = dl.find('dd').find(
                'span', attrs={'class': 'fl'}).find(
                    'em', attrs={'class': 'S_txt2'})
            date = date_source.text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            # stop once the per-weibo comment limit has been reached
            if fetch_comment_limit > 0 and \
                    self.bundle.fetched_weibo_comment_num >= fetch_comment_limit:
                self.bundle.fetched_weibo_comment_num = 0
                try:
                    mblog.save()
                    self.logger.debug('parse %s finish' % url)
                except ValidationError as e:
                    return self._error(url, e)
                return [], []

            link = dl.find('a', attrs={'action-type': 'replycomment'})
            data = dict([l.split('=') for l in link['action-data'].split('&')])
            # reset the per-weibo counter when a new weibo's comments start
            if fetch_comment_limit > 0 and \
                    self.bundle.fetched_last_comment_id != data['mid']:
                self.bundle.fetched_weibo_comment_num = 0

            comment = Comment(uid=data['ouid'], mid=data['mid'])
            set_instance(comment, dl)
            mblog.comments.append(comment)

            self.bundle.fetched_last_comment_id = data['mid']
            self.bundle.fetched_weibo_comment_num += 1
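# The comment loop above recovers the commenter uid and the comment mid
# by splitting the reply link's action-data attribute on '&' and '='.
# A standalone illustration with a made-up attribute value:
action_data = 'ouid=1234567890&mid=3581638392011743&allowForward=1'
data = dict(item.split('=') for item in action_data.split('&'))
assert data['ouid'] == '1234567890'
assert data['mid'] == '3581638392011743'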