def getDatetime(curtime, timeString):
    # Resolve Weibo's relative time strings ("N秒前", "N分钟前", "N小时前",
    # "N天前") into an absolute datetime, counting back from curtime.
    result_time = None
    curtime = datetime.datetime.strptime(curtime, "%Y-%m-%d %H:%M:%S")
    if timeString.find(u'秒前') > -1:
        temp_time = getMatch(timeString, r'^\d+')
        result_time = curtime - datetime.timedelta(seconds=long(temp_time))
    elif timeString.find(u'分钟前') > -1:
        temp_time = getMatch(timeString, r'\d+')
        result_time = curtime - datetime.timedelta(minutes=long(temp_time))
    elif timeString.find(u'小时前') > -1:
        temp_time = getMatch(timeString, r'\d+')
        result_time = curtime - datetime.timedelta(hours=long(temp_time))
    elif timeString.find(u'天前') > -1:
        temp_time = getMatch(timeString, r'\d+')
        result_time = curtime - datetime.timedelta(days=long(temp_time))
    return result_time
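# The scrapers in this module lean on getMatch/getMatchList helpers defined
# elsewhere. A minimal sketch of the assumed behaviour, inferred from usage
# ('(*)' acting as a non-greedy capture group); this is an assumption, not
# the original implementation:
import re

def _getMatch_sketch(text, pattern):
    # Plain regexes (e.g. r'^\d+') are passed in too, so fall back to the
    # whole match when the pattern has no capture group.
    m = re.search(pattern.replace('(*)', '(.*?)'), text, re.S)
    if not m:
        return None
    return m.group(1) if m.groups() else m.group(0)

def _getMatchList_sketch(text, pattern):
    # Same wildcard convention, returning every match.
    return re.findall(pattern.replace('(*)', '(.*?)'), text, re.S)

# Example: getDatetime("2016-01-01 12:00:00", u"5分钟前")
#          -> datetime.datetime(2016, 1, 1, 11, 55)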
def commentInbox(self):
    cr_url = 'http://weibo.com/comment/inbox?&page=1&pids=Pl_Content_Commentlist'
    json_list = []
    tags = False  # set once we reach comments older than self.mlasttime
    comment_url = cr_url
    while True:
        print comment_url
        # Fetch the current page, retrying on network errors.
        while True:
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
                break
            except Exception, e:
                print "Network Exception!!!", e
                time.sleep(5)
                continue
        datas = getMatchList(
            html, '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
        for data in datas:
            photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
            uid = getMatch(data, 'usercard="id=(*)"')
            nickname = getMatch(data, 'page_frame" title="(*)"')
            mid = getMatch(data, '&cid=(*)&')
            timestamp = getMatch(data, '<div class="WB_from S_txt2">(*) 来自')
            if timestamp:
                timestamp = long(getTimeStamp(timestamp))
            else:
                timestamp = 0
            if timestamp <= self.mlasttime:
                # Everything from here on has been crawled already.
                tags = True
                break
            text = getMatch(data, '<div class="WB_text">(*)</div>')
            if text:
                text = extractForHTML(text)
            else:
                text = ''
            r_mid = getMatch(data, 'mid=(*)&')
            r_uid = self.uid
            comment_type = 'receive'
            # Classify the commenter's relation to this account.
            _type = 'stranger'
            type1 = ''
            type2 = ''
            for fljson in self.follow:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type1 = 'follow'
                    break
            for fljson in self.fans:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type2 = 'followed'
                    break
            if type1 and type2:
                _type = 'friend'
            elif type1:
                _type = type1
            elif type2:
                _type = type2
            if uid == r_uid:
                _type = 'self'
            wb_item = {
                'photo_url': photo_url,
                'uid': uid,
                'nick_name': nickname,
                'mid': mid,
                'timestamp': timestamp,
                'text': text,
                'root_mid': r_mid,
                'root_uid': r_uid,
                'weibo_type': _type,
                'comment_type': comment_type,
                'update_time': self.update_time
            }
            wb_json = json.dumps(wb_item)
            json_list.append(wb_json)
        # Pagination: stop once we hit old comments or run out of pages.
        next_pageUrl = getUrlToPattern(html, comment_url,
                                       pattern='page', text_pattern='下一页')
        if tags or not next_pageUrl:
            break
        comment_url = next_pageUrl[0]
    return json_list
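# The follow/fans scan above is repeated verbatim in atMeMicroBlog and
# messages below; it reduces to a small classifier. A standalone sketch
# (name and signature hypothetical), reusing the module's json import:
def _classify_relation_sketch(uid, self_uid, follow_jsons, fans_jsons):
    # follow_jsons/fans_jsons are lists of JSON strings with a 'uid' field,
    # matching self.follow and self.fans above.
    if uid == self_uid:
        return 'self'
    follows = any(json.loads(j)['uid'] == uid for j in follow_jsons)
    fanned = any(json.loads(j)['uid'] == uid for j in fans_jsons)
    if follows and fanned:
        return 'friend'
    if follows:
        return 'follow'
    if fanned:
        return 'followed'
    return 'stranger'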
def atMeMicroBlog(self):
    # Weibo lazy-loads each feed page in three requests; the
    # (pre_page, page, pagebar) counters below drive that sequence.
    pre_page = 0
    page = 1
    pagebar = 0
    at_MBurl = 'http://weibo.com/aj/at/mblog/list?ajwvr=6&pre_page=%s&page=%s' \
               '&filter_by_author=0&filter_by_type=0&is_adv=0&pagebar=%s'
    json_list = []
    tags = False  # set once we reach posts older than self.lasttime
    while True:
        wbUrl = at_MBurl % (pre_page, page, pagebar)
        print "current url:", wbUrl
        while True:
            try:
                request = urllib2.Request(wbUrl, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                mb_content = json.loads(response.read())
                break
            except Exception, e:
                print "Network Exception!!!", e
                continue
        html = mb_content["data"]
        # An empty payload means there are no more pages.
        if html.replace('\n', '').replace(' ', '') == '' or tags:
            break
        elif pre_page < page:
            pre_page += 1
        elif pre_page == page and pagebar == 0:
            pagebar = 1
        elif pagebar == 1:
            pre_page = page
            page += 1
            pagebar = 0
        datas = getMatchList(
            html,
            '<div class="WB_face W_fl">(*)<div node-type="feed_list_repeat')
        for data in datas:
            photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
            uid = getMatch(data, 'usercard="id=(*)&')
            nickname = getMatch(data, 'nick-name="(*)"')
            mid = getMatch(data, 'pubuser_nick:(*)"')
            timestamp = getMatch(
                data, '<div class="WB_from S_txt2">.*?date="(*)"')
            if timestamp:
                timestamp = timestamp[0:-3]  # drop the milliseconds
            if timestamp and timestamp.isdigit():
                timestamp = long(timestamp)
            else:
                timestamp = 0
            if timestamp <= self.lasttime:
                # Everything from here on has been crawled already.
                tags = True
                break
            text = getMatch(data, 'feed_list_content" >(*)</div>')
            if text:
                text = extractForHTML(text.strip())
            else:
                text = ''
            retweet = getMatch(data, 'forward_btn_text">.*?<em>(*)</em>')
            retweet = retweet.replace('转发', '') if retweet else ''
            if retweet and retweet.isdigit():
                retweet = long(retweet)
            else:
                retweet = 0
            comment = getMatch(data, 'comment_btn_text">.*?<em>(*)</em>')
            comment = comment.replace('评论', '') if comment else ''
            if comment and comment.isdigit():
                comment = long(comment)
            else:
                comment = 0
            like = getMatch(data, 'UI_ani_praised".*?<em>(*)</em>')
            if like and like.isdigit():
                like = long(like)
            else:
                like = 0
            r_mid = getMatch(data, 'rootmid=(*)&')
            r_uid = self.uid
            # Classify the poster's relation to this account.
            _type = 'stranger'
            type1 = ''
            type2 = ''
            for fljson in self.follow:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type1 = 'follow'
                    break
            for fljson in self.fans:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type2 = 'followed'
                    break
            if type1 and type2:
                _type = 'friend'
            elif type1:
                _type = type1
            elif type2:
                _type = type2
            if uid == r_uid:
                _type = 'self'
            wb_item = {
                'photo_url': photo_url,
                'uid': uid,
                'nick_name': nickname,
                'mid': mid,
                'timestamp': timestamp,
                'text': text,
                'retweet': retweet,
                'comment': comment,
                'like': like,
                'root_mid': r_mid,
                'root_uid': r_uid,
                'weibo_type': _type,
                'update_time': self.update_time
            }
            wb_json = json.dumps(wb_item)
            json_list.append(wb_json)
    return json_list
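# The counter updates above implement Weibo's three-requests-per-page lazy
# loading. A minimal generator sketch of the same state machine (name
# hypothetical), yielding (pre_page, page, pagebar) in request order:
def _page_states_sketch():
    pre_page, page, pagebar = 0, 1, 0
    while True:
        yield pre_page, page, pagebar
        if pre_page < page:
            pre_page += 1
        elif pre_page == page and pagebar == 0:
            pagebar = 1
        else:  # pagebar == 1: advance to the next logical page
            pre_page, page, pagebar = page, page + 1, 0
# First states: (0,1,0) (1,1,0) (1,1,1) (1,2,0) (2,2,0) (2,2,1) (2,3,0) ...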
def messages(self):
    cr_url = 'http://weibo.com/messages?pids=Pl_Content_MessageList&page=1'
    de_url = 'http://weibo.com/aj/message/getbyid?ajwvr=6&count=50&uid=%s&_t=0&__rnd=%d'
    json_list = []
    tags = False  # set once we reach messages older than self.lasttime
    comment_url = cr_url
    while True:
        print comment_url
        # Fetch the inbox page, retrying on network errors.
        while True:
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
                break
            except Exception, e:
                print "Network Exception!!!", e
                continue
        # One entry per conversation in the inbox list.
        datas = getMatchList(
            html,
            '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->')
        for data in datas:
            photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
            uid = getMatch(data, 'usercard="id=(*)"')
            nickname = getMatch(data, '<img.*?alt="(*)"')
            r_uid = self.uid
            counts = getMatch(
                data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
            if counts and counts.isdigit():
                counts = long(counts)
            else:
                counts = 0
            # Classify the correspondent's relation to this account.
            _type = 'stranger'
            type1 = ''
            type2 = ''
            for fljson in self.follow:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type1 = 'follow'
                    break
            for fljson in self.fans:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type2 = 'followed'
                    break
            if type1 and type2:
                _type = 'friend'
            elif type1:
                _type = type1
            elif type2:
                _type = type2
            if uid == r_uid:
                _type = 'self'
            # Fetch the message thread for this conversation.
            while True:
                try:
                    detailUrl = de_url % (uid, int(time.time() * 1000))
                    request = urllib2.Request(detailUrl)
                    response = urllib2.urlopen(request, timeout=60)
                    ms_content = json.loads(response.read())
                    break
                except Exception, e:
                    print "Network Exception!!!", e
                    continue
            # Keep the inbox page in `html` intact for pagination below.
            detail_html = ms_content["data"]["html"]
            ms_datas = getMatchList(
                detail_html,
                u'(<!-- 单行文字-->|<div class="space">).*?<!--/附件信息-->')
            # A timestamp header applies to every message until the next one.
            last_time = 0
            for ms_data in ms_datas:
                mid_uid = getMatch(ms_data, 'usercard="id=(*)"')
                mid = getMatch(ms_data, 'mid="(*)"')
                timestamp = getMatch(
                    ms_data, 'prompt_font S_txt2 S_bg1">(*)</legend>')
                if timestamp:
                    timestamp = long(getTimeStamp(timestamp))
                    last_time = timestamp
                else:
                    timestamp = last_time
                if timestamp < self.lasttime:
                    tags = True
                    print 'timestamp < lasttime:', timestamp, self.lasttime
                text = getMatch(ms_data, u'<div class="cont">.*?<!--/附件信息-->')
                if text:
                    text = extractForHTML(text)
                    text = commentExtract(text)
                else:
                    text = ''
                if mid_uid == uid:
                    private_type = 'receive'
                elif mid_uid == r_uid:
                    private_type = 'make'
                else:
                    private_type = ''
                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'private_type': private_type,
                    'w_new_count': counts,
                    'update_time': self.update_time
                }
                wb_json = json.dumps(wb_item)
                json_list.append(wb_json)
        # Pagination, following the same pattern as commentInbox: stop on
        # old messages or when there is no next page.
        next_pageUrl = getUrlToPattern(html, comment_url,
                                       pattern='page', text_pattern='下一页')
        if tags or not next_pageUrl:
            break
        comment_url = next_pageUrl[0]
    return json_list
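# getTimeStamp (used above and in commentInbox) is defined elsewhere; from
# usage it converts Weibo's date strings into epoch seconds. A minimal
# sketch under the assumption of an absolute "YYYY-MM-DD HH:MM" string;
# the real helper presumably also handles relative forms via getDatetime:
def _getTimeStamp_sketch(date_string):
    parsed = datetime.datetime.strptime(date_string, '%Y-%m-%d %H:%M')
    return long(time.mktime(parsed.timetuple()))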
def follow(self):
    cr_url = 'http://weibo.com/p/100505%s/myfollow?t=1&pids=Pl_Official_RelationMyfollow__93' \
             '&cfs=&Pl_Official_RelationMyfollow__93_page=1#Pl_Official_RelationMyfollow__93'
    json_list = []
    comment_url = cr_url % self.uid
    list_data = []
    # First pass: collect the raw <li> blocks from every page.
    while True:
        print "comment_url:", comment_url
        while True:
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
                break
            except Exception, e:
                print "Network Exception!!!", e
                continue
        datas = getMatchList(html, '<li class="member_li S_bg1".*?</li>')
        list_data.append(datas)
        # Pagination
        next_pageUrl = getUrlToPattern(html, comment_url,
                                       pattern='page', text_pattern='下一页')
        print "next_pageUrl:", next_pageUrl
        if next_pageUrl:
            comment_url = next_pageUrl[0]
        else:
            break
    # Second pass: walk pages and entries in reverse so the earliest
    # follow is emitted first.
    r_list_data = reversed(list_data)
    for l_datas in r_list_data:
        r_datas = reversed(l_datas)
        for data in r_datas:
            photo_url = getMatch(data, 'profile_image_url=(*)&')
            uid = getMatch(data, 'usercard="id=(*)"')
            nickname = getMatch(data, '<img.*?alt="(*)"')
            timestamp = int(round(time.time()))
            # Sleep so each record gets a distinct, increasing timestamp.
            time.sleep(1)
            sex = getMatch(data, '&sex=(*)"')
            if not sex:
                sex = ''
            elif sex == 'f':
                sex = 'female'
            elif sex == 'm':
                sex = 'male'
            follow_source = getMatch(data, 'class="S_link2" >(*)</a>')
            if not follow_source:
                follow_source = ''
            description = getMatch(data, 'W_autocut S_txt2">(*)</div>')
            if not description:
                description = ''
            gid = getMatch(data, '&gid=(*)&')
            if not gid:
                gid = '0'
            gname = getMatch(data, '&gname=(*)&')
            if not gname:
                gname = ''
            r_uid = self.uid
            _type = 'follow'
            # (disabled) fetch the followee's full profile:
            # user = SinaOperateAPI().getUserShow(uid=uid)
            wb_item = {
                'photo_url': photo_url,
                'uid': uid,
                'mid': uid,
                'nick_name': nickname,
                'timestamp': timestamp,
                'sex': sex,
                'description': description,
                'follow_source': follow_source,
                'gid': gid,
                'gname': gname,
                'root_uid': r_uid,
                'weibo_type': _type,
                'update_time': self.update_time
            }
            if wb_item['mid'] is None:
                wb_item['mid'] = ''
            wb_json = json.dumps(wb_item)
            json_list.append(wb_json)
    return json_list
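# Every method above repeats the same fetch-with-retry loop. A minimal
# refactoring sketch they could share (name hypothetical; reuses the
# module's urllib2/time imports):
def _fetch_sketch(url, headers, retry_delay=5):
    # Retry indefinitely on network errors, like the inline loops above.
    while True:
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=60)
            return response.read().decode('string_escape').replace('\\/', '/')
        except Exception, e:
            print "Network Exception!!!", e
            time.sleep(retry_delay)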