def __get_one_block_content(self, url, uid, lastestmid):
    """Scrape one block of a feed page and hand each post to the handler.

    The page is fetched with ``self.get_target(url)`` and searched for feed
    ``div`` elements. One record dict is built per post and passed to
    ``self.handler`` (when it is set) as a single-element list.

    NOTE(review): the page object looks like a BeautifulSoup-style tree
    (``find`` / ``find_all``), and ``str(...).decode('utf-8')`` relies on
    Python 2 byte strings -- confirm before porting.

    Args:
        url: address passed unchanged to ``self.get_target``.
        uid: user id stored under ``'uid'`` in every record.
        lastestmid: reaching a post with ``lastestmid >= mid`` ends the scan.

    Returns:
        -1 when ``self.get_target`` returns -1; 0 when no feed divs are
        found, when a post with ``lastestmid >= mid`` is reached, or when
        the page holds fewer than ``BLOG_NUM_PER_BLOCK`` feed divs;
        otherwise the ``mid`` of the last div visited.
    """
    page = self.get_target(url)
    if page == -1:
        return -1

    feed_divs = page.find_all('div', attrs={'class': 'WB_feed_type SW_fun '})
    if not feed_divs:
        print("no more miniblog")
        return 0

    for feed in feed_divs:
        mid = feed['mid']
        max_id = mid
        if not mid or mid == 0:
            continue
        if lastestmid >= mid:
            return 0  # no new miniblog

        # While end_id is still 0, the first mid that gets this far is
        # stored on the instance and reported in every record.
        if self.end_id == 0:
            self.end_id = mid
        latest_mid = self.end_id

        text_div = feed.find(
            'div',
            attrs={'class': 'WB_text', 'node-type': 'feed_list_content'})
        pictures = feed.find_all("img", attrs={"class": "bigcursor"})
        text = htmlstripper.strip_tags(str(text_div)).decode('utf-8')

        # media_info is a flat list: action-data, then that item's image
        # sources, repeated for every media item.
        media_info = []
        media_items = feed.find_all(
            "li", attrs={'action-type': 'feed_list_media_vedio'})
        for media in media_items:
            action_data = media['action-data']
            thumbs = [thumb['src'] for thumb in media.find_all('img')]
            media_info += [action_data, thumbs]

        picture_urls = [picture['src'] for picture in pictures]

        stamp_link = feed.find(
            "a",
            attrs={"class": "S_link2 WB_time",
                   "node-type": "feed_list_item_date"})
        record = {
            'uid': uid,
            'latest_mid': latest_mid,
            'mid': mid,
            'miniblog': text,
            'imgs': picture_urls,
            'timestamp': stamp_link['title'],
            'media_info': media_info,
        }
        if self.handler != None:
            self.handler([record])

    # A short block means there is nothing further to request.
    return 0 if len(feed_divs) < BLOG_NUM_PER_BLOCK else max_id
def get_uids(self, ps):
    """Extract every user entry found in ``ps`` and pass it to the handler.

    Each ``div`` with class ``con_left`` is treated as one user. For every
    user a record dict is built with the keys ``uid``, ``nick``, ``href``,
    ``follower_num``, ``fans_num``, ``miniblog_num``, ``address``,
    ``usertype`` and ``info``; it is printed for debugging and then passed
    to ``self.handler`` (when set) as a single-element list.
    ``self.index`` is incremented once per user.

    NOTE(review): ``ps`` looks like a BeautifulSoup-style tree
    (``find`` / ``find_all`` / ``.string``), and
    ``str(...).decode('utf-8')`` relies on Python 2 byte strings -- confirm.
    NOTE(review): the membership icon (class ``W_ico16 member``) is not
    captured in the record.

    Args:
        ps: parsed page to search.

    Returns:
        None. Results are delivered only through ``self.handler``.
    """
    for user in ps.find_all("div", attrs={"class": "con_left"}):
        a1 = user.find("a", attrs={"class": "W_f14 S_func1"})
        # The id is the usercard attribute minus its first three characters.
        userid = a1['usercard'][3:]
        userhref = a1['href']
        usernick = htmlstripper.strip_tags(str(a1)).decode('utf-8')

        # Verification icon: both exact class strings are tried in turn
        # ('approve', then 'approve_co'); a regex class match was noted as
        # not working here.
        a2 = user.find("i", attrs={"class": "W_ico16 approve"})
        if a2 is None:
            a2 = user.find("i", attrs={"class": "W_ico16 approve_co"})
        usertype = a2['title'] if a2 is not None else ""

        span1 = user.find("span", attrs={"class": "addr"})
        useraddr = htmlstripper.strip_tags(str(span1)).decode('utf-8')

        # The three counters are the texts of links inside the 'connect'
        # div, identified by their href.
        fl_href = "/" + userid + "/follow"
        fs_href = "/" + userid + "/fans"
        connect1 = user.find("div", attrs={"class": "connect"})
        fl_num = connect1.find("a", attrs={"href": fl_href}).string
        fs_num = connect1.find("a", attrs={"href": fs_href}).string
        wb_num = connect1.find("a", attrs={"href": userhref}).string

        # .string is None when the div has no single text child; fall back
        # to "" so the string concatenation below cannot raise TypeError.
        info = user.find("div", attrs={"class": "info"})
        infotxt = ""
        if info is not None and info.string is not None:
            infotxt = info.string

        print("need photo")
        print("id: " + userid + ", nick: " + usernick + ", href: " + userhref)
        print("follower num: " + fl_num + ", fans num: " + fs_num
              + ", weibo num: " + wb_num)
        print("user addr: " + useraddr + " usertype: " + usertype)
        print("info: " + infotxt)

        retv = {}
        retv['uid'] = userid
        retv['nick'] = usernick
        retv['href'] = userhref
        retv['follower_num'] = fl_num
        retv['fans_num'] = fs_num
        retv['miniblog_num'] = wb_num
        retv['address'] = useraddr
        retv['usertype'] = usertype
        retv['info'] = infotxt
        if self.handler is not None:
            self.handler([retv])

        self.index = self.index + 1
        print("----------------------------------" + str(self.index))
def get_uids(self, ps):
    """Collect the user entries listed in ``ps`` and report each one.

    Every ``div`` of class ``con_left`` yields one record dict (keys:
    ``uid``, ``nick``, ``href``, ``follower_num``, ``fans_num``,
    ``miniblog_num``, ``address``, ``usertype``, ``info``). The record is
    echoed with debug prints, passed to ``self.handler`` as a one-element
    list when a handler is set, and ``self.index`` is advanced by one.

    NOTE(review): ``ps`` is presumably a BeautifulSoup-style tree, and the
    ``str(...).decode('utf-8')`` calls assume Python 2 byte strings --
    confirm.
    NOTE(review): the ``W_ico16 member`` icon is not part of the record.

    Args:
        ps: parsed page to search.

    Returns:
        None. Records leave this method only via ``self.handler``.
    """
    usrlis = ps.find_all("div", attrs={"class": "con_left"})
    for user in usrlis:
        name_link = user.find("a", attrs={"class": "W_f14 S_func1"})
        # Strip the three-character prefix of the usercard attribute.
        userid = name_link['usercard'][3:]
        userhref = name_link['href']
        usernick = htmlstripper.strip_tags(str(name_link)).decode('utf-8')

        # Try the 'approve' icon first, then 'approve_co'. Exact class
        # strings are used because a regex match was reported not to work.
        usertype = ""
        badge = user.find("i", attrs={"class": "W_ico16 approve"})
        if badge is None:
            badge = user.find("i", attrs={"class": "W_ico16 approve_co"})
        if badge is not None:
            usertype = badge['title']

        addr_span = user.find("span", attrs={"class": "addr"})
        useraddr = htmlstripper.strip_tags(str(addr_span)).decode('utf-8')

        # Counter links live in the 'connect' div and are found by href.
        connect = user.find("div", attrs={"class": "connect"})
        follow_link = connect.find(
            "a", attrs={"href": "/" + userid + "/follow"})
        fl_num = follow_link.string
        fans_link = connect.find("a", attrs={"href": "/" + userid + "/fans"})
        fs_num = fans_link.string
        weibo_link = connect.find("a", attrs={"href": userhref})
        wb_num = weibo_link.string

        # Guard against .string being None (div with nested or no content);
        # without it the "info: " concatenation below raises TypeError.
        info = user.find("div", attrs={"class": "info"})
        infotxt = info.string if info is not None else ""
        if infotxt is None:
            infotxt = ""

        print("need photo")
        print("id: " + userid + ", nick: " + usernick + ", href: " + userhref)
        print("follower num: " + fl_num + ", fans num: " + fs_num +
              ", weibo num: " + wb_num)
        print("user addr: " + useraddr + " usertype: " + usertype)
        print("info: " + infotxt)

        retv = {
            'uid': userid,
            'nick': usernick,
            'href': userhref,
            'follower_num': fl_num,
            'fans_num': fs_num,
            'miniblog_num': wb_num,
            'address': useraddr,
            'usertype': usertype,
            'info': infotxt,
        }
        if self.handler is not None:
            self.handler([retv])

        self.index = self.index + 1
        print("----------------------------------" + str(self.index))
def __get_one_block_content(self, url, uid, lastestmid):
    """Parse one block of a feed page and report every post in it.

    ``self.get_target(url)`` supplies the parsed page. For each feed ``div``
    a record dict with the keys ``uid``, ``latest_mid``, ``mid``,
    ``miniblog``, ``imgs``, ``timestamp`` and ``media_info`` is built and,
    when ``self.handler`` is set, passed to it as a single-element list.

    NOTE(review): the page object looks like a BeautifulSoup-style tree,
    and ``str(...).decode('utf-8')`` assumes Python 2 byte strings --
    confirm.

    Args:
        url: address passed unchanged to ``self.get_target``.
        uid: user id copied into every record.
        lastestmid: reaching a post with ``lastestmid >= mid`` ends the scan.

    Returns:
        -1 when ``self.get_target`` returns -1; 0 when no feed divs are
        found, when a post with ``lastestmid >= mid`` is reached, or when
        fewer than ``BLOG_NUM_PER_BLOCK`` feed divs are present; otherwise
        the ``mid`` of the last div visited.
    """
    ps = self.get_target(url)
    if ps == -1:
        return -1
    divs = ps.find_all('div', attrs={'class': 'WB_feed_type SW_fun '})
    if not divs:
        print("no more miniblog")
        return 0
    for div in divs:
        retv_i = {}
        mid = div['mid']
        max_id = mid
        if not mid or mid == 0:
            continue
        if lastestmid >= mid:
            return 0  # no new miniblog
        retv_i['uid'] = uid
        # While end_id is still 0, the first mid that gets this far is
        # stored on the instance and reported in every record.
        if self.end_id == 0:
            self.end_id = mid
        retv_i['latest_mid'] = self.end_id
        content_div = div.find('div',
                               attrs={
                                   'class': 'WB_text',
                                   'node-type': 'feed_list_content'
                               })
        imgs = div.find_all("img", attrs={"class": "bigcursor"})
        mbcontent = htmlstripper.strip_tags(str(content_div)).decode('utf-8')
        mds = div.find_all("li",
                           attrs={'action-type': 'feed_list_media_vedio'})
        # One list, one name: it is filled here and stored under
        # 'media_info' below. Layout is flat -- action-data followed by that
        # item's image sources, repeated per media item.
        mdcontent = []
        for md in mds:
            data = md['action-data']
            md_imgsrc = [mdimg['src'] for mdimg in md.find_all('img')]
            mdcontent.append(data)
            mdcontent.append(md_imgsrc)
        imgssrc = [img['src'] for img in imgs]
        time_a = div.find("a",
                          attrs={
                              "class": "S_link2 WB_time",
                              "node-type": "feed_list_item_date"
                          })
        time_str = time_a['title']
        retv_i['mid'] = mid
        retv_i['miniblog'] = mbcontent
        retv_i['imgs'] = imgssrc
        retv_i['timestamp'] = time_str
        retv_i['media_info'] = mdcontent
        if self.handler is not None:
            self.handler([retv_i])
    # A short block is reported as 0 instead of the last mid.
    if len(divs) < BLOG_NUM_PER_BLOCK:
        max_id = 0
    return max_id