Python strip_tags Examples, factory.supports.htmlstripper.strip_tags Python Examples

Example #1

0

Show file

File: weibominiblogs.py Project: jundong/theysay

	def __get_one_block_content(self,url,uid,lastestmid):
		"""Parse one block of a feed page and pass every post to self.handler.

		The page is fetched through self.get_target(url). Each
		'WB_feed_type SW_fun ' div is turned into a dict with the keys
		'uid', 'mid', 'miniblog', 'imgs', 'timestamp' and 'media_info'.
		The first post handled while self.end_id is 0 also sets self.end_id
		and carries a 'latest_mid' key. Each dict is delivered on its own as
		self.handler([record]) when a handler is set.

		Args:
			url: address handed to self.get_target.
			uid: value stored under 'uid' in every record.
			lastestmid: compared with ``>=`` against each post's 'mid'
				attribute; the scan stops at the first post that is not newer.
				NOTE(review): 'mid' comes straight from the HTML attribute --
				confirm the caller passes a value of a comparable type.

		Returns:
			-1 if self.get_target reported failure (returned -1);
			0 if the page has no posts, if a post not newer than
			``lastestmid`` was reached, or if the block holds fewer than
			BLOG_NUM_PER_BLOCK posts;
			otherwise the 'mid' of the last post div visited.
		"""
		ps = self.get_target(url)
		if ps == -1:
			return -1
		divs = ps.find_all('div', attrs={'class': 'WB_feed_type SW_fun '})
		if not divs:
			print("no more miniblog")
			return 0

		for div in divs:
			mid = div['mid']
			# Recorded before validation on purpose: the return value is the
			# mid of the last div visited, even when that div is skipped.
			max_id = mid
			if not mid:
				continue
			if lastestmid >= mid:
				return 0 # no new miniblog

			retv_i = {'uid': uid}
			if self.end_id == 0:
				self.end_id = mid
				retv_i['latest_mid'] = self.end_id

			content_div = div.find('div', attrs={'class': 'WB_text', 'node-type': 'feed_list_content'})
			if content_div is None:
				# str(None) would hand the literal text "None" to the stripper
				# and store it as the post body; use an empty body instead.
				mbcontent = u''
			else:
				mbcontent = htmlstripper.strip_tags(str(content_div)).decode('utf-8')

			# Flat layout: each media item contributes its 'action-data' value
			# followed by the list of its image URLs.
			mdcontent = []
			for md in div.find_all("li",attrs={'action-type':'feed_list_media_vedio'}):
				mdcontent.append(md['action-data'])
				mdcontent.append([mdimg['src'] for mdimg in md.find_all('img')])

			imgssrc = [img['src'] for img in div.find_all("img",attrs={"class":"bigcursor"})]

			time_a = div.find("a",attrs={"class":"S_link2 WB_time","node-type":"feed_list_item_date"})
			retv_i['mid'] = mid
			retv_i['miniblog'] = mbcontent
			retv_i['imgs'] = imgssrc
			retv_i['timestamp'] = time_a['title']
			retv_i['media_info'] = mdcontent
			if self.handler is not None:
				self.handler([retv_i])

		# A short block signals the end of the feed to the caller.
		if len(divs) < BLOG_NUM_PER_BLOCK:
			max_id = 0
		return max_id

Example #2

0

Show file

File: weibofollowers.py Project: chu888chu888/crawler-theysay

	def get_uids(self,ps):
		"""Extract the user entries from a parsed follower page.

		Every ``div.con_left`` found in ``ps`` is treated as one user entry.
		For each entry a dict with the keys 'uid', 'nick', 'href',
		'follower_num', 'fans_num', 'miniblog_num', 'address', 'usertype'
		and 'info' is built, printed for debugging and delivered as
		self.handler([record]) when a handler is set. self.index is
		incremented once per entry.

		Args:
			ps: parsed page exposing find_all() (presumably a BeautifulSoup
				document -- confirm against the caller).

		Returns:
			None; records are only delivered through self.handler.
		"""
		for user in ps.find_all("div",attrs={"class":"con_left"}):
			a1 = user.find("a",attrs={"class":"W_f14 S_func1"})
			# The first three characters of the 'usercard' value are dropped
			# to obtain the uid.
			userid = a1['usercard'][3:]
			userhref = a1['href']
			usernick = htmlstripper.strip_tags(str(a1)).decode('utf-8')

			# The badge is looked up under two exact class strings in turn.
			a2 = user.find("i",attrs={"class":"W_ico16 approve"})
			if not a2:
				a2 = user.find("i",attrs={"class":"W_ico16 approve_co"})
			usertype = a2['title'] if a2 else ""

			span1 = user.find("span",attrs={"class":"addr"})
			if span1 is None:
				# str(None) would store the literal text "None" as the address.
				useraddr = u""
			else:
				useraddr = htmlstripper.strip_tags(str(span1)).decode('utf-8')

			# The three counters are the links inside div.connect that point
			# to the follow page, the fans page and the user's own page.
			connect1 = user.find("div",attrs={"class":"connect"})
			fl_num = connect1.find("a",attrs={"href":"/"+userid+"/follow"}).string
			fs_num = connect1.find("a",attrs={"href":"/"+userid+"/fans"}).string
			wb_num = connect1.find("a",attrs={"href":userhref}).string

			info = user.find("div",attrs={"class":"info"})
			infotxt = ""
			if info:
				# .string is None when the div has no single text child, which
				# would break the string concatenation below.
				infotxt = info.string or ""

			print("need photo")

			print("id: "+userid + ", nick: "+usernick+", href: "+userhref)
			print("follower num: "+fl_num + ", fans num: "+fs_num+", weibo num: "+wb_num)
			print("user addr: "+useraddr+" usertype: "+usertype)
			print("info: "+infotxt)

			retv = {
				'uid': userid,
				'nick': usernick,
				'href': userhref,
				'follower_num': fl_num,
				'fans_num': fs_num,
				'miniblog_num': wb_num,
				'address': useraddr,
				'usertype': usertype,
				'info': infotxt,
			}
			if self.handler is not None:
				self.handler([retv])
			self.index = self.index+1
			print("----------------------------------"+str(self.index))

Example #3

0

Show file

    def get_uids(self, ps):
        """Extract the user entries from a parsed follower page.

        Every ``div.con_left`` found in ``ps`` is treated as one user entry.
        For each entry a dict with the keys 'uid', 'nick', 'href',
        'follower_num', 'fans_num', 'miniblog_num', 'address', 'usertype'
        and 'info' is built, printed for debugging and delivered as
        self.handler([record]) when a handler is set. self.index is
        incremented once per entry.

        Args:
            ps: parsed page exposing find_all() (presumably a BeautifulSoup
                document -- confirm against the caller).

        Returns:
            None; records are only delivered through self.handler.
        """
        for user in ps.find_all("div", attrs={"class": "con_left"}):
            a1 = user.find("a", attrs={"class": "W_f14 S_func1"})
            # The first three characters of the 'usercard' value are dropped
            # to obtain the uid.
            userid = a1['usercard'][3:]
            userhref = a1['href']
            usernick = htmlstripper.strip_tags(str(a1)).decode('utf-8')

            # The badge is looked up under two exact class strings in turn.
            a2 = user.find("i", attrs={"class": "W_ico16 approve"})
            if not a2:
                a2 = user.find("i", attrs={"class": "W_ico16 approve_co"})
            usertype = a2['title'] if a2 else ""

            span1 = user.find("span", attrs={"class": "addr"})
            if span1 is None:
                # str(None) would store the literal text "None" as the address.
                useraddr = u""
            else:
                useraddr = htmlstripper.strip_tags(str(span1)).decode('utf-8')

            # The three counters are the links inside div.connect that point
            # to the follow page, the fans page and the user's own page.
            connect1 = user.find("div", attrs={"class": "connect"})
            fl_num = connect1.find(
                "a", attrs={"href": "/" + userid + "/follow"}).string
            fs_num = connect1.find(
                "a", attrs={"href": "/" + userid + "/fans"}).string
            wb_num = connect1.find("a", attrs={"href": userhref}).string

            info = user.find("div", attrs={"class": "info"})
            infotxt = ""
            if info:
                # .string is None when the div has no single text child, which
                # would break the string concatenation below.
                infotxt = info.string or ""

            print("need photo")

            print("id: " + userid + ", nick: " + usernick + ", href: " + userhref)
            print("follower num: " + fl_num + ", fans num: " + fs_num + ", weibo num: " + wb_num)
            print("user addr: " + useraddr + " usertype: " + usertype)
            print("info: " + infotxt)

            retv = {
                'uid': userid,
                'nick': usernick,
                'href': userhref,
                'follower_num': fl_num,
                'fans_num': fs_num,
                'miniblog_num': wb_num,
                'address': useraddr,
                'usertype': usertype,
                'info': infotxt,
            }
            if self.handler is not None:
                self.handler([retv])
            self.index = self.index + 1
            print("----------------------------------" + str(self.index))

Example #4

0

Show file

File: weibominiblogs.py Project: jwang-share/theysay

    def __get_one_block_content(self, url, uid, lastestmid):
        """Parse one block of a feed page and pass every post to self.handler.

        The page is fetched through self.get_target(url). Each
        'WB_feed_type SW_fun ' div is turned into a dict with the keys
        'uid', 'mid', 'miniblog', 'imgs', 'timestamp' and 'media_info'.
        The first post handled while self.end_id is 0 also sets self.end_id
        and carries a 'latest_mid' key. Each dict is delivered on its own as
        self.handler([record]) when a handler is set.

        Args:
            url: address handed to self.get_target.
            uid: value stored under 'uid' in every record.
            lastestmid: compared with ``>=`` against each post's 'mid'
                attribute; the scan stops at the first post that is not newer.
                NOTE(review): 'mid' comes straight from the HTML attribute --
                confirm the caller passes a value of a comparable type.

        Returns:
            -1 if self.get_target reported failure (returned -1);
            0 if the page has no posts, if a post not newer than
            ``lastestmid`` was reached, or if the block holds fewer than
            BLOG_NUM_PER_BLOCK posts;
            otherwise the 'mid' of the last post div visited.
        """
        ps = self.get_target(url)
        if ps == -1:
            return -1
        divs = ps.find_all('div', attrs={'class': 'WB_feed_type SW_fun '})
        if not divs:
            print("no more miniblog")
            return 0

        for div in divs:
            mid = div['mid']
            # Recorded before validation on purpose: the return value is the
            # mid of the last div visited, even when that div is skipped.
            max_id = mid
            if not mid:
                continue
            if lastestmid >= mid:
                return 0  # no new miniblog

            retv_i = {'uid': uid}
            if self.end_id == 0:
                self.end_id = mid
                retv_i['latest_mid'] = self.end_id

            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            if content_div is None:
                # str(None) would hand the literal text "None" to the stripper
                # and store it as the post body; use an empty body instead.
                mbcontent = u''
            else:
                mbcontent = htmlstripper.strip_tags(
                    str(content_div)).decode('utf-8')

            # Flat layout: each media item contributes its 'action-data' value
            # followed by the list of its image URLs. The list must carry one
            # name throughout; a misspelt binding raises NameError on the
            # first media item.
            mdcontent = []
            for md in div.find_all(
                    "li", attrs={'action-type': 'feed_list_media_vedio'}):
                mdcontent.append(md['action-data'])
                mdcontent.append([mdimg['src'] for mdimg in md.find_all('img')])

            imgssrc = [
                img['src']
                for img in div.find_all("img", attrs={"class": "bigcursor"})
            ]

            time_a = div.find("a",
                              attrs={
                                  "class": "S_link2 WB_time",
                                  "node-type": "feed_list_item_date"
                              })
            retv_i['mid'] = mid
            retv_i['miniblog'] = mbcontent
            retv_i['imgs'] = imgssrc
            retv_i['timestamp'] = time_a['title']
            retv_i['media_info'] = mdcontent
            if self.handler is not None:
                self.handler([retv_i])

        # A short block signals the end of the feed to the caller.
        if len(divs) < BLOG_NUM_PER_BLOCK:
            max_id = 0
        return max_id