Ejemplo n.º 1
0
 def __init__(self):
     """Create the permit helper and start with empty parsing state."""
     # Numeric tuning values ("thresold" spelling is the attribute's name).
     self.thresold = 86
     self.blocksize = 3
     # Nothing has been parsed yet.
     self.text = []
     self.blocksets = []
     self.content = None
     self.permit = WebPermit()
	def __init__(self):
		"""Create the permit helper and start with empty parsing state."""
		# Numeric tuning values ("thresold" spelling is the attribute's name).
		self.thresold = 86
		self.blocksize = 3
		# Nothing has been parsed yet.
		self.text = []
		self.blocksets = []
		self.content = None
		self.permit = WebPermit()
Ejemplo n.º 3
0
class WebPageParser(object):
    """Pick the main text region of a page from per-line text density.

    ``get_text`` records each line with its non-whitespace length,
    ``get_blocksets`` turns those lengths into sliding-window sums and then
    runs ``get_content_coord``, which stores the selected text in
    ``self.content`` (it stays ``None`` when no region is accepted).
    """

    def __init__(self):
        self.permit = WebPermit()
        # Extracted text; None until get_content_coord accepts a region.
        self.content = None
        # One (raw_line, non_whitespace_length) tuple per input line.
        self.text = []
        # Sliding-window sums over the lengths stored in self.text.
        self.blocksets = []
        # Minimum window sum that may open a content region, and the
        # number of lines summed per window.
        self.thresold, self.blocksize = 86, 3

    def get_text(self, htmldoc):
        """Record every line of *htmldoc* with its non-whitespace length.

        The document is split on newlines and each line is appended to
        ``self.text`` as ``(line, length)``.  Lines whose index is greater
        than a third of the line count are additionally handed, with all
        whitespace removed, to ``self.permit.extract_permit``.
        """
        puretext = htmldoc.split('\n')
        textsize = len(puretext)
        for nowline, text in enumerate(puretext):
            # Raw string: '\s' in a plain literal is an invalid escape.
            realtext = re.sub(r'\s', '', text)
            self.text.append((text, len(realtext)))
            if nowline > textsize // 3:
                # Python 2 byte strings are decoded as before; text strings
                # (str on Python 3) have no decode() and are passed through.
                if isinstance(realtext, bytes):
                    realtext = realtext.decode('utf-8')
                self.permit.extract_permit(realtext)

    def get_blocksets(self):
        """Append the window sums to ``self.blocksets`` and locate content.

        The first ``self.blocksize`` entries are zero; entry ``i`` after
        that is the summed length of lines ``i - blocksize + 1 .. i``.
        Finishes by calling ``get_content_coord``.
        """
        size = self.blocksize
        self.blocksets.extend([0] * size)
        for i in range(size, len(self.text)):
            window = self.text[i - size + 1:i + 1]
            self.blocksets.append(sum(entry[1] for entry in window))
        self.get_content_coord()

    def get_content(self, start, end):
        """Replace a truthy ``self.content`` with lines ``start..end``.

        The raw lines are concatenated without separators; nothing happens
        when ``self.content`` is falsy.
        """
        if self.content:
            lines = self.text[start:end + 1]
            self.content = ''.join(entry[0] for entry in lines)

    def get_content_coord(self):
        """Choose the content region from ``self.blocksets`` and extract it.

        Returns without touching ``self.content`` when the profile is
        empty, when the largest window sits past three quarters of the
        profile, or when a peak below 500 has at least five local maxima
        within 40% of it.  Otherwise the region starts at the first window
        above ``self.thresold`` (up to the peak) and ends at the first run
        of three zero windows; if it passes the final checks the text is
        stored through ``get_content``.
        """
        blocks = self.blocksets
        if not blocks:
            # max() would raise ValueError on an empty profile.
            return
        setsize = len(blocks)
        maxvalue = max(blocks)
        maxpos = blocks.index(maxvalue)
        if maxpos > setsize * 3 // 4:
            return
        if maxvalue < 500:
            # Local maxima: strictly above both neighbours.
            peaks = [blocks[i] for i in range(1, setsize - 1)
                     if blocks[i - 1] < blocks[i] > blocks[i + 1]]
            rivals = sum(1 for peak in peaks
                         if 0 <= maxvalue - peak < maxvalue * 0.4)
            if rivals >= 5:
                return
        start = end = -1
        for x, density in enumerate(blocks[:maxpos + 1]):
            if density <= self.thresold:
                continue
            if setsize <= x + self.blocksize:
                return
            # Equivalent to multiplying the window and testing for zero.
            window_solid = all(blocks[x:x + self.blocksize + 1])
            near_peak = x <= maxpos <= x + self.blocksize
            if not near_peak and not window_solid:
                continue
            start = x
            break
        if start != -1:
            # NOTE(review): the original guard reused the index left over
            # from the scan above, which always equals maxpos here; it is
            # spelled out with unchanged behaviour.  Confirm whether
            # `start + 2 >= setsize` was the intended test.
            if maxpos + start + 2 >= setsize:
                return
            for pos in range(start, setsize - 2):
                if not any(blocks[pos:pos + 3]):
                    end = pos + 1
                    break
        # Presumably steps back to lines that fed the first dense window.
        start -= self.blocksize // 2
        if start != -1 and end != -1 and maxvalue > 200:
            if start < maxpos <= end:
                # NOTE: `/` floors ints on Python 2, true division on 3.
                if maxvalue / (end - start) > 6:
                    self.content = 'ok'
            else:
                self.content = 'ok'
        if self.content:
            self.get_content(start, end)
class WebPageParser(object):
	"""Pick the main text region of a page from per-line text density.

	``get_text`` records each line with its non-whitespace length,
	``get_blocksets`` turns those lengths into sliding-window sums and then
	runs ``get_content_coord``, which stores the selected text in
	``self.content`` (it stays ``None`` when no region is accepted).
	"""

	def __init__(self):
		self.permit = WebPermit()
		# Extracted text; None until get_content_coord accepts a region.
		self.content = None
		# One (raw_line, non_whitespace_length) tuple per input line.
		self.text = []
		# Sliding-window sums over the lengths stored in self.text.
		self.blocksets = []
		# Minimum window sum that may open a content region, and the
		# number of lines summed per window.
		self.thresold, self.blocksize = 86, 3

	def get_text(self, htmldoc):
		"""Record every line of *htmldoc* with its non-whitespace length.

		The document is split on newlines and each line is appended to
		``self.text`` as ``(line, length)``.  Lines whose index is greater
		than a third of the line count are additionally handed, with all
		whitespace removed, to ``self.permit.extract_permit``.
		"""
		puretext = htmldoc.split('\n')
		textsize = len(puretext)
		for nowline, text in enumerate(puretext):
			# Raw string: '\s' in a plain literal is an invalid escape.
			realtext = re.sub(r'\s', '', text)
			self.text.append((text, len(realtext)))
			if nowline > textsize // 3:
				# Python 2 byte strings are decoded as before; text strings
				# (str on Python 3) have no decode() and are passed through.
				if isinstance(realtext, bytes):
					realtext = realtext.decode('utf-8')
				self.permit.extract_permit(realtext)

	def get_blocksets(self):
		"""Append the window sums to ``self.blocksets`` and locate content.

		The first ``self.blocksize`` entries are zero; entry ``i`` after
		that is the summed length of lines ``i - blocksize + 1 .. i``.
		Finishes by calling ``get_content_coord``.
		"""
		size = self.blocksize
		self.blocksets.extend([0] * size)
		for i in range(size, len(self.text)):
			window = self.text[i - size + 1:i + 1]
			self.blocksets.append(sum(entry[1] for entry in window))
		self.get_content_coord()

	def get_content(self, start, end):
		"""Replace a truthy ``self.content`` with lines ``start..end``.

		The raw lines are concatenated without separators; nothing happens
		when ``self.content`` is falsy.
		"""
		if self.content:
			lines = self.text[start:end + 1]
			self.content = ''.join(entry[0] for entry in lines)

	def get_content_coord(self):
		"""Choose the content region from ``self.blocksets`` and extract it.

		Returns without touching ``self.content`` when the profile is
		empty, when the largest window sits past three quarters of the
		profile, or when a peak below 500 has at least five local maxima
		within 40% of it.  Otherwise the region starts at the first window
		above ``self.thresold`` (up to the peak) and ends at the first run
		of three zero windows; if it passes the final checks the text is
		stored through ``get_content``.
		"""
		blocks = self.blocksets
		if not blocks:
			# max() would raise ValueError on an empty profile.
			return
		setsize = len(blocks)
		maxvalue = max(blocks)
		maxpos = blocks.index(maxvalue)
		if maxpos > setsize * 3 // 4:
			return
		if maxvalue < 500:
			# Local maxima: strictly above both neighbours.
			inner = range(1, setsize - 1)
			peaks = [blocks[i] for i in inner if blocks[i - 1] < blocks[i] > blocks[i + 1]]
			rivals = sum(1 for peak in peaks if 0 <= maxvalue - peak < maxvalue * 0.4)
			if rivals >= 5:
				return
		start = end = -1
		for x, density in enumerate(blocks[:maxpos + 1]):
			if density <= self.thresold:
				continue
			if setsize <= x + self.blocksize:
				return
			# Equivalent to multiplying the window and testing for zero.
			window_solid = all(blocks[x:x + self.blocksize + 1])
			near_peak = x <= maxpos <= x + self.blocksize
			if not near_peak and not window_solid:
				continue
			start = x
			break
		if start != -1:
			# NOTE(review): the original guard reused the index left over
			# from the scan above, which always equals maxpos here; it is
			# spelled out with unchanged behaviour.  Confirm whether
			# `start + 2 >= setsize` was the intended test.
			if maxpos + start + 2 >= setsize:
				return
			for pos in range(start, setsize - 2):
				if not any(blocks[pos:pos + 3]):
					end = pos + 1
					break
		# Presumably steps back to lines that fed the first dense window.
		start -= self.blocksize // 2
		if start != -1 and end != -1 and maxvalue > 200:
			if start < maxpos <= end:
				# NOTE: `/` floors ints on Python 2, true division on 3.
				if maxvalue / (end - start) > 6:
					self.content = 'ok'
			else:
				self.content = 'ok'
		if self.content:
			self.get_content(start, end)