def filter_images(self, imgs):
    """Select the images whose fetched payload is large enough to keep.

    Every element of ``imgs`` that has a ``src`` attribute is resolved to
    an absolute URL (relative values go through ``relative2absolute``
    with ``self.url``), fetched with ``urlopen``, and kept when the
    number of bytes read exceeds ``MIN_IMG_SIZE``.

    Args:
        imgs: iterable of image elements supporting ``has_key('src')``
            and ``['src']`` (presumably BeautifulSoup ``img`` tags --
            confirm against the caller).

    Returns:
        A ``(srcs, images)`` tuple of two parallel lists: the absolute
        URLs of the accepted images and the corresponding elements.
        The elements themselves are not modified.
    """
    srcs = []
    images = []
    for img in imgs:
        if not img.has_key('src'):
            continue

        src = img['src']
        # Both http and https URLs are already absolute; anything else is
        # resolved against the page URL.
        if not src.lower().startswith(('http://', 'https://')):
            src = relative2absolute(self.url, src)

        # Check the image size; images that are too small are dropped.
        try:
            response = urlopen(src)
            try:
                payload = response.read()
            finally:
                # Release the connection even when the read fails.
                response.close()
        except IOError:
            # Best effort: an image that cannot be fetched is skipped.
            continue

        if len(payload) > MIN_IMG_SIZE:
            srcs.append(src)
            images.append(img)
    return (srcs, images)
def filter_images(self, imgs):
    """Select the images whose fetched payload is large enough to keep.

    Every element of ``imgs`` that has a ``src`` attribute is resolved to
    an absolute URL (relative values go through ``relative2absolute``
    with ``self.url``), fetched with ``urlopen``, and kept when the
    number of bytes read exceeds ``MIN_IMG_SIZE``.

    Args:
        imgs: iterable of image elements supporting ``has_key('src')``
            and ``['src']`` (presumably BeautifulSoup ``img`` tags --
            confirm against the caller).

    Returns:
        A ``(srcs, images)`` tuple of two parallel lists: the absolute
        URLs of the accepted images and the corresponding elements.
        The elements themselves are not modified.
    """
    srcs = []
    images = []
    for img in imgs:
        if not img.has_key('src'):
            continue

        src = img['src']
        # Both http and https URLs are already absolute; anything else is
        # resolved against the page URL.
        if not src.lower().startswith(('http://', 'https://')):
            src = relative2absolute(self.url, src)

        # Check the image size; images that are too small are dropped.
        try:
            response = urlopen(src)
            try:
                payload = response.read()
            finally:
                # Release the connection even when the read fails.
                response.close()
        except IOError:
            # Best effort: an image that cannot be fetched is skipped.
            continue

        if len(payload) > MIN_IMG_SIZE:
            srcs.append(src)
            images.append(img)
    return (srcs, images)
def insert_images(self, block, images):
    """Insert accepted images, line breaks and image captions into ``block``.

    Walks the node chain starting at ``self.title`` (following each
    node's ``next`` link) up to, but not including, the last entry of
    ``block.text_list()``. For every node visited:

    * an ``img`` tag contained in ``images`` gets a relative ``src``
      rewritten to an absolute URL and is inserted into ``block``;
    * a ``br`` tag is inserted into ``block``;
    * a tag whose name is in ``BLOCK_TAGS`` ends the "right after an
      image" state;
    * non-blank text already present in ``block.text_list()`` only
      advances the insertion index;
    * non-blank text that is not in the block is inserted only while
      the "right after an image" state is active.

    Args:
        block: content block exposing ``text_list()`` and
            ``insert(index, node)``.
        images: collection of accepted ``img`` tags.

    Returns:
        The same ``block`` object, after the insertions.
    """
    texts = block.text_list()
    if not texts:
        # No text means there is no end marker to walk towards.
        return block
    end = texts[-1]

    node = self.title
    behind_img = False
    index = 0  # position in block of the text currently being tracked
    # Stop at the end marker, or when the node chain runs out before
    # reaching it (the end marker is not guaranteed to follow the title).
    # NOTE(review): nodes are compared with != / in, i.e. by equality,
    # not identity -- confirm that is intended for repeated text.
    while node is not None and node != end:
        if isinstance(node, NavigableString):
            if node.string.strip():
                if node in block.text_list():
                    # Already part of the content block.
                    index += 1
                    behind_img = False
                elif behind_img:
                    # Not in the content block: sibling text that
                    # follows an image.
                    block.insert(index, node)
                    index += 1
        elif node.name == 'img' and node in images:
            src = node['src']
            # Both http and https URLs are already absolute.
            if not src.lower().startswith(('http://', 'https://')):
                node['src'] = relative2absolute(self.url, src)
            block.insert(index, node)
            index += 1
            behind_img = True
        elif node.name == 'br':
            # Keep the line break.
            block.insert(index, node)
            index += 1
        elif node.name in BLOCK_TAGS:
            behind_img = False
        node = node.next
    return block
def insert_images(self, block, images):
    """Insert accepted images, line breaks and image captions into ``block``.

    Walks the node chain starting at ``self.title`` (following each
    node's ``next`` link) up to, but not including, the last entry of
    ``block.text_list()``. For every node visited:

    * an ``img`` tag contained in ``images`` gets a relative ``src``
      rewritten to an absolute URL and is inserted into ``block``;
    * a ``br`` tag is inserted into ``block``;
    * a tag whose name is in ``BLOCK_TAGS`` ends the "right after an
      image" state;
    * non-blank text already present in ``block.text_list()`` only
      advances the insertion index;
    * non-blank text that is not in the block is inserted only while
      the "right after an image" state is active.

    Args:
        block: content block exposing ``text_list()`` and
            ``insert(index, node)``.
        images: collection of accepted ``img`` tags.

    Returns:
        The same ``block`` object, after the insertions.
    """
    texts = block.text_list()
    if not texts:
        # No text means there is no end marker to walk towards.
        return block
    end = texts[-1]

    node = self.title
    behind_img = False
    index = 0  # position in block of the text currently being tracked
    # Stop at the end marker, or when the node chain runs out before
    # reaching it (the end marker is not guaranteed to follow the title).
    # NOTE(review): nodes are compared with != / in, i.e. by equality,
    # not identity -- confirm that is intended for repeated text.
    while node is not None and node != end:
        if isinstance(node, NavigableString):
            if node.string.strip():
                if node in block.text_list():
                    # Already part of the content block.
                    index += 1
                    behind_img = False
                elif behind_img:
                    # Not in the content block: sibling text that
                    # follows an image.
                    block.insert(index, node)
                    index += 1
        elif node.name == 'img' and node in images:
            src = node['src']
            # Both http and https URLs are already absolute.
            if not src.lower().startswith(('http://', 'https://')):
                node['src'] = relative2absolute(self.url, src)
            block.insert(index, node)
            index += 1
            behind_img = True
        elif node.name == 'br':
            # Keep the line break.
            block.insert(index, node)
            index += 1
        elif node.name in BLOCK_TAGS:
            behind_img = False
        node = node.next
    return block