def get_onepage_poclist(page):
    '''
    Fetch one listing page from beebeeto.com/pdb and collect its PoC links.

    page -- page number; it is appended to the URL as ``?page=<page>``.

    Returns a list of the ``href`` values of every ``<a href=...>`` tag inside
    the ``<div class="mainlist">`` element whose markup contains ``"poc-"``.
    Returns the empty string ``''`` when there is nothing to report: the
    download came back empty, the highlighted pager link reads ``1`` although
    a page other than 1 was requested, or the ``mainlist`` container is
    missing or empty.
    '''
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''

    bt = BeautifulSoup(info)

    # The anchor carrying this exact inline style is presumably the "current
    # page" marker of the pager -- TODO confirm against the live markup.
    end = bt.find(
        'a', {
            'style':
            "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"
        })
    # find() returns None when the anchor is absent; only inspect it when it
    # exists so a page without that marker does not raise AttributeError.
    if end is not None and '1' == end.renderContents() and page != 1:
        return ''

    ret = bt.find('div', {'class': 'mainlist'})
    if ret is None:
        # No listing container on this page: report "nothing" the same way
        # the other early exits do instead of failing on None.
        return ''
    ret = ret.renderContents()
    if ret == "":
        return ""

    retlist = []
    for one in re.findall('<a href=.*?>', ret):
        if "poc-" in one:
            # Peel the surrounding tag syntax off, leaving the bare href.
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            retlist.append(one.strip())
    return retlist
def strip_html_and_tags(s, invalid_tags):
    '''
    Return the text of the HTML string "s" with all markup removed.

    Every element whose tag name is listed in "invalid_tags" is dropped
    together with its content. The <br>, <br/> and <br /> spellings are
    turned into newlines before parsing. A falsy "s" is returned unchanged.
    '''
    if not s:
        return s

    from util.BeautifulSoup import BeautifulSoup

    # Line breaks become real newlines so they survive the markup removal.
    normalized = s
    for line_break in ('<br>', '<br/>', '<br />'):
        normalized = normalized.replace(line_break, '\n')
    soup = BeautifulSoup(normalized)

    # Replace each unwanted element (and therefore its subtree) with nothing.
    for tag_name in invalid_tags:
        for unwanted in soup.findAll(name=tag_name):
            unwanted.replaceWith("")

    # Whatever is left as unicode nodes is the visible text, in document order.
    text_nodes = [
        node for node in soup.recursiveChildGenerator()
        if isinstance(node, unicode)
    ]
    return ''.join(text_nodes)
def getPoc(poc):
    '''
    Download the page of a single PoC from beebeeto.com and return its code.

    poc -- the PoC identifier; it is inserted into the URL
           ``http://beebeeto.com/pdb/<poc>/``.

    Returns the content of the ``<pre class="brush: python;">`` element after
    passing it through strip_tags(). Returns the empty string ``''`` when the
    download is empty, when the page contains one of the two markers checked
    below, when the ``<pre>`` element is missing or empty, or when parsing or
    tag stripping raises an ordinary exception.
    '''
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Pages with this placeholder image or with the literal below are skipped;
    # presumably they mark PoCs whose code is not shown publicly -- TODO
    # confirm. (The literal reads roughly "view now".)
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    if "立即查看" in info:
        return ''

    try:
        bt = BeautifulSoup(info)
        ret = bt.find('pre', {'class': "brush: python;"})
        if ret is None:
            # No code block on the page.
            return ''
        ret = ret.renderContents()
        if ret:
            return strip_tags(ret)
        return ''
    except Exception:
        # Stay best-effort for parsing problems, but do not trap
        # KeyboardInterrupt / SystemExit the way a bare "except:" would.
        return ''
def strip_html2(s):
    '''
    Reduce an HTML string to its text using the BeautifulSoup library.

    Text nodes are kept in document order and every <br> element contributes
    a newline; all other markup is discarded. A falsy "s" is returned as is.

    >>> strip_html2('<html><body><b>Some <i>ugly</i></b> html.</body></html>')
    u'Some ugly html.'
    '''
    if not s:
        return s

    from util.BeautifulSoup import BeautifulSoup

    def _text_pieces(tree):
        # Walk every descendant: emit text nodes verbatim, <br> as newline.
        for node in tree.recursiveChildGenerator():
            if isinstance(node, unicode):
                yield node
            elif node.name == 'br':
                yield '\n'

    return ''.join(_text_pieces(BeautifulSoup(s)))
def odc_body_received(self, data):
    '''
    Handle the body of a received ODC message and pass it to the conversation.

    data -- the raw message body. When it contains a ``<BINARY>`` marker, the
            part before the marker is parsed as HTML and the part from the
            marker onwards is searched for the payload of each ``<img>`` tag.

    For every ``<img>`` in the HTML body the payload is looked up by the tag's
    ``id`` and ``datasize`` attributes and written to a file in
    ``stdpaths.userdata``; the tag's ``src`` is pointed at that file and its
    ``width`` / ``height`` attributes are removed. The resulting HTML (or the
    untouched ``data`` when there is no ``<BINARY>`` marker) is handed to
    ``self.convo.incoming_message`` and the socket is then told to read the
    next ``ODCHeader``.
    '''
    info('odc_body_received')

    # Get a place to store the images.
    import stdpaths
    assetdir = stdpaths.userdata

    # Did the message include an inline image?
    if '<BINARY>' in data:
        j = data.find('<BINARY>')

        # Parse the HTML _before_ <BINARY>
        soup = BeautifulSoup(data[:j])
        imgdata = data[j:]

        for img in soup.html.body('img'):  # may have more than one <img>
            # For each <IMG> tag, locate the end of the header that announces
            # its payload in the binary section.
            datasize = int(img['datasize'])
            findme = ' ID="%s" SIZE="%s">' % (str(img['id']),
                                                str(img['datasize']))
            i = imgdata.find(findme)

            # os.path.split the img src, because some clients send their
            # full paths. (file:///c:/blah.jpg)
            imgpath = os.path.join(assetdir, os.path.split(img['src'])[1])
            img['src'] = imgpath
            del img['width']
            del img['height']

            if i == -1:
                # No payload announced for this tag; writing a slice computed
                # from -1 would only produce a garbage file.
                continue

            # The payload starts right after the header and is exactly
            # "datasize" bytes long. Measuring the end from the payload start
            # keeps this correct for any header length (any number of digits
            # in ID / SIZE) and for images that are not the first one.
            start = i + len(findme)
            imgbytes = imgdata[start:start + datasize]

            with open(imgpath, 'wb') as f:
                f.write(imgbytes)

        msg = unicode(soup.html)
    else:
        msg = data

    self.convo.incoming_message(self.screenname, msg)
    self.socket.receive_next(ODCHeader, self.odc_header_received)