def get_course_list(search_page):
    "Grep through the document for a list of course ids."
    soup = BeautifulSoup(search_page)
    soup.__str__(encoding='utf-8')
    links = soup('a', href=COURSE_LIST_REGEX)
    courseids = [COURSE_LIST_REGEX.search(a['href']).group('id') for a in links]
    return courseids
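# Hypothetical context for get_course_list() above: COURSE_LIST_REGEX is not part of
# the snippet, but BeautifulSoup accepts a compiled regex as the href filter, and the
# .group('id') call implies a named group, so a pattern along these lines would fit.
import re
COURSE_LIST_REGEX = re.compile(r'/course/(?P<id>\d+)')  # assumed URL shape, not from the source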
def read_chapter(urlstr):
    ud = get_urlstr_dictionary(urlstr)
    url = ud['url'][0]
    if(cfg.DEBUG):
        xbmc.log('TRANSFORMED URLSTRING %s INTO %s' % (urlstr, url))
    index = int(ud['index'][0])
    if(cfg.DEBUG):
        xbmc.log('RETRIEVED INDEX %d' % index)
    if(cfg.DEBUG):
        xbmc.log('OPENING CHAPTER FROM URL: %s' % url)
    response = urllib2.urlopen(url)
    data = response.read()
    selectsoup = BeautifulSoup(data, parseOnlyThese=cfg.MANGA_PAGE_SELECT_STRAINERS[index])
    if(cfg.DEBUG):
        xbmc.log('##############################')
        xbmc.log(selectsoup.__str__())
    count = 1
    for option in selectsoup.contents[0].findAll('option'):
        opt_url = parse_link_url(url, option['value'], index)
        response = urllib2.urlopen(opt_url)
        data = response.read()
        imagesoup = BeautifulSoup(data, parseOnlyThese=cfg.MANGA_IMAGE_STRAINERS[index])
        if(cfg.DEBUG):
            xbmc.log('######################')
            xbmc.log(imagesoup.__str__())
        imageurl = imagesoup.contents[0]['src']
        addon_gui.add_image(imageurl, count)
        count = count + 1
    addon_gui.end_image_list()
def bootstrapify_form_input(value, label):
    html_str = str(value)
    body = Bs(html_str)
    if body.input is not None:
        input_tag = body.input
    elif body.select is not None:
        input_tag = body.select
    elif body.textarea is not None:
        input_tag = body.textarea
    else:
        raise BootstrapificationError(
            "This is not a form input in bs_form_input")
    if input_tag.has_key('value'):
        input_tag['value'] = conditional_escape(input_tag['value'])
    if input_tag.has_key('class'):
        input_tag['class'] = ' '.join([input_tag['class'], 'form-control'])
    else:
        input_tag['class'] = 'form-control'
    wrapper = Bs("<div class=\"form-group\"></div>")
    lab = Bs("<label>" + label + "</label>")
    lab.label['class'] = 'control-label'
    lab.label['for'] = conditional_escape(input_tag['id'])
    wrapper.div.append(lab)
    wrapper.div.append(input_tag)
    with_errors = bootstrapify_inline_errors_in_fg(
        mark_safe(wrapper.__str__()), value.errors)
    return mark_safe(with_errors)
def input_fuzzy_search():
    tag = "<input type=\"search\" class=\"fuzzy-search\" autofocus>" \
          "<i class=\"fa fa-fw fa-search\"></i>"
    wrapper = Bs("<div class=\"searchbar\"></div>")
    tag = Bs(tag)
    wrapper.div.append(tag)
    return wrapper.__str__()
def clean_blog_html(body):
    # Clean up the HTML
    import re
    import sys
    from BeautifulSoup import BeautifulSoup
    from cStringIO import StringIO

    # The post body is passed to stdin.
    soup = BeautifulSoup(body)

    # Remove the permalinks to each header since the blog does not have
    # the styles to hide them.
    links = soup.findAll('a', attrs={'class': "headerlink"})
    [l.extract() for l in links]

    # Get BeautifulSoup's version of the string
    s = soup.__str__(prettyPrint=False)

    # Remove extra newlines. This depends on the fact that code blocks are
    # passed through pygments, which wraps each part of the line in a span tag.
    pattern = re.compile(r'([^s][^p][^a][^n]>)\n$', re.DOTALL | re.IGNORECASE)
    s = ''.join(pattern.sub(r'\1', l) for l in StringIO(s))

    return s
def bootstrapify_wrap_in_half_column(value):
    html_str = str(value)
    body = Bs(html_str)
    div_wrapping = Bs("<div class=\"col-lg-6 col-md-6 col-sm-12\"></div>")
    div_wrapping.div.append(body)
    return mark_safe(div_wrapping.__str__())
def geo_term_extract(self, desc):
    data = values = {
        'maxRows': '1',
        'fuzzy': '1',
        'country': 'EE',
        'featureClass': 'P',
        'operator': 'OR',
        'username': self.geonames_user,
        'q': desc.encode('utf-8'),
    }
    data = urllib.urlencode(values)
    link = u"http://api.geonames.org/search"
    xmldata = urllib.urlopen(link, data)
    soup = BeautifulSoup(xmldata)
    # print soup.prettify()
    lng = '0'
    lat = '0'
    if len(soup.findAll("lat")) > 0:
        lng = soup.findAll("lng")[0].text
        lat = soup.findAll("lat")[0].text
        lat_f = float(lat)
        lng_f = float(lng)
        lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
        lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)
    soup2 = BeautifulSoup()
    tag1 = Tag(soup2, "Point")
    tag2 = Tag(soup2, "coordinates")
    soup2.insert(0, tag1)
    tag1.insert(0, tag2)
    text = NavigableString(lng + "," + lat)
    tag2.insert(0, text)
    # print soup2
    result = (soup2.__str__()).encode("utf-8")
    return [result, lat, lng]
def main():
    if len(sys.argv) != 4 or (sys.argv[1] != "--extended" and sys.argv[1] != "--sanity"):
        help()
        sys.exit(-1)
    directory = "/home/BU_testing/test-reports/%s/%s/" % (sys.argv[2], sys.argv[3])
    if sys.argv[1] == "--sanity":
        testReport_name = "%sBU_sanityTestReport_%s_%s.html" % (
            directory, sys.argv[2], sys.argv[3])
        testResult_name = "%ssanityTestResult_%s_%s.txt" % (
            directory, sys.argv[2], sys.argv[3])
        html_model = "model_IPA.html"
    else:
        testReport_name = "%sBU_extendedTestReport_%s_%s.html" % (
            directory, sys.argv[2], sys.argv[3])
        testResult_name = "%sextendedTestResult_%s_%s.txt" % (
            directory, sys.argv[2], sys.argv[3])
        html_model = "model_IPA.html"
    os.popen("mkdir %s" % directory)
    os.popen("cp -R /home/BU_testing/script/%s %s" % (html_model, testReport_name))
    handler = open(testReport_name, 'r')
    data = handler.read()
    handler.close()
    soup = BeautifulSoup(data)
    xmls = []
    for l in os.popen("ls %s" % directory).readlines():
        if re.search("\.xml", l):
            xmls.append(directory + l.strip("\n"))
    generate_testResult(testResult_name, *xmls)
    f_handler = open(testResult_name, 'r')
    for i in f_handler.readlines():
        update_testCase_result(i, soup)
    f_handler.close()
    add_statistic(soup)
    new_handler = open(testReport_name, 'w')
    new_handler.write(soup.__str__())
    new_handler.close()
    if sys.argv[1] == "--sanity":
        if testSuite_OK(soup, "BU call sanity"):
            print "Congratulation!!! Test report %s generated successfully" % testReport_name
            print "All BU sanity test cases pass"
            return True
        else:
            print "Congratulation!!! Test report %s generated successfully" % testReport_name
            print "Not all BU sanity test cases pass!!!!"
            return False
def parse_chapters(data, index, url):
    if(cfg.DEBUG):
        xbmc.log('PARSING CHAPTERS:')
    containersoup = BeautifulSoup(data, parseOnlyThese=cfg.MANGA_CHAPTER_CONTAINER_STRAINERS[index])
    for container in containersoup:
        linksoup = BeautifulSoup(container.__str__(), parseOnlyThese=cfg.MANGA_CHAPTER_STRAINERS[index])
        xbmc.log(linksoup.__str__())
        for result in linksoup:
            if(cfg.DEBUG):
                xbmc.log('FOUND CHAPTER: %s' % result)
            title = result.contents[0].strip()
            url = parse_link_url(url, result['href'], index)
            if(cfg.DEBUG):
                xbmc.log('\t TRANSFORMED CHAPTER INTO: %s (%s)' % (title, url))
            addon_gui.add_chapter(title, index, url)
    addon_gui.end_chapter_list()
def _parseHtml(self, div_id):
    soup = BeautifulSoup("".join(self.htmlsourse))
    self.data = str(soup.find("div", {"class": div_id}))
    # print self.data
    # print "------------"
    articleSoup = BeautifulSoup(self.data)
    self.title = articleSoup.findAll("div", {"class": "list-item2"})
    # print self.title
    # Parse the category list
    self.mapArticles = list()
    for title in self.title:
        # print self.title
        aMap = dict()
        aCategory = title.find("strong").getText()
        # print "category: " + aCategory
        aDate = title.find("i").getText()
        # print "date: " + aDate
        aNextCurl = title.find("a", {"class": "black"}).attrMap[u'href']
        # print "article URL: " + aNextCurl
        aMp3Url = title.find("a", href=re.compile("mp3$"))
        if not aMp3Url == None:
            aMp3Url = aMp3Url.attrMap[u'href']
            aMap[u'name'] = aMp3Url.split('/').pop()
        else:
            aMap[u'name'] = ""
        # print aMp3Url
        aMap[u'category'] = aCategory
        aMap[u'date'] = aDate
        aMap[u'downMp3Url'] = aMp3Url
        aMap[u'articleUrl'] = u"http://www.unsv.com" + aNextCurl
        print aMap[u'articleUrl']
        # Parse the article body
        nexturl = aMap[u'articleUrl']
        htmlsourse = urllib.urlopen(nexturl).read()
        nextSoup = BeautifulSoup(htmlsourse)
        aArticle = nextSoup.find("div", {"id": "articleFulltext"})
        # Strip the original img tag
        articleSoup = BeautifulSoup(aArticle.__str__())
        rmImgs = articleSoup.img
        if not rmImgs == None:
            rmImgs.extract()
        aMap[u'article'] = articleSoup.__str__()
        self.mapArticles.append(aMap)
def bootstrapify_h_form_input(value, label):
    html_str = bootstrapify_form_input(value, label)
    body = Bs(html_str)
    body.div.label['class'] = " ".join([body.div.label['class'], 'col-sm-4'])
    div_wrapping = Bs("<div class=\"col-sm-8\"></div>")
    if body.div.input is not None:
        input_tag = body.div.input.extract()
    elif body.div.select is not None:
        input_tag = body.div.select.extract()
        input_tag['class'] = "selectpicker"
        input_tag['data-live-search'] = "true"
    elif body.div.textarea is not None:
        input_tag = body.div.textarea.extract()
    else:
        raise BootstrapificationError(
            "Not a correct form group in bs_form_input_h")
    div_wrapping.div.append(input_tag)
    body.div.insert(1, div_wrapping)
    return mark_safe(body.__str__())
def bootstrapify_inline_errors_in_fg(value, errors):
    if errors is None:
        return value
    errors = str(errors)
    html_str = str(value)
    form_group = Bs(html_str)
    error_list = Bs(errors)
    if error_list.ul is None:
        return value
    # Autoescape each error text
    for li_error in error_list.ul.findAll('li'):
        span_wrap = Bs("<span class=\"element\"></span>")
        span_wrap.span.append(conditional_escape(li_error.text))
        li_error.clear()
        li_error.append(span_wrap)
    wrapper = Bs("<div class=\"form-error-list\"></div>")
    wrapper.div.append(error_list)
    form_group.div.append(wrapper)
    form_group.div['class'] = " ".join(
        [form_group.div['class'], 'group-with-errors'])
    return mark_safe(form_group.__str__())
def split_html(html_filename, split_at_level=0):
    """ Split aggregated and rendered HTML document at some <hX> tag(s).
        split_at_level=0 -> split at H1 tags,
        split_at_level=1 -> split at H1 and H2 tags.
        Returns a list of dicts with keys 'html' referring to the subdocument
        and 'level' indicating the split point.
    """
    destdir = os.path.dirname(html_filename)
    soup = BeautifulSoup(file(html_filename).read())
    fp = StringIO(soup.__str__(prettyPrint=True))

    docs = list()
    current_doc = list()
    for line in fp:
        line = line.rstrip()
        for level in range(split_at_level + 1):
            if '<h%d' % (level + 1) in line.lower():
                html = '\n'.join(current_doc)
                root = lxml.html.fromstring(unicode(html, 'utf-8'))
                title = u''
                h1_nodes = root.xpath('//h1')
                if h1_nodes:
                    title = h1_nodes[0].text_content().strip()

                # count tables and images
                number_tables = len(root.xpath('//table'))
                number_images = len(CSSSelector('div.image-caption')(root))

                # find all linkable nodes with an ID attribute
                node_ids = list()
                for node in root.xpath('.//*'):
                    node_id = node.get('id')
                    if node_id:
                        node_ids.append(node_id)

                html = lxml.html.tostring(root, encoding=unicode)
                docs.append(
                    dict(html=html,
                         level=level,
                         title=title,
                         node_ids=node_ids,
                         number_images=number_images,
                         number_tables=number_tables))
                current_doc = []
                break
        current_doc.append(line)

    # now deal with the remaining part of the document
    html = '\n'.join(current_doc)
    root = lxml.html.fromstring(unicode(html, 'utf-8'))
    title = u''
    h1_nodes = root.xpath('//h1')
    if h1_nodes:
        title = h1_nodes[0].text_content().strip()

    # count tables and images
    number_tables = len(root.xpath('//table'))
    number_images = len(CSSSelector('div.image-caption')(root))

    # find all linkable nodes with an ID attribute
    node_ids = list()
    for node in root.xpath('.//*'):
        node_id = node.get('id')
        if node_id:
            node_ids.append(node_id)

    html = lxml.html.tostring(root, encoding=unicode)
    docs.append(
        dict(html=html,
             level=0,
             title=title,
             node_ids=node_ids,
             number_images=number_images,
             number_tables=number_tables))

    # now store files on the filesystem
    ini_filename = os.path.join(destdir, 'documents.ini')
    fp_ini = codecs.open(ini_filename, 'w', 'utf-8')
    for count, d in enumerate(docs[1:]):
        filename = os.path.join(
            destdir, 'split-0/%d-level-%d.html' % (count, d['level']))
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        file(filename, 'w').write(d['html'].encode('utf-8'))
        print >> fp_ini, '[%d]' % count
        print >> fp_ini, 'filename = %s' % filename
        print >> fp_ini, 'title = %s' % d['title']
        print >> fp_ini, 'number_tables = %d' % d['number_tables']
        print >> fp_ini, 'number_images = %d' % d['number_images']
        print >> fp_ini, 'node_ids = '
        for node_id in d['node_ids']:
            print >> fp_ini, '    ' + node_id
        print >> fp_ini
    fp_ini.close()
    return docs[1:]
def html2plaintext(html, body_id=None, encoding='ascii'):
    """Convert the HTML to plain text"""
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    elif html.count('<body'):
        strainer = SoupStrainer('body')
    else:
        strainer = None

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = unicode(link.renderContents(), encoding)
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(url=url,
                             tag=unicode(str(link), encoding),
                             title=title))

    try:
        html = soup.renderContents(encoding=encoding)
    except AttributeError:
        html = soup.__str__(encoding)
    if isinstance(html, str) and encoding != 'ascii':
        html = unicode(html, encoding)

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or u'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], u'%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace(' ', ' ')
    html = html.replace('</p>', '\n\n')
    html = html.replace('</tr>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    def desperate_fixer(g):
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    html = u'\n'.join([x.lstrip() for x in html.splitlines()])  # lstrip lines

    for i, url in enumerate(url_index):
        if i == 0:
            html += u'\n\n'
        html += u'[%s] %s\n' % (i + 1, url)

    html = unescape(html)
    return html
from twill import get_browser
from twill.commands import *
from BeautifulSoup import BeautifulSoup
import html2text

b = get_browser()
b.set_agent_string('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)')
go("http://ddahl.com")
html = b.get_html()
soup = BeautifulSoup(html)
h, txt = html2text.html2text(soup.__str__())
txt
class HtmlProcessor:
    WHITESPACE_RE = re.compile(r'\s')
    # Look for </blockquote <p>
    BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)

    def __init__(self, html, unfill=0):
        self.unfill = unfill
        html = self._ProcessRawHtml(html)
        self._soup = BeautifulSoup(html)
        if self._soup.title.contents:
            self.title = self._soup.title.contents[0]
        else:
            self.title = None

    def _ProcessRawHtml(self, html):
        new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
        if count:
            print >>sys.stderr, 'Replaced %d bad tags' % count
        return new_html

    def _StubInternalAnchors(self):
        '''Replace each internal anchor with a fixed-size filepos anchor.

        Looks for every anchor with <a href="#myanchor"> and replaces that
        with <a filepos="00000000050">. Stores anchors in
        self._anchor_references'''
        self._anchor_references = []
        anchor_num = 0
        # anchor links
        anchorlist = self._soup.findAll('a', href=re.compile('^#'))
        # treat reference tags like a tags for TOCTOP.
        anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
        for anchor in anchorlist:
            self._anchor_references.append((anchor_num, anchor['href']))
            del anchor['href']
            anchor['filepos'] = '%.10d' % anchor_num
            anchor_num += 1

    def _ReplaceAnchorStubs(self):
        # TODO: Browsers allow extra whitespace in the href names.
        # use __str__ instead of prettify--it inserts extra spaces.
        assembled_text = self._soup.__str__('utf8')
        del self._soup  # shouldn't touch this anymore
        for anchor_num, original_ref in self._anchor_references:
            ref = urllib.unquote(original_ref[1:])  # remove leading '#'
            # Find the position of ref in the utf-8 document.
            # TODO(chatham): Using regexes and looking for name= would be better.
            newpos = assembled_text.rfind(ref.encode('utf-8'))
            if newpos == -1:
                print >>sys.stderr, 'Could not find anchor "%s"' % original_ref
                continue
            newpos += len(ref) + 2  # don't point into the middle of the <a name> tag
            old_filepos = 'filepos="%.10d"' % anchor_num
            new_filepos = 'filepos="%.10d"' % newpos
            assert assembled_text.find(old_filepos) != -1
            assembled_text = assembled_text.replace(old_filepos, new_filepos, 1)
        return assembled_text

    def _FixPreTags(self):
        '''Replace <pre> tags with HTML-ified text.'''
        pres = self._soup.findAll('pre')
        for pre in pres:
            pre.replaceWith(self._FixPreContents(str(pre.contents[0])))

    def _FixPreContents(self, text):
        if self.unfill:
            line_splitter = '\n\n'
            line_joiner = '<p>'
        else:
            line_splitter = '\n'
            line_joiner = '<br>'
        lines = []
        for line in text.split(line_splitter):
            lines.append(self.WHITESPACE_RE.subn(' ', line)[0])
        return line_joiner.join(lines)

    def _RemoveUnsupported(self):
        '''Remove any tags which the kindle cannot handle.'''
        # TODO(chatham): <link> tags to script?
        unsupported_tags = ('script', 'style')
        for tag_type in unsupported_tags:
            for element in self._soup.findAll(tag_type):
                element.extract()

    def RenameAnchors(self, prefix):
        '''Rename every internal anchor to have the given prefix, then
        return the contents of the body tag.'''
        for anchor in self._soup.findAll('a', href=re.compile('^#')):
            anchor['href'] = '#' + prefix + anchor['href'][1:]
        for a in self._soup.findAll('a'):
            if a.get('name'):
                a['name'] = prefix + a['name']

        # TODO(chatham): figure out how to fix this. sometimes body comes out
        # as NoneType.
        content = []
        if self._soup.body is not None:
            content = [unicode(c) for c in self._soup.body.contents]
        return '\n'.join(content)

    def CleanHtml(self):
        # TODO(chatham): fix_html_br, fix_html
        self._RemoveUnsupported()
        self._StubInternalAnchors()
        self._FixPreTags()
        return self._ReplaceAnchorStubs()
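# A minimal usage sketch for the HtmlProcessor class above (assuming the module already
# imports re, sys, urllib and BeautifulSoup 3.x): CleanHtml() drops <script>/<style>
# tags and rewrites internal "#anchor" hrefs into fixed-width filepos attributes.
doc = ('<html><head><title>Sample</title><style>p {}</style></head>'
       '<body><a href="#end">jump</a><p>text</p><a name="end">last</a></body></html>')
processor = HtmlProcessor(doc)
print processor.title        # -> u'Sample'
print processor.CleanHtml()  # <style> removed, href="#end" replaced by a filepos="..." stub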
def html2plaintext(html, body_id=None, encoding='ascii'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer('body')

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(url=url, tag=str(link), title=title))

    html = soup.__str__(encoding)

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')

    # the only line breaks we respect are those of closing tags and
    # explicit breaks
    #html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    # for all other tags we failed to clean up, just remove them and
    # complain about them on stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += '[%s] %s\n' % (i + 1, url)

    html = unescape(html)
    return html
def html2plaintext(html, body_id=None, encoding='ascii'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer('body')

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(url=url, tag=str(link), title=title))

    html = soup.__str__(encoding)

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')

    # the only line breaks we respect are those of closing tags and
    # explicit breaks
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    # for all other tags we failed to clean up, just remove them and
    # complain about them on stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += '[%s] %s\n' % (i + 1, url)

    html = unescape(html)
    return html
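# A rough usage sketch for the html2plaintext() variants above (assuming re, SoupStrainer,
# BeautifulSoup and an unescape() helper exist in the module): links are folded into
# "title [n]" markers and the collected URLs are appended as an index at the end.
sample = '<body><h1>Title</h1><p>See <a href="http://example.com/doc">the docs</a>.</p></body>'
print html2plaintext(sample)
# prints roughly:  **Title** See the docs [1].
#                  [1] http://example.com/doc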
def html2plaintext(html, body_id=None, encoding='utf8', width=80):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    if encoding == 'utf8':
        from django.utils.safestring import SafeUnicode
        html = SafeUnicode(html)
        from django.utils.encoding import force_unicode
        html = force_unicode(html)
        html = html.encode('ascii', 'xmlcharrefreplace')

    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer('body')

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(url=str(url), tag=str(link), title=str(title)))

    html = soup.__str__(encoding)

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        elif d['url'].startswith('#'):
            # don't show anchor content
            html = html.replace(d['tag'], '')
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    #html = html.replace('<strong>','*').replace('</strong>','*')
    #html = html.replace('<b>','*').replace('</b>','*')
    #html = html.replace('<h3>','*').replace('</h3>','*')
    #html = html.replace('<h2>','**').replace('</h2>','**')
    #html = html.replace('<h1>','**').replace('</h1>','**')
    #html = html.replace('<em>','/').replace('</em>','/')

    # the only line breaks we respect are those of closing tags and
    # explicit breaks
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace('</tr>', '\n')
    #html = html.replace('</table>', '\n\n')
    html = html.replace(' ' * 2, ' ')

    # for all other tags we failed to clean up, just remove them and
    # complain about them on stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += '[%s] %s\n' % (i + 1, url)

    html = unescape(html)

    # reduce consecutive empty lines to one
    pat = re.compile(r'(\n\s*\n)+', re.M)
    html = pat.sub('\n\n', html)

    # wrap long lines
    #html = word_wrap(html, width)
    # Use the python TextWrapper instead of the builtin function
    wrapper = TextWrapper(width=80)
    html = wrapper.fill(html)

    return html