Example #1
def get_course_list(search_page):
  "Grep through the document for a list of course ids."
  soup = BeautifulSoup(search_page)
  soup.__str__(encoding='utf-8')
  links = soup('a', href=COURSE_LIST_REGEX)
  courseids = [COURSE_LIST_REGEX.search(a['href']).group('id') for a in links]
  return courseids
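A minimal driver for the helper above might look like the sketch below; the search URL and the COURSE_LIST_REGEX pattern are assumptions for illustration, since the snippet does not define them (it only requires a regex with a named group 'id').

import re
import urllib2
from BeautifulSoup import BeautifulSoup

# Hypothetical pattern: pull the id out of hrefs such as /course/view.php?id=123
COURSE_LIST_REGEX = re.compile(r'/course/view\.php\?id=(?P<id>\d+)')

search_page = urllib2.urlopen('http://example.com/course/search.php').read()
print get_course_list(search_page)  # e.g. ['123', '456']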
def read_chapter(urlstr):
    ud = get_urlstr_dictionary(urlstr)
    url = ud['url'][0]
    if(cfg.DEBUG):
        xbmc.log('TRANSFORMED URLSTRING %s INTO %s' % (urlstr, url))
    index = int(ud['index'][0])
    if(cfg.DEBUG):
        xbmc.log('RETRIEVED INDEX %d' % index)
    if(cfg.DEBUG):
        xbmc.log('OPENING CHAPTER FROM URL: %s' % url)
    response = urllib2.urlopen(url)
    data = response.read()
    selectsoup = BeautifulSoup(data, parseOnlyThese=cfg.MANGA_PAGE_SELECT_STRAINERS[index])
    if(cfg.DEBUG):
        xbmc.log('##############################')
        xbmc.log(selectsoup.__str__())
    count = 1
    for option in selectsoup.contents[0].findAll('option'):
        opt_url = parse_link_url(url, option['value'], index)
        response = urllib2.urlopen(opt_url)
        data = response.read()
        imagesoup = BeautifulSoup(data, parseOnlyThese=cfg.MANGA_IMAGE_STRAINERS[index])
        if(cfg.DEBUG):
            xbmc.log('######################')
            xbmc.log(imagesoup.__str__())
        imageurl = imagesoup.contents[0]['src']
        addon_gui.add_image(imageurl, count)
        count = count+1
    addon_gui.end_image_list()
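The snippet leans on a get_urlstr_dictionary helper that is not shown; judging from how ud['url'][0] and ud['index'][0] are indexed, it presumably behaves like parse_qs over the plugin URL's query string. A minimal sketch under that assumption:

import urlparse

def get_urlstr_dictionary(urlstr):
    # Hypothetical stand-in for the helper used above: the ud['url'][0] and
    # ud['index'][0] lookups suggest a parse_qs-style mapping of the query
    # string to lists of values.
    return urlparse.parse_qs(urlparse.urlparse(urlstr).query)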
Example #3
def bootstrapify_form_input(value, label):
    html_str = str(value)
    body = Bs(html_str)

    if body.input is not None:
        input_tag = body.input
    elif body.select is not None:
        input_tag = body.select
    elif body.textarea is not None:
        input_tag = body.textarea
    else:
        raise BootstrapificationError(
            "This is not a form input in bs_form_input")

    if input_tag.has_key('value'):
        input_tag['value'] = conditional_escape(input_tag['value'])

    if input_tag.has_key('class'):
        input_tag['class'] = ' '.join([input_tag['class'], 'form-control'])
    else:
        input_tag['class'] = 'form-control'

    wrapper = Bs("<div class=\"form-group\"></div>")
    lab = Bs("<label>" + label + "</label>")
    lab.label['class'] = 'control-label'
    lab.label['for'] = conditional_escape(input_tag['id'])
    wrapper.div.append(lab)
    wrapper.div.append(input_tag)

    with_errors = bootstrapify_inline_errors_in_fg(
        mark_safe(wrapper.__str__()), value.errors)

    return mark_safe(with_errors)
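Helpers like this are normally exposed to templates through a Django filter. The name bs_form_input echoes the error messages above, but the registration below is an assumption about how the surrounding templatetags module might look, not part of the original snippet.

from django import template

register = template.Library()

# Hypothetical registration: lets templates write {{ field|bs_form_input:"Label" }}
@register.filter(name='bs_form_input')
def bs_form_input(value, label):
    return bootstrapify_form_input(value, label)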
Example #4
def input_fuzzy_search():
    tag = "<input type=\"search\" class=\"fuzzy-search\" autofocus>" \
          "<i class =\"fa fa-fw fa-search\"></i>"
    wrapper = Bs("<div class=\"searchbar\"></div>")
    tag = Bs(tag)
    wrapper.div.append(tag)
    return wrapper.__str__()
Example #5
def clean_blog_html(body):
    # Clean up the HTML
    import re
    import sys
    from BeautifulSoup import BeautifulSoup
    from cStringIO import StringIO

    # The post body is passed to stdin.
    soup = BeautifulSoup(body)

    # Remove the permalinks to each header since the blog does not have
    # the styles to hide them.
    links = soup.findAll('a', attrs={'class':"headerlink"})
    [l.extract() for l in links]

    # Get BeautifulSoup's version of the string
    s = soup.__str__(prettyPrint=False)

    # Remove extra newlines.  This depends on the fact that
    # code blocks are passed through pygments, which wraps each part of the line
    # in a span tag.
    # A line that does not end in "span>" gets its trailing newline stripped.
    pattern = re.compile(r'(?<!span>)\n$', re.IGNORECASE)
    s = ''.join(pattern.sub('', l) for l in StringIO(s))
    
    return s
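Consistent with the comment that the post body arrives on stdin, a small wrapper script is presumably how this gets invoked; the harness below is an assumption, not part of the original.

import sys

if __name__ == '__main__':
    # Hypothetical invocation: python clean_blog_html.py < rendered_post.html > cleaned.html
    sys.stdout.write(clean_blog_html(sys.stdin.read()))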
Example #6
def bootstrapify_wrap_in_half_column(value):
    html_str = str(value)
    body = Bs(html_str)
    div_wrapping = Bs("<div class=\"col-lg-6 col-md-6 col-sm-12\"></div>")
    div_wrapping.div.append(body)

    return mark_safe(div_wrapping.__str__())
    def geo_term_extract(self, desc):
        data = values = {
            'maxRows': '1',
            'fuzzy': '1',
            'country': 'EE',
            'featureClass': 'P',
            'operator': 'OR',
            'username': self.geonames_user,
            'q': desc.encode('utf-8')}
        data = urllib.urlencode(values)

        link = u"http://api.geonames.org/search"
        xmldata = urllib.urlopen(link, data)
        soup = BeautifulSoup(xmldata)
        #   print soup.prettify()
        lng = '0'
        lat = '0'
        if len(soup.findAll("lat")) > 0:
            lng = soup.findAll("lng")[0].text
            lat = soup.findAll("lat")[0].text
            lat_f = float(lat)
            lng_f = float(lng)
            lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
            lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)

        soup2 = BeautifulSoup()
        tag1 = Tag(soup2, "Point")
        tag2 = Tag(soup2, "coordinates")
        soup2.insert(0, tag1)
        tag1.insert(0, tag2)
        text = NavigableString(lng + "," + lat)
        tag2.insert(0, text)
        #   print soup2
        result = (soup2.__str__()).encode("utf-8")
        return [result, lat, lng]
Example #8
def main():
    if len(sys.argv) != 4 or (sys.argv[1] != "--extended"
                              and sys.argv[1] != "--sanity"):
        help()
        sys.exit(-1)

    directory = "/home/BU_testing/test-reports/%s/%s/" % (sys.argv[2],
                                                          sys.argv[3])

    if sys.argv[1] == "--sanity":
        testReport_name = "%sBU_sanityTestReport_%s_%s.html" % (
            directory, sys.argv[2], sys.argv[3])
        testResult_name = "%ssanityTestResult_%s_%s.txt" % (
            directory, sys.argv[2], sys.argv[3])
        html_model = "model_IPA.html"
    else:
        testReport_name = "%sBU_extendedTestReport_%s_%s.html" % (
            directory, sys.argv[2], sys.argv[3])
        testResult_name = "%sextendedTestResult_%s_%s.txt" % (
            directory, sys.argv[2], sys.argv[3])
        html_model = "model_IPA.html"

    os.popen("mkdir %s" % directory)
    os.popen("cp -R /home/BU_testing/script/%s %s" %
             (html_model, testReport_name))
    handler = open(testReport_name, 'r')
    data = handler.read()
    handler.close()
    soup = BeautifulSoup(data)

    xmls = []
    for l in os.popen("ls %s" % directory).readlines():
        if re.search("\.xml", l):
            xmls.append(directory + l.strip("\n"))

    generate_testResult(testResult_name, *xmls)
    f_handler = open(testResult_name, 'r')
    for i in f_handler.readlines():
        update_testCase_result(i, soup)
    f_handler.close()

    add_statistic(soup)
    new_handler = open(testReport_name, 'w')
    new_handler.write(soup.__str__())
    new_handler.close()
    if sys.argv[1] == "--sanity":
        if testSuite_OK(soup, "BU call sanity"):
            print "Congratulation!!! Test report %s generated successfully" % testReport_name
            print "All BU sanity test cases pass"
            return True
        else:
            print "Congratulation!!! Test report %s generated successfully" % testReport_name
            print "Not all BU sanity test cases pass!!!!"
            return False
def parse_chapters(data, index, url):
    if(cfg.DEBUG):
        xbmc.log('PARSING CHAPTERS:')
    containersoup = BeautifulSoup(data, parseOnlyThese=cfg.MANGA_CHAPTER_CONTAINER_STRAINERS[index])
    for container in containersoup:
        linksoup = BeautifulSoup(container.__str__(), parseOnlyThese=cfg.MANGA_CHAPTER_STRAINERS[index])
        xbmc.log(linksoup.__str__())
        for result in linksoup:
            if(cfg.DEBUG):
                xbmc.log('FOUND CHAPTER: %s' % result)
            title = result.contents[0].strip()
            url = parse_link_url(url, result['href'], index)
            if(cfg.DEBUG):
                xbmc.log('\t TRANSFORMED CHAPTER INTO: %s (%s)' % (title, url))
            addon_gui.add_chapter(title, index, url)
    addon_gui.end_chapter_list()
Example #10
    def _parseHtml(self, div_id):
        soup = BeautifulSoup("".join(self.htmlsourse))
        self.data = str(soup.find("div", {"class":div_id}))
#       print self.data
#       print "------------"
        
        articleSoup = BeautifulSoup(self.data)
        self.title = articleSoup.findAll("div", {"class":"list-item2"})
        # print self.title
        # Parse the article categories
        self.mapArticles=list()
        for title in self.title:
            # print self.title
            aMap = dict()
            aCategory = title.find("strong").getText()
            # print "Category: " + aCategory
            aDate = title.find("i").getText()
            # print "Date: " + aDate
            aNextCurl = title.find("a", {"class":"black"}).attrMap[u'href']
            # print "Article URL: " + aNextCurl
            aMp3Url = title.find("a", href=re.compile("mp3$"))
            if aMp3Url is not None:
                aMp3Url = aMp3Url.attrMap[u'href']
                aMap[u'name'] = aMp3Url.split('/').pop()
            else:
                aMap[u'name'] = ""
            #print aMp3Url
            aMap[u'category'] = aCategory
            aMap[u'date'] = aDate
            aMap[u'downMp3Url'] = aMp3Url
            aMap[u'articleUrl'] = u"http://www.unsv.com"+aNextCurl
            print aMap[u'articleUrl']
            
            # Parse the article body
            nexturl = aMap[u'articleUrl']
            htmlsourse = urllib.urlopen(nexturl).read()
            nextSoup = BeautifulSoup(htmlsourse)
            aArticle = nextSoup.find("div", {"id":"articleFulltext"})
            
            # Strip the original <img> tag out of the article
            articleSoup = BeautifulSoup(aArticle.__str__())
            rmImgs = articleSoup.img
            if rmImgs is not None:
                rmImgs.extract()
            aMap[u'article'] = articleSoup.__str__()
            
            self.mapArticles.append(aMap)
Example #11
def bootstrapify_h_form_input(value, label):
    html_str = bootstrapify_form_input(value, label)
    body = Bs(html_str)
    body.div.label['class'] = " ".join([body.div.label['class'], 'col-sm-4'])
    div_wrapping = Bs("<div class=\"col-sm-8\"></div>")
    if body.div.input is not None:
        input_tag = body.div.input.extract()
    elif body.div.select is not None:
        input_tag = body.div.select.extract()
        input_tag['class'] = "selectpicker"
        input_tag['data-live-search'] = "true"
    elif body.div.textarea is not None:
        input_tag = body.div.textarea.extract()
    else:
        raise BootstrapificationError(
            "Not a correct form group in bs_form_input_h")

    div_wrapping.div.append(input_tag)
    body.div.insert(1, div_wrapping)

    return mark_safe(body.__str__())
    def geo_term_extract(self, desc):
        data = values = {
            'maxRows': '1',
            'fuzzy': '1',
            'country': 'EE',
            'featureClass': 'P',
            'operator': 'OR',
            'username': self.geonames_user,
            'q': desc.encode('utf-8')
        }
        data = urllib.urlencode(values)

        link = u"http://api.geonames.org/search"
        xmldata = urllib.urlopen(link, data)
        soup = BeautifulSoup(xmldata)
        #   print soup.prettify()
        lng = '0'
        lat = '0'
        if len(soup.findAll("lat")) > 0:
            lng = soup.findAll("lng")[0].text
            lat = soup.findAll("lat")[0].text
            lat_f = float(lat)
            lng_f = float(lng)
            lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
            lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)

        soup2 = BeautifulSoup()
        tag1 = Tag(soup2, "Point")
        tag2 = Tag(soup2, "coordinates")
        soup2.insert(0, tag1)
        tag1.insert(0, tag2)
        text = NavigableString(lng + "," + lat)
        tag2.insert(0, text)
        #   print soup2
        result = (soup2.__str__()).encode("utf-8")
        return [result, lat, lng]
Example #13
def bootstrapify_inline_errors_in_fg(value, errors):
    if errors is None:
        return value
    errors = str(errors)
    html_str = str(value)
    form_group = Bs(html_str)
    error_list = Bs(errors)
    if error_list.ul is None:
        return value

    # Autoescape each error text
    for li_error in error_list.ul.findAll('li'):
        span_wrap = Bs("<span class=\"element\"></span>")
        span_wrap.span.append(conditional_escape(li_error.text))
        li_error.clear()
        li_error.append(span_wrap)

    wrapper = Bs("<div class=\"form-error-list\"></div>")
    wrapper.div.append(error_list)
    form_group.div.append(wrapper)
    form_group.div['class'] = " ".join(
        [form_group.div['class'], 'group-with-errors'])

    return mark_safe(form_group.__str__())
def split_html(html_filename, split_at_level=0):
    """ Split aggregated and rendered HTML document at
        some <hX> tag(s). split_at_level=0 -> split at
        H1 tags, split_at_level=1 -> split at H1 and H2
        tags.
        Returns a list of dicts with keys 'html' referring
        to the subdocument and 'level' indicating the split
        point.
    """

    destdir = os.path.dirname(html_filename)
    soup = BeautifulSoup(file(html_filename).read())
    fp = StringIO(soup.__str__(prettyPrint=True))
    docs = list()
    current_doc = list()
    for line in fp:
        line = line.rstrip()
        for level in range(split_at_level + 1):
            if '<h%d' % (level + 1) in line.lower():
                html = '\n'.join(current_doc)
                root = lxml.html.fromstring(unicode(html, 'utf-8'))
                title = u''
                h1_nodes = root.xpath('//h1')
                if h1_nodes:
                    title = h1_nodes[0].text_content().strip()

                # count tables and images
                number_tables = len(root.xpath('//table'))
                number_images = len(CSSSelector('div.image-caption')(root))

                # find all linkable nodes with an ID attribute
                node_ids = list()
                for node in root.xpath('.//*'):
                    node_id = node.get('id')
                    if node_id:
                        node_ids.append(node_id)

                html = lxml.html.tostring(root, encoding=unicode)
                docs.append(
                    dict(html=html,
                         level=level,
                         title=title,
                         node_ids=node_ids,
                         number_images=number_images,
                         number_tables=number_tables))
                current_doc = []
                break

        current_doc.append(line)

    # now deal with the remaining part of the document
    html = '\n'.join(current_doc)
    root = lxml.html.fromstring(unicode(html, 'utf-8'))
    title = u''
    h1_nodes = root.xpath('//h1')
    if h1_nodes:
        title = h1_nodes[0].text_content().strip()

    # count tables and images
    number_tables = len(root.xpath('//table'))
    number_images = len(CSSSelector('div.image-caption')(root))

    # find all linkable nodes with an ID attribute
    node_ids = list()
    for node in root.xpath('.//*'):
        node_id = node.get('id')
        if node_id:
            node_ids.append(node_id)

    html = lxml.html.tostring(root, encoding=unicode)
    docs.append(
        dict(html=html,
             level=0,
             title=title,
             node_ids=node_ids,
             number_images=number_images,
             number_tables=number_tables))

    # now store files on the filesystem
    ini_filename = os.path.join(destdir, 'documents.ini')
    fp_ini = codecs.open(ini_filename, 'w', 'utf-8')

    for count, d in enumerate(docs[1:]):
        filename = os.path.join(
            destdir, 'split-0/%d-level-%d.html' % (count, d['level']))
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        file(filename, 'w').write(d['html'].encode('utf-8'))

        print >> fp_ini, '[%d]' % count
        print >> fp_ini, 'filename = %s' % filename
        print >> fp_ini, 'title = %s' % d['title']
        print >> fp_ini, 'number_tables= %d' % d['number_tables']
        print >> fp_ini, 'number_images = %d' % d['number_images']
        print >> fp_ini, 'node_ids = '
        for node_id in d['node_ids']:
            print >> fp_ini, '    ' + node_id
        print >> fp_ini

    fp_ini.close()
    return docs[1:]
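A hypothetical invocation, to make the docstring's return shape concrete; the input filename is a placeholder, and note the call also writes the split files and documents.ini next to it.

# Split at H1 and H2; each entry mirrors the dict described in the docstring.
for doc in split_html('build/aggregated.html', split_at_level=1):
    print 'level=%d title=%r tables=%d images=%d' % (
        doc['level'], doc['title'], doc['number_tables'], doc['number_images'])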
Example #15
def html2plaintext(html, body_id=None, encoding='ascii'):
    """Convert the HTML to plain text"""
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        if html.count('<body'):
            strainer = SoupStrainer('body')
        else:
            strainer = None

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = unicode(link.renderContents(), encoding)
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(
                url=url,
                tag=unicode(str(link), encoding),
                title=title)
            )

    try:
        html = soup.renderContents(encoding=encoding)
    except AttributeError:
        html = soup.__str__(encoding)

    if isinstance(html, str) and encoding != 'ascii':
        html = unicode(html, encoding)

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or u'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], u'%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')

    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace('&nbsp;', ' ')
    html = html.replace('</p>', '\n\n')
    html = html.replace('</tr>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    def desperate_fixer(g):
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)
    html = u'\n'.join([x.lstrip() for x in html.splitlines()])  # lstrip lines

    for i, url in enumerate(url_index):
        if i == 0:
            html += u'\n\n'
        html += u'[%s] %s\n' % (i + 1, url)

    html = unescape(html)

    return html
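A quick, self-contained check of the conversion above; the sample markup is made up for illustration.

sample = ('<html><body><h1>Release notes</h1>'
          '<p>Details at <a href="http://example.com/notes">http://example.com/notes</a>.</p>'
          '</body></html>')
print html2plaintext(sample)
# The heading comes back wrapped in **...**, and the link, whose text equals
# its href, collapses to the bare URL instead of getting a [1] index entry.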
Example #16
from twill import get_browser
from twill.commands import *
from BeautifulSoup import BeautifulSoup
import html2text

b = get_browser()
b.set_agent_string('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)')
go("http://ddahl.com")
html = b.get_html()
soup = BeautifulSoup(html)

h,txt = html2text.html2text(soup.__str__())
txt
Example #17
class HtmlProcessor:
  WHITESPACE_RE = re.compile(r'\s')
  # Look for </blockquote  <p>
  BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)

  def __init__(self, html, unfill=0):
    self.unfill = unfill
    html = self._ProcessRawHtml(html)
    self._soup = BeautifulSoup(html)
    if self._soup.title.contents:
      self.title = self._soup.title.contents[0]
    else:
      self.title = None

  def _ProcessRawHtml(self, html):
    new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
    if count:
      print >>sys.stderr, 'Replaced %d bad tags' % count
    return new_html

  def _StubInternalAnchors(self):
    '''Replace each internal anchor with a fixed-size filepos anchor.

    Looks for every anchor with <a href="#myanchor"> and replaces that
    with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
    self._anchor_references = []
    anchor_num = 0
    # anchor links
    anchorlist = self._soup.findAll('a', href=re.compile('^#'))
    # treat reference tags like a tags for TOCTOP.
    anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
    for anchor in anchorlist:
      self._anchor_references.append((anchor_num, anchor['href']))
      del anchor['href']
      anchor['filepos'] = '%.10d' % anchor_num
      anchor_num += 1
            
  def _ReplaceAnchorStubs(self):
    # TODO: Browsers allow extra whitespace in the href names.
    # use __str__ instead of prettify--it inserts extra spaces.
    assembled_text = self._soup.__str__('utf8')
    del self._soup # shouldn't touch this anymore
    for anchor_num, original_ref in self._anchor_references:
      ref = urllib.unquote(original_ref[1:]) # remove leading '#'
      # Find the position of ref in the utf-8 document.
      # TODO(chatham): Using regexes and looking for name= would be better.
      newpos = assembled_text.rfind(ref.encode('utf-8'))
      if newpos == -1:
        print >>sys.stderr, 'Could not find anchor "%s"' % original_ref
        continue
      newpos += len(ref) + 2  # don't point into the middle of the <a name> tag
      old_filepos = 'filepos="%.10d"' % anchor_num
      new_filepos = 'filepos="%.10d"' % newpos
      assert assembled_text.find(old_filepos) != -1
      assembled_text = assembled_text.replace(old_filepos, new_filepos, 1)
    return assembled_text

  def _FixPreTags(self):
    '''Replace <pre> tags with HTML-ified text.'''
    pres = self._soup.findAll('pre')
    for pre in pres:
      pre.replaceWith(self._FixPreContents(str(pre.contents[0])))

  def _FixPreContents(self, text):
    if self.unfill:
      line_splitter = '\n\n'
      line_joiner = '<p>'
    else:
      line_splitter = '\n'
      line_joiner = '<br>'
    lines = []
    for line in text.split(line_splitter):
      lines.append(self.WHITESPACE_RE.subn('&nbsp;', line)[0])
    return line_joiner.join(lines)

  def _RemoveUnsupported(self):
    '''Remove any tags which the kindle cannot handle.'''
    # TODO(chatham): <link> tags to script?
    unsupported_tags = ('script', 'style')
    for tag_type in unsupported_tags:
      for element in self._soup.findAll(tag_type):
        element.extract()

  def RenameAnchors(self, prefix):
    '''Rename every internal anchor to have the given prefix, then
    return the contents of the body tag.'''
    for anchor in self._soup.findAll('a', href=re.compile('^#')):
      anchor['href'] = '#' + prefix + anchor['href'][1:]
    for a in self._soup.findAll('a'):
      if a.get('name'):
        a['name'] = prefix + a['name']

    # TODO(chatham): figure out how to fix this. sometimes body comes out
    # as NoneType.
    content = []
    if self._soup.body is not None:
      content = [unicode(c) for c in self._soup.body.contents]
    return '\n'.join(content)

  def CleanHtml(self):
    # TODO(chatham): fix_html_br, fix_html
    self._RemoveUnsupported()
    self._StubInternalAnchors()
    self._FixPreTags()
    return self._ReplaceAnchorStubs()
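A hedged sketch of driving the class above end to end; the file names are placeholders.

# CleanHtml() strips unsupported tags, stubs internal anchors, and rewrites
# them as filepos references, as defined in the class above.
raw = open('chapter.html').read()
processor = HtmlProcessor(raw)
open('chapter-kindle.html', 'w').write(processor.CleanHtml())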
Example #18
def html2plaintext(html, body_id=None, encoding='ascii'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the 
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer('body')
    
    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0]=='href']:
            urls.append(dict(url=url, tag=str(link), title=title))

    html = soup.__str__(encoding)
            
    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://'+d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>','*').replace('</strong>','*')
    html = html.replace('<b>','*').replace('</b>','*')
    html = html.replace('<h3>','*').replace('</h3>','*')
    html = html.replace('<h2>','**').replace('</h2>','**')
    html = html.replace('<h1>','**').replace('</h1>','**')
    html = html.replace('<em>','/').replace('</em>','/')
    

    # the only line breaks we respect are those of closing tags and
    # explicit <br> breaks
    
    #html = html.replace('\n',' ')
    html = html.replace('<br>', '\n')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')


    # for all other tags we failed to clean up, just remove them and
    # complain about them on stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += '[%s] %s\n' % (i+1, url)

    html = unescape(html)
    
    return html
def html2plaintext(html, body_id=None, encoding='ascii'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the 
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer('body')
    
    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0]=='href']:
            urls.append(dict(url=url, tag=str(link), title=title))

    html = soup.__str__(encoding)
            
    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://'+d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>','*').replace('</strong>','*')
    html = html.replace('<b>','*').replace('</b>','*')
    html = html.replace('<h3>','*').replace('</h3>','*')
    html = html.replace('<h2>','**').replace('</h2>','**')
    html = html.replace('<h1>','**').replace('</h1>','**')
    html = html.replace('<em>','/').replace('</em>','/')
    

    # the only line breaks we respect are those of closing tags and
    # explicit <br> breaks
    
    html = html.replace('\n',' ')
    html = html.replace('<br>', '\n')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')


    # for all other tags we failed to clean up, just remove them and
    # complain about them on stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += '[%s] %s\n' % (i+1, url)

    html = unescape(html)
    
    return html
def split_html(html_filename, split_at_level=0):
    """ Split aggregated and rendered HTML document at
        some <hX> tag(s). split_at_level=0 -> split at
        H1 tags, split_at_level=1 -> split at H1 and H2
        tags.
        Returns a list of dicts with keys 'html' referring
        to the subdocument and 'level' indicating the split
        point.
    """

    destdir = os.path.dirname(html_filename)
    soup = BeautifulSoup(file(html_filename).read())
    fp = StringIO(soup.__str__(prettyPrint=True))
    docs = list()
    current_doc = list()
    for line in fp:
        line = line.rstrip()
        for level in range(split_at_level+1):
            if '<h%d' % (level+1) in line.lower():
                html = '\n'.join(current_doc)
                root = lxml.html.fromstring(unicode(html, 'utf-8'))
                title = u''
                h1_nodes = root.xpath('//h1')
                if h1_nodes:
                    title = h1_nodes[0].text_content().strip()

                # count tables and images
                number_tables = len(root.xpath('//table'))
                number_images = len(CSSSelector('div.image-caption')(root))

                # find all linkable nodes with an ID attribute
                node_ids = list()
                for node in root.xpath('.//*'):
                    node_id = node.get('id')
                    if node_id:
                        node_ids.append(node_id)

                html = lxml.html.tostring(root, encoding=unicode)
                docs.append(dict(html=html, 
                                 level=level, 
                                 title=title, 
                                 node_ids=node_ids,
                                 number_images=number_images,
                                 number_tables=number_tables))
                current_doc = []
                break
                
        current_doc.append(line)

    # now deal with the remaining part of the document
    html = '\n'.join(current_doc)
    root = lxml.html.fromstring(unicode(html, 'utf-8'))
    title = u''
    h1_nodes = root.xpath('//h1')
    if h1_nodes:
        title = h1_nodes[0].text_content().strip()

    # count tables and images
    number_tables = len(root.xpath('//table'))
    number_images = len(CSSSelector('div.image-caption')(root))

    # find all linkable nodes with an ID attribute
    node_ids = list()
    for node in root.xpath('.//*'):
        node_id = node.get('id')
        if node_id:
            node_ids.append(node_id)

    html = lxml.html.tostring(root, encoding=unicode)
    docs.append(dict(html=html, 
                     level=0, 
                     title=title, 
                     node_ids=node_ids,
                     number_images=number_images,
                     number_tables=number_tables))

    # now store files on the filesystem
    ini_filename = os.path.join(destdir, 'documents.ini')
    fp_ini = codecs.open(ini_filename, 'w', 'utf-8')

    for count, d in enumerate(docs[1:]):
        filename = os.path.join(destdir, 'split-0/%d-level-%d.html' % (count, d['level']))
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))                
        file(filename, 'w').write(d['html'].encode('utf-8'))

        print >>fp_ini, '[%d]' % count
        print >>fp_ini, 'filename = %s' % filename
        print >>fp_ini, 'title = %s' % d['title']
        print >>fp_ini, 'number_tables= %d' % d['number_tables']
        print >>fp_ini, 'number_images = %d' % d['number_images']
        print >>fp_ini, 'node_ids = '
        for node_id in d['node_ids']:
            print >>fp_ini, '    ' + node_id
        print >>fp_ini 

    fp_ini.close()
    return docs[1:]
Example #21
class HtmlProcessor:
    WHITESPACE_RE = re.compile(r'\s')
    # Look for </blockquote  <p>
    BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)

    def __init__(self, html, unfill=0):
        self.unfill = unfill
        html = self._ProcessRawHtml(html)
        self._soup = BeautifulSoup(html)
        if self._soup.title.contents:
            self.title = self._soup.title.contents[0]
        else:
            self.title = None

    def _ProcessRawHtml(self, html):
        new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
        if count:
            print >> sys.stderr, 'Replaced %d bad tags' % count
        return new_html

    def _StubInternalAnchors(self):
        '''Replace each internal anchor with a fixed-size filepos anchor.

    Looks for every anchor with <a href="#myanchor"> and replaces that
    with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
        self._anchor_references = []
        anchor_num = 0
        # anchor links
        anchorlist = self._soup.findAll('a', href=re.compile('^#'))
        # treat reference tags like a tags for TOCTOP.
        anchorlist.extend(
            self._soup.findAll('reference', href=re.compile('^#')))
        for anchor in anchorlist:
            self._anchor_references.append((anchor_num, anchor['href']))
            del anchor['href']
            anchor['filepos'] = '%.10d' % anchor_num
            anchor_num += 1

    def _ReplaceAnchorStubs(self):
        # TODO: Browsers allow extra whitespace in the href names.
        # use __str__ instead of prettify--it inserts extra spaces.
        assembled_text = self._soup.__str__('utf8')
        del self._soup  # shouldn't touch this anymore
        for anchor_num, original_ref in self._anchor_references:
            ref = urllib.unquote(original_ref[1:])  # remove leading '#'
            # Find the position of ref in the utf-8 document.
            # TODO(chatham): Using regexes and looking for name= would be better.
            newpos = assembled_text.rfind(ref.encode('utf-8'))
            if newpos == -1:
                print >> sys.stderr, 'Could not find anchor "%s"' % original_ref
                continue
            newpos += len(
                ref) + 2  # don't point into the middle of the <a name> tag
            old_filepos = 'filepos="%.10d"' % anchor_num
            new_filepos = 'filepos="%.10d"' % newpos
            assert assembled_text.find(old_filepos) != -1
            assembled_text = assembled_text.replace(old_filepos, new_filepos,
                                                    1)
        return assembled_text

    def _FixPreTags(self):
        '''Replace <pre> tags with HTML-ified text.'''
        pres = self._soup.findAll('pre')
        for pre in pres:
            pre.replaceWith(self._FixPreContents(unicode(pre.contents[0])))

    def _FixPreContents(self, text):
        if self.unfill:
            line_splitter = '\n\n'
            line_joiner = '<p>'
        else:
            line_splitter = '\n'
            line_joiner = '<br>'
        lines = []
        for line in text.split(line_splitter):
            lines.append(self.WHITESPACE_RE.subn('&nbsp;', line)[0])
        return line_joiner.join(lines)

    def _RemoveUnsupported(self):
        '''Remove any tags which the kindle cannot handle.'''
        # TODO(chatham): <link> tags to script?
        unsupported_tags = ('script', 'style')
        for tag_type in unsupported_tags:
            for element in self._soup.findAll(tag_type):
                element.extract()

    def RenameAnchors(self, prefix):
        '''Rename every internal anchor to have the given prefix, then
    return the contents of the body tag.'''
        for anchor in self._soup.findAll('a', href=re.compile('^#')):
            anchor['href'] = '#' + prefix + anchor['href'][1:]
        for a in self._soup.findAll('a'):
            if a.get('name'):
                a['name'] = prefix + a['name']

        # TODO(chatham): figure out how to fix this. sometimes body comes out
        # as NoneType.
        content = []
        if self._soup.body is not None:
            content = [unicode(c) for c in self._soup.body.contents]
        return '\n'.join(content)

    def CleanHtml(self):
        # TODO(chatham): fix_html_br, fix_html
        self._RemoveUnsupported()
        self._StubInternalAnchors()
        self._FixPreTags()
        return self._ReplaceAnchorStubs()
def html2plaintext(html, body_id=None, encoding='utf8', width=80):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the 
    body (not necessarily <body>) starts.
    """
    if encoding == 'utf8':
        from django.utils.safestring import SafeUnicode
        html = SafeUnicode(html)
        from django.utils.encoding import force_unicode
        html = force_unicode(html)
        html = html.encode('ascii', 'xmlcharrefreplace')
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer('body')
    
    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0]=='href']:
            urls.append(dict(url=str(url), tag=str(link), title=str(title)))

    html = soup.__str__(encoding)
            
    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://'+d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        elif d['url'].startswith('#'): # don't show anchor content
            html = html.replace(d['tag'], '')
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    #html = html.replace('<strong>','*').replace('</strong>','*')
    #html = html.replace('<b>','*').replace('</b>','*')
    #html = html.replace('<h3>','*').replace('</h3>','*')
    #html = html.replace('<h2>','**').replace('</h2>','**')
    #html = html.replace('<h1>','**').replace('</h1>','**')
    #html = html.replace('<em>','/').replace('</em>','/')
    

    # the only line breaks we respect are those of closing tags and
    # explicit <br> breaks
    
    html = html.replace('\n',' ')
    html = html.replace('<br>', '\n')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace('</tr>', '\n')
    #html = html.replace('</table>', '\n\n')
    html = html.replace(' ' * 2, ' ')


    # for all other tags we failed to clean up, just remove them and
    # complain about them on stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += '[%s] %s\n' % (i+1, url)

    html = unescape(html)
    
    # reduce consecutive empty lines to one
    pat = re.compile(r'(\n\s*\n)+', re.M)
    html = pat.sub('\n\n', html)

    # wrap long lines
    #html = word_wrap(html, width)
    # Use the python TextWrapper instead of the builtin function
    wrapper = TextWrapper(width=width)

    html = wrapper.fill(html)

    return html