Ejemplo n.º 1
1
def clean(soup, toc, ref):
    """Post-process a rendered documentation page in place.

    Rewrites local '#id' links through the table of contents, converts
    well-known ids to classes, injects a page badge, breadcrumb, ToC and
    license block, and returns the page body without the boilerplate.

    Args:
        soup: BeautifulSoup tree of the page; mutated in place.
        toc: ToC root exposing walk_id(id) -> entry or None.
        ref: ToC entry for this page, exposing link(), get_ancestry(),
            is_root() and write_html() (interfaces inferred from usage
            here -- confirm against the ToC implementation).

    Returns:
        The extracted <body> element, renamed to 'div'.
    """
    # Rewrite links
    for link in soup.findAll('a'):
        href = link.get('href')
        if href is None:
            print >> sys.stderr, "WARNING: Link with no href:", link
            continue
        if href.startswith('#') and href != '#':
            href = href[1:]
            # Leave the link alone when its anchor exists in this page.
            if soup.find(attrs={'id': href}) is not None or soup.find(
                    "a", attrs={'name': href}) is not None:
                # Link to an element in the page
                continue
            target = toc.walk_id(href)
            if target is None:
                print >> sys.stderr, "WARNING: Link to an unknown ToC entry \"%s\"" % href
                continue
            # Cross-page anchor: rewrite relative to the current page.
            link['href'] = target.link(ref)
    # Access elements by id to keep a reference before removing their id attribute
    headerElmt = soup.find("div", attrs={'id': 'header'})
    tocElmt = soup.find("div", attrs={'id': 'toc'})
    footerElmt = soup.find("div", attrs={'id': 'footer'})
    # Prefer class to id
    for id in [
            'header', 'toc', 'toctitle', 'preamble', 'content', 'footer',
            'footer-text'
    ]:
        elmt = soup.find(attrs={'id': id})
        if elmt is not None:
            # Append the old id to any existing class list, then drop the id.
            elmt['class'] = (elmt.get('class', '') + ' ' + elmt['id']).strip()
            del elmt['id']
    # Add icon in header
    if headerElmt is not None:
        iconElmt = BeautifulSoup.Tag(soup,
                                     'div',
                                     attrs={'class': 'page-badge'})
        headerElmt.insert(0, iconElmt)
    # Add breadcrumb in header
    if headerElmt is not None:
        breadcrumbElmt = BeautifulSoup.Tag(soup,
                                           'div',
                                           attrs={'class': 'breadcrumb'})
        # Ancestry minus the current page itself; entries separated by raquo.
        for i, entry in enumerate(ref.get_ancestry()[:-1]):
            if i > 0:
                breadcrumbElmt.append(BeautifulSoup.NavigableString(u' » '))
            linkElmt = BeautifulSoup.Tag(soup,
                                         'a',
                                         attrs={'href': entry.link(ref)})
            linkElmt.append(
                BeautifulSoup.NavigableString(
                    entry.title if not entry.is_root() else 'Docs'))
            breadcrumbElmt.append(linkElmt)
        # Index 1: right after the icon inserted above.
        headerElmt.insert(1, breadcrumbElmt)
    # Add ToC in header
    if tocElmt is not None:
        # Remove toc's noscript
        noscript = tocElmt.find("noscript")
        if noscript is not None:
            noscript.decompose()  # causes problems with subsequent soup.find()
        # Inject ToC
        tocHTMLBuffer = cStringIO.StringIO()
        ref.write_html(tocHTMLBuffer, open=ref)
        tocHTML = tocHTMLBuffer.getvalue().decode('utf-8')
        tocHTMLBuffer.close()
        if tocHTML == u'':
            # Remove ToC if empty
            tocElmt.decompose()
        else:
            tocTags = BeautifulSoup.BeautifulSoup(tocHTML)
            tocElmt.append(tocTags)
            # Use a wrapper div
            wrapper = BeautifulSoup.Tag(soup,
                                        'div',
                                        attrs={'class': 'tocwrapper'})
            tocElmt.replaceWith(wrapper)
            wrapper.append(tocElmt)
    # Add license in footer
    if footerElmt is not None:
        footerLicense = BeautifulSoup.BeautifulSoup("""
<div class="footer-license">
  Except as otherwise noted, <span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">WonderPush Documentation</span> by <a xmlns:cc="http://creativecommons.org/ns#" href="http://www.wonderpush.com/docs" property="cc:attributionName" rel="cc:attributionURL">WonderPush</a> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>,
  and code samples are licensed under the <a rel="license" href="http://www.apache.org/licenses/LICENSE-2.0">Apache 2.0 License</a>.
</div>""")
        footerElmt.insert(0, footerLicense)
    # Return just the interesting html, not the boilerplate
    rtn = soup.body.extract()
    rtn.name = 'div'
    return rtn
Ejemplo n.º 2
0
        def linkify_text_node(node):
            """Replace URLs found in a text node with <a> elements.

            Scans *node* for URL_RE matches; each match that is followed
            by end-of-string or a non-word character becomes an anchor
            whose href and text are the matched URL.  The original text
            node is then replaced in its parent by the interleaved text
            and link nodes.  No-op when nothing matched.
            """
            index = node.parent.contents.index(node)
            parent = node.parent
            string = unicode(node)

            matches = URL_RE.finditer(string)
            # Raw string: '\W' is an invalid escape sequence outside raw
            # literals (DeprecationWarning on modern Pythons).
            end_re = re.compile(r'\W')
            new_content = []
            o = 0  # offset of the first character not yet consumed
            for m in matches:
                s, e = m.span()

                # if there are no more characters after the link
                # or if the character after the link is not a 'word character'
                if e >= len(string) or end_re.match(string[e]):
                    link = BeautifulSoup.Tag(self._soup, 'a', attrs=[('href', m.group())])
                    link_text = BeautifulSoup.NavigableString(m.group())
                    link.insert(0, link_text)
                    if o < s:  # BeautifulSoup can't cope when we insert an empty text node
                        previous_text = BeautifulSoup.NavigableString(string[o:s])
                        new_content.append(previous_text)
                    new_content.append(link)
                    o = e

            # Only do actual replacement if necessary
            if o > 0:
                if o < len(string):
                    final_text = BeautifulSoup.NavigableString(string[o:])
                    new_content.append(final_text)

                # replace the text node with the new text
                node.extract()
                for x in new_content:
                    parent.insert(index, x)
                    index += 1
Ejemplo n.º 3
0
        def separate_strings(current, next):
            """Merge *current* into *next* when both are text nodes, or
            split leading whitespace off *current* otherwise.

            Returns the node the caller should use as the previous node
            for the following iteration: the merged string when a merge
            happened, else *next* unchanged.
            """
            if is_text(current):
                if is_text(next):
                    # Two strings are beside eachother, merge them!
                    next.extract()
                    s = unicode(current) + unicode(next)
                    s = BeautifulSoup.NavigableString(s)
                    current.replaceWith(s)
                    return s
                else:
                    # The current string is as big as its going to get.
                    # Check if you can split off some whitespace from
                    # the beginning.
                    p = unicode(current)
                    split = start_space.split(p)

                    if len(split) > 1 and split[1]:
                        # BeautifulSoup can't cope when we insert
                        # an empty text node.

                        par = current.parent
                        index = par.contents.index(current)
                        current.extract()

                        # Re-insert as a single space plus the remainder;
                        # inserting both at the same index puts the
                        # space node first.
                        w = BeautifulSoup.NavigableString(" ")
                        s = BeautifulSoup.NavigableString(split[1])

                        par.insert(index, s)
                        par.insert(index, w)
            return next
Ejemplo n.º 4
0
def add_desc(soup, incr_version=0.01):
    """Update the FictionBook <description> metadata in place.

    Ensures /description/document-info/{version,program-used} exist,
    bumps the version number by *incr_version*, and appends this
    program's name to <program-used>.

    Args:
        soup: BeautifulSoup tree of a FictionBook (FB2) document.
        incr_version: amount added to the current version number.
    """
    # /description/document-info/program-used
    # /description/document-info/version
    add_if_not_exists(soup, soup.FictionBook.description,
                      ['document-info', 'version'])
    add_if_not_exists(soup, soup.FictionBook.description,
                      ['document-info', 'program-used'])
    di = soup.FictionBook.description.find('document-info', recursive=False)
    # increase version
    version = di.version
    text = version.string
    if text:
        try:
            text = float(text)
        # float() raises ValueError on malformed text and TypeError on
        # non-string input; a bare "except:" here would also swallow
        # KeyboardInterrupt/SystemExit.
        except (TypeError, ValueError):
            traceback.print_exc()
        else:
            text += incr_version
            text = '%.2f' % text
            version.string.replaceWith(BeautifulSoup.NavigableString(text))
    else:
        # No version recorded yet: start at the conventional 0.01.
        version.insert(0, BeautifulSoup.NavigableString('0.01'))
    # add program-used
    program_used = di.find('program-used', recursive=False)
    text = program_used.string
    if text:
        text = '%s, %s' % (text, _program_name)
        program_used.string.replaceWith(BeautifulSoup.NavigableString(text))
    else:
        program_used.insert(0, BeautifulSoup.NavigableString(_program_name))
Ejemplo n.º 5
0
 def setter(self, text):
     """Set the tag's text content, replacing any existing first child."""
     if self.tag.string:
         # Replace the existing text node in place.
         self.tag.contents[0] = BeautifulSoup.NavigableString(text)
     else:
         # Empty tag: append the raw text as the first child.
         self.tag.append(text)

     # Keep .string pointing at the (possibly new) first child.
     self.tag.string = self.tag.contents[0]
Ejemplo n.º 6
0
    def setter(self, value):
        """Write *value* at the element path described by ``parts``,
        creating intermediate tags as needed; a trailing ``'text()'``
        component means "replace the text content".
        """
        tag = self.doc

        # NOTE(review): ``parts`` comes from the enclosing scope (not
        # visible here) -- presumably a pre-split XPath-like tag path.
        for part in parts:
            if part == '':
                continue
            elif part == 'text()':
                # Replace (or create) the text content of the current tag.
                if tag.string:
                    tag.contents[0] = BeautifulSoup.NavigableString(value)
                else:
                    tag.append(value)

                # Keep .string in sync with the first child.
                tag.string = tag.contents[0]

                return
            else:
                # Descend into the named child, creating it if missing.
                child = tag.find(part)

                if not child:
                    child = BeautifulSoup.Tag(self.doc, part)

                    tag.append(child)

                tag = child

        # Path did not end in text(): append the value as a child node.
        tag.append(value)
Ejemplo n.º 7
0
 def condense_whitespace():
     """Collapse every whitespace run in the tree's text nodes to one space."""
     for text_node in self.root.findAll(text=True):
         condensed = any_space.sub(" ", unicode(text_node))
         text_node.replaceWith(BeautifulSoup.NavigableString(condensed))
Ejemplo n.º 8
0
 def makeTag(name, string=None, attrs=None):
     """Build a ``bs.Tag`` called *name*, optionally with attributes
     (given as a dict) and a single text child *string*."""
     attr_items = None if attrs is None else attrs.items()
     tag = bs.Tag(writer, name, attr_items)
     if string is None:
         return tag
     tag.append(bs.NavigableString(string))
     return tag
Ejemplo n.º 9
0
  def GenerateHTML(self, controller, minify=False):
    """Flatten this document into standalone HTML.

    Strips doctype declarations, HTML imports and all scripts, inlines
    or rewrites stylesheets through *controller*, and optionally drops
    comments.

    Args:
      controller: object providing GetHTMLForInlineStylesheet(text) and
        GetHTMLForStylesheetHRef(href); each returns replacement HTML,
        or a falsy value meaning "remove this element".
      minify: when True, strip HTML comments from the output.

    Returns:
      The processed document serialized with str().
    """
    soup = polymer_soup.PolymerSoup(str(self._soup))

    # Remove doctype declarations.  Iterate over a copy: extract()
    # mutates soup.contents, and removing items from the list being
    # iterated would skip the element following each removal.
    for x in list(soup.contents):
      if isinstance(x, BeautifulSoup.Declaration):
        if _IsDoctype(x):
          x.extract()

    # Remove all imports
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all inline script.  (Previously this list reused the
    # scripts_external name, shadowing the list above.)
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all inline styles
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(str(style.string))
      if html:
        ns = BeautifulSoup.Tag(soup, 'style')
        ns.append(BeautifulSoup.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        tmp = polymer_soup.PolymerSoup(html).findChildren()
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, BeautifulSoup.Comment))
      for comment in comments:
        comment.extract()

    return str(soup)
Ejemplo n.º 10
0
 def encode_xml_specials(self):
     """
     BeautifulSoup will let some dangerous xml entities hang around
     in the navigable strings. destroy all monsters.
     >>> c = Cleaner(auto_clean=True, encode_xml_specials=True)
     >>> c('<<<<<')
     u'&lt;&lt;&lt;&lt;'
     """
     for text_node in self.root.findAll(text=True):
         encoded = encode_xhtml_entities(unicode(text_node))
         text_node.replaceWith(BeautifulSoup.NavigableString(encoded))
Ejemplo n.º 11
0
 def createTextNode(self, data):
     """DOM createTextNode analogue: wrap *data* in a Text node."""
     return Text(self, BeautifulSoup.NavigableString(data))
Ejemplo n.º 12
0
def _inject_na_span(page, style):
    """Insert an absolutely-positioned <span>N/A</span> div into *page*.

    Used to patch source pages that left a field blank instead of
    printing "N/A", so the scraper finds a value at the expected
    coordinates.  *style* is the CSS positioning for the wrapper div.
    """
    new_div = BeautifulSoup.Tag(page, "div")
    new_div["style"] = style
    new_span = BeautifulSoup.Tag(page, "span")
    new_span.insert(0, BeautifulSoup.NavigableString("N/A"))
    new_div.insert(0, new_span)
    page.html.body.insert(0, new_div)


def main():
    """Scrape pages 017..599 (two pages per home) into homes.json.

    Reads pageNNN.html files from the working directory, applies a few
    per-page typo/blank-field fixes, and writes the accumulated `homes`
    list as pretty-printed JSON.  Uses the module-level `homes` and
    `page_num` globals.
    """
    global homes
    global page_num

    print("reading pages...")
    page_num = 17
    while page_num < 600:

        curr_home = {}

        print("page  " + str(page_num))
        page_one = BeautifulSoup.BeautifulSoup(open("page%03i.html" %
                                                    page_num))
        #  ** special case fix for page 329, a typo for the 'Beds and Rooms' **
        if page_num == 329:
            bad_span = page_one.find(text=re.compile("951"))
            bad_span.string.replaceWith("Beds and Rooms")

        process_page_one(page_one, curr_home)

        print("page  " + str(page_num + 1))
        page_two = BeautifulSoup.BeautifulSoup(
            open("page%03i.html" % (page_num + 1)))
        # ** special case fixes for pages 98, 190 and 350: these pages have a
        # blank space rather than a value of 'N/A' in the field. We "fix" this
        # by injecting a value into the parsed page because it's cleaner than
        # putting a special case check in the processing code.
        if page_num + 1 == 98:
            _inject_na_span(page_two,
                            "top: 228px; left:481px; height:10px; width:10px;")
        elif page_num + 1 == 190:
            _inject_na_span(page_two,
                            "top: 232px; left:481px; height:10px; width:10px;")
        elif page_num + 1 == 350:
            _inject_na_span(page_two,
                            "top: 111px; left:249px; height:10px; width:10px;")
            _inject_na_span(page_two,
                            "top: 111px; left:518px; height:10px; width:10px;")

        if page_num + 1 == 292:  #handle a typo for Little Mountain Place where the Care Services has BC instead of BC Avg in a subheader
            bad_span = page_two.find(text=re.compile("BC$"))
            bad_span.string.replaceWith("BC Avg")

        process_page_two(page_two, curr_home)

        homes.append(curr_home)
        page_num += 2

    # save as a json file; "with" guarantees the file is closed even on error
    print("writing...")
    with open("homes.json", "w") as outfile:
        outfile.write(json.dumps(
            homes,
            indent=4,
            sort_keys=True,
        ))
Ejemplo n.º 13
0
        "xcodebuild -exportArchive -exportFormat APP -archivePath ~/Desktop/Flashlight.xcarchive -exportPath ~/Desktop/Flashlight.app"
    )

# Zip the freshly exported .app for distribution.
os.system("""pushd ~/Desktop
	zip -r Flashlight.zip Flashlight.app
	popd""")

# DSA-sign the zip so Sparkle can verify the update.
signature = subprocess.check_output([
    "sh", "../Flashlight signing/sign_update.sh",
    os.path.expanduser("~/Desktop/Flashlight.zip"),
    "../Flashlight signing/dsa_priv.pem"
]).strip()

import BeautifulSoup as bs
# Clone the newest <item> of the Sparkle appcast and rewrite it for the
# new release (presumably v = version string, vn = build number --
# defined earlier in this script).
soup = bs.BeautifulSoup(open("Appcast.xml").read())
c = soup.find("channel")
item = c.find("item")
new_item = bs.BeautifulSoup(str(item))
new_item.find("title").contents = [bs.NavigableString("Version " + v)]
new_item.find("sparkle:releasenoteslink").contents = [
    bs.NavigableString("http://flashlightupdates.42pag.es/" + v)
]
enc = new_item.find("enclosure")
enc['sparkle:version'] = vn
enc['sparkle:dsasignature'] = signature
enc['url'] = "https://github.com/nate-parrott/Flashlight/releases/download/v{0}/Flashlight.zip".format(
    v)
enc['sparkle:shortversionstring'] = v
# Insert the new item before the old one so the newest release comes first.
c.insert(c.contents.index(item), new_item)
open("Appcast.xml", "w").write(str(soup))
Ejemplo n.º 14
0
 documentContents = weoSoup.kml.document.contents
 for tag in documentContents:
     try:
         if tag.name == 'name':
             nameString = tag.string
             break
     except:
         pass
 overlayString = str(weoSoup.find('groundoverlay'))
 overlaySoup = BeautifulSoup.BeautifulSoup(overlayString)
 overlayContents = overlaySoup.contents
 for tag in overlayContents:
     if tag.name == 'name':
         tag.replaceWith('<name>' + nameString + '</name>')
 overlaySoup.groundoverlay['id'] = fileName
 visibilityTag = BeautifulSoup.NavigableString(
     '<visibility>0</visibility>')
 overlaySoup.groundoverlay.insert(0, visibilityTag)
 kmlOverlayString += str(overlaySoup).replace(
     'groundoverlay',
     'GroundOverlay').replace('latlonbox',
                              'LatLonBox').replace('icon', 'Icon')
 Folders = weoSoup.kml.document.findAll('folder')
 for folder in Folders:
     try:
         Coords = folder.placemark.point.coordinates.contents[0].split(
             ',')
     except:
         pass
     try:
         linRingCoords = folder.placemark.polygon.outerboundaryis.linearring.coordinates.contents[
             0].split(' ')[0]
Ejemplo n.º 15
0
def render_hashed(request, key, user, extracontext={}):
    ###Need to get all of the rendered html
    ###and integrate via Beautiful
    if 'TYPE' in extracontext:
        htmlrender = extracontext['TYPE'] == 'HTML'
    else:
        htmlrender = 'JS'
    if key is None:
        key = request.META['PATH_INFO']
    empty = True
    if user is None:
        user = request.user
    retdict = get_cache_or_render(user,
                                  key,
                                  empty,
                                  forcerender=True,
                                  request=request,
                                  extracontext=extracontext)
    rendered_list = retdict['rendered_list']
    ret = defaultdict(list)
    for i in rendered_list:
        for k, v in i.items():
            if k != 'html': print k + ' ' + str(v)
        if type(i['html']) == ListType:
            for v, k in i['html']:
                soup = BeautifulSoup.BeautifulSoup(v)
                if i['type'] == 'html':
                    ret[i['div']] = [soup]
                elif i['type'] == 'append':
                    ret[i['div']].append(soup)
        else:
            #print i['div'] + ' : ' + i['type']
            soup = BeautifulSoup.BeautifulSoup(i['html'])
            if '#pages' in ret and i['div'] != '#tab_ruler' and i[
                    'type'] != 'html':
                text = ret['#pages'][0].find('div', {'id': i['div'][1:]})
                #print text
                if text is not None:
                    text.insert(0, BeautifulSoup.NavigableString(i['html']))
            elif i['type'] == 'html':
                ret[i['div']] = [soup]
            elif i['type'] == 'append':
                ret[i['div']].append(soup)
            elif i['type'] == 'prepend':
                ret[i['div']].insert(0, soup)
            else:
                print i['type']
                text = ret[i['type']][len(ret[i['type']]) - 1].find(
                    'div', {'id': i['div'][1:]})
                #print text
                if text is not None:
                    text.insert(0, BeautifulSoup.NavigableString(i['html']))
    rendertype = retdict['rendertype']
    final = {}
    for k, v in ret.items():
        r = ''
        for val in v:
            r += val.prettify()
        if htmlrender:
            final[k[1:]] = r
        else:
            final[k] = r
    return {
        'renders': final,
        'object': retdict['object'],
        'rendertype': rendertype,
        'counts': retdict['counts']
    }
Ejemplo n.º 16
-1
        def reassign_whitespace():
            """Walk the tree's text nodes right-to-left, folding each
            whitespace-only node into the text node that precedes it."""
            strings = self.root.findAll(text=True)
            i = len(strings) - 1

            after = None
            while i >= 0:
                current = strings[i]
                if is_text(after) and not after.strip():
                    # if 'after' holds only whitespace,
                    # remove it, and append it to 'current'
                    s = unicode(current) + unicode(after)
                    s = BeautifulSoup.NavigableString(s)
                    current.replaceWith(s)
                    after.extract()

                    # The merged node replaces 'current' for the next step.
                    current = s

                after = current
                i -= 1