Example #1
def sanitize_html(value):
    from BeautifulSoup import BeautifulSoup, Comment, Tag
    from tidylib import tidy_fragment  # assumed source of tidy_fragment used below (pytidylib package)

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor emits newer
    # HTML than Vodafone Live will accept, so we translate 'em' back
    # to 'i' and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),  # when loading them as I's the editor leaves them
        ("b", "b"),  # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        if element != replacement_element:  # 'is not' compared identity, which is unreliable for strings
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    return (
        fragment.replace("&nbsp;", " ")
        .replace("&rsquo;", "'")
        .replace("&lsquo;", "'")
        .replace("&quot;", '"')
        .replace("&ldquo;", '"')
        .replace("&rdquo;", '"')
        .replace("&bull;", "- ")
        .replace("&eacute;", "e")
        .replace("&Eacute;", "E")
        .replace("&ndash;", "-")
    )
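A minimal usage sketch for the function above (assuming BeautifulSoup 3 and pytidylib are installed; exact tidy whitespace may vary):

print sanitize_html('<p class="x"><strong>Bold</strong> &ndash; <em>italic</em></p>')
# roughly: <p><b>Bold</b> - <i>italic</i></p>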
Example #2
def do_iperimage(value):

    """detects iPernity static urls and creates clickable thumbnail for it"""

    soup = BeautifulSoup(value)
    iprl = re.compile("^(http://\w+\.ipernity\.com/\d+/\d+/\d+/\d+\.\w+\.)(75|100|240|500|560)(\.jpg)$")
    iprl_thumb = "500"
    iprl_zoom = "560"

    for img in soup.findAll("img", src=iprl):

        match = iprl.match(img["src"])
        try:
            thumb = Tag(soup, "img")
            thumb["alt"] = img["title"]
            thumb["src"] = match.group(1) + iprl_thumb + match.group(3)

            link = Tag(soup, "a")
            link["href"] = match.group(1) + iprl_zoom + match.group(3)
            link["rel"] = "lightbox"
            link["title"] = img["title"]
            link.insert(0, thumb)

            img.replaceWith(link)
        except KeyError:
            # the img has no 'title' attribute; leave it untouched
            pass

    return unicode(soup)
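A hypothetical invocation of do_iperimage (the URL is made up to match the regex; BeautifulSoup 3 assumed):

html = '<img src="http://u1.ipernity.com/10/20/30/456.abc123.240.jpg" title="demo" />'
print do_iperimage(html)
# roughly: <a href="http://u1.ipernity.com/10/20/30/456.abc123.560.jpg" rel="lightbox"
#          title="demo"><img alt="demo" src="http://u1.ipernity.com/10/20/30/456.abc123.500.jpg" /></a>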
Example #3
def makeHTMLQuestion(fn, htmldata):
  soup = BeautifulSoup(htmldata)
  #add JS
  soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
  scripttag = Tag(soup, "script")
  scripttag['type'] = "text/javascript"
  scripttag.string = "__SUBMIT_JS__"
  soup.find('head').insert(0, scripttag)
  #replace forms
  forms = soup.findAll('form')
  if forms:
    for form in forms:
      if not form.has_key('method'):
        form['method'] = 'POST'
      if not form.has_key('action'):
        if testmode:
          form['action'] = 'https://workersandbox.mturk.com/mturk/externalSubmit'
        else:
          form['action'] = 'https://www.mturk.com/mturk/externalSubmit'
      if not form.has_key('onSubmit'):
        form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
      inputtag = Tag(soup,'input')
      inputtag['type'] = 'hidden'
      inputtag['name'] = 'assignmentId'
      inputtag['id'] = 'myAssignmentId'
      inputtag['value'] = ''
      form.insert(0, inputtag)
  html = str(soup).replace("__SUBMIT_JS__", SUBMIT_JS)
  mainurl = uploadfile(fn, html)
  for sub in soup.findAll('img'):
    # TODO
    fn = dirname(fn) + '/' + sub['src']
    uploadfile(fn)
  return ExternalQuestion(escape(mainurl), frame_height)
Example #4
	def convertStoreFormat(self):
		"""
		convert legacy to canonical store format

		N.B.:
		While the new canonical store format was introduced in
		TiddlyWiki v2.2 final, various v2.2 beta releases are still
		using the legacy store format.

		@return: None
		"""
		try:
			version = self.getVersion()
		except (ValueError, AttributeError):
			version = (0, 0, 0) # assume pre-v2.2 format
		if version and (version[0] + (version[1] / 10.0) < 2.3): # N.B.: addition works because all pre-v2.3 releases are known -- XXX: actual threshold is v2.2 final
			for tiddler in self.store.findChildren("div", tiddler = True):
				# convert tiddler attribute to title attribute
				tiddler["title"] = tiddler["tiddler"]
				del(tiddler["tiddler"])
				# decode tiddler contents
				tiddler.contents[0].replaceWith(decodeTiddlerText(tiddler.contents[0])) # XXX: use of contents[0] hacky?
				# add PRE wrapper
				pre = Tag(self.dom, "pre")
				pre.contents = tiddler.contents
				tiddler.contents = [pre]
Example #5
    def __init__(self):
        self.tagStack = []
        self.currentData = ''
        self.currentTag = None
        self.pushTag(self)
        ContentHandler.__init__(self)
        Tag.__init__(self, '[document]')

    def savePDF(self, pdf_filename, parent_soup, target_node, yes_phrase, url, key, school_name):
        if target_node:
            grandparent_node = target_node.parent.parent
            tag = self.highlightedNode(target_node, yes_phrase, parent_soup)
            self.replaceNode(target_node, tag)
            body = Tag(parent_soup, "body")
            body.append(grandparent_node)
        else:
            body = parent_soup
        try:
            weasyprint = HTML(string=body.prettify())
            tmp_filename = 'pdfs/tmp.pdf'
            weasyprint.write_pdf(tmp_filename, stylesheets=[CSS(string='body { font-size: 10px; font-family: serif !important }')])
        except Exception:
            print "weasyprint failed on url: " + url
            if target_node:
                self.replaceNode(tag, target_node)  # return to old state
            return

        if target_node:
            self.replaceNode(tag, target_node)  # return to old state

        sep_filename = "pdfs/sep.pdf"
        self.makeSepPage(sep_filename, url, key, school_name)

        merger = PdfFileMerger()
        if os.path.exists(pdf_filename):
            merger.append(PdfFileReader(open(pdf_filename, 'rb')))
        merger.append(PdfFileReader(open(sep_filename, 'rb')))
        merger.append(PdfFileReader(open(tmp_filename, 'rb')))
        merger.write(pdf_filename)
Example #7
def clean_html_style(data, element, remove_comments=True, remove_empty=True):
    """removes the style information associated with html element

    >>> t = '<!--  /* Style Definitions */ table.MsoNormalTable	{mso-style-name:"Table Normal";	mso-tstyle-rowband-size:0;	mso-tstyle-colband-size:0;	mso-style-noshow:yes;	mso-style-priority:99;	mso-style-qformat:yes;	mso-style-parent:"";	mso-padding-alt:0in 5.4pt 0in 5.4pt;	mso-para-margin-top:0in;	mso-para-margin-right:0in;	mso-para-margin-bottom:10.0pt;	mso-para-margin-left:0in;	line-height:115%;	mso-pagination:widow-orphan;	font-size:11.0pt;	font-family:"Calibri","sans-serif";	mso-ascii-font-family:Calibri;	mso-ascii-theme-font:minor-latin;	mso-hansi-font-family:Calibri;	mso-hansi-theme-font:minor-latin;} --><p>  </p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">?</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p')
    '<p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_empty=False)
    '<p> </p><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_comments=False)
    '<!--  /* Style Definitions */ table.MsoNormalTable\t{mso-style-name:"Table Normal";\tmso-tstyle-rowband-size:0;\tmso-tstyle-colband-size:0;\tmso-style-noshow:yes;\tmso-style-priority:99;\tmso-style-qformat:yes;\tmso-style-parent:"";\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\tmso-para-margin-top:0in;\tmso-para-margin-right:0in;\tmso-para-margin-bottom:10.0pt;\tmso-para-margin-left:0in;\tline-height:115%;\tmso-pagination:widow-orphan;\tfont-size:11.0pt;\tfont-family:"Calibri","sans-serif";\tmso-ascii-font-family:Calibri;\tmso-ascii-theme-font:minor-latin;\tmso-hansi-font-family:Calibri;\tmso-hansi-theme-font:minor-latin;} --><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    """
    soup = BeautifulSoup(data)
    # remove all comments in this html block
    if remove_comments:
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

    # remove all occurrences of tags like sup, script
    [i.extract() for i in soup.findAll(re.compile('sup|script'))]

    # find all occurrences of the "element" tag
    for i in soup.findAll(element):
        text = i.renderContents().strip()
        if text:
            new_tag = Tag(soup, element)
            new_tag.insert(0, text)
            i.replaceWith(new_tag)
        elif remove_empty:
            i.extract()
    return smart_unicode(soup.renderContents())
Example #8
    def _set_element(self, root, tagname, text=None, attr=None):
        """Creates if not available an element at the soup root element
        
        :return: tag object or None
        :rtype: Tag
        """

        # Add Topic if not available
        if attr is None:
            if root.find(re.compile(tagname + "$", re.I)) is None:
                new_tag = Tag(self._soup, tagname)
                root.insert(0, new_tag)
        else:
            if root.find(re.compile(tagname + "$", re.I), attr) is None:
                new_tag = Tag(self._soup, tagname, attr.items())
                root.insert(0, new_tag)

        settings = self._soup.find(self.root)
        tag = settings.find(re.compile(tagname + "$", re.I))

        # Something to insert
        if tag is not None and text is not None:
            if tag.text.strip() == "":
                tag.insert(0, NavigableString(text))
            else:
                tag.contents[0].replaceWith(text)

        return tag
    def unTag(self, tag):
        """
            recursively removes unwanted tags according to defined lists
            @param tag: tag hierarchy to work on
        """
        for child in tag.findChildren(True, recursive=False):
            self.unTag(child)
        if self.remove_classes_regexp != "" and tag.has_key("class") and re.match(self.remove_classes_regexp, tag["class"]) is not None:
            tag.extract()
        elif tag.name in self.keep_tags:
            new_tag = Tag(self.input, tag.name)
            new_tag.contents = tag.contents
            tag.replaceWith(new_tag)

        elif tag.name in self.remove_tags_keep_content:            
            children = tag.findChildren(True, recursive=False)
            if len(children)==1:
                tag.replaceWith(children[0])
            elif len(children) > 1:
                new_tag = Tag(self.input, "p")
                for child in tag.findChildren(True, recursive=False):
                    new_tag.append(child)
                tag.replaceWith(new_tag)
            else:
                tag.replaceWith(tag.renderContents())
        else:
            tag.extract()        
Example #10
def rewriteLinksSection(dom, soup, links_table):
    links = []
    for fnt in links_table.findAll('font', {'size': '2', 'face':'verdana'}):
        if str(fnt).startswith('<font size="2" face="verdana"><a href="'):
            link = fnt.find('a')
            
            caption = link.getText('').strip()
            if caption.endswith(' Translation') and OMIT_TRANSLATIONS:
                continue
            
            links.append((link['href'], caption))
    
    links_table.decompose()
    
    if not INCLUDE_LINKS or len(links) == 0:
        return
    
    b = Tag(soup, 'b')
    b.string = 'Links'
    dom.append(b)
    
    ul = Tag(soup, 'ul')
    for url, caption in links:
        li = Tag(soup, 'li')
        a = Tag(soup, 'a', {'href': url})
        a.string = caption
        li.append(a)
        ul.append(li)
    
    dom.append(ul)
Example #11
 def _linkify_headings(self, soup):
     md_el = soup.find("div", "md")
     for heading in md_el.findAll(["h1", "h2", "h3"], recursive=False):
         heading_a = Tag(soup, "a", [("href", "#%s" % heading["id"])])
         heading_a.contents = heading.contents
         heading.contents = []
         heading.append(heading_a)
def get_last_3(soup, table):
    loop = 0
    first = 1
    enclose = Tag(soup, "div")
    ul = Tag(soup, "ul")
    for tr in table.findAll("tr"):
        td = tr.findAll("td")
        li = Tag(soup, "li")
        for el in td[3:]:
            if loop != 3:
                try:
                    text = ''.join(el.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != '&nbsp;':
                        el.name = "span"
                        if loop != 2: el.append(' - ')
                        li.append(el)
                except:
                    pass
            else:
                break
            loop += 1
        loop = 0
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
Example #15
   def outputData(self, outfile):

      outSoup = BeautifulStoneSoup("", selfClosingTags=["path"])
      outRoot = Tag(outSoup, "svg")
      outRoot["xmlns"] = "http://www.w3.org/2000/svg"
      outRoot["width"] = self.width
      outRoot["height"] = self.height
      outRoot["version"] = 1.1

      outSoup.insert(0, outRoot)


      for char in reversed(self._soup.findAll("char")):
         path = Tag(outSoup, "path")
         path["d"] = char["d"]
         path["style"] = self.style
         outRoot.insert(0, path)


      svg_header = "<?xml version=\"1.0\" standalone=\"no\"?>\n"
      svg_header += "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\""
      svg_header += " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n"

      self.scaleAndGridAlphabet(outSoup)

      out = open(outfile, "w")  # avoid shadowing the 'outfile' path argument
      out.write(svg_header + outSoup.prettify())
      out.close()
Example #16
def make_links_readable(html):
    """
    Goes through links making them readable
    If they are too long, they are turned into goo.gl links
    timing stats:
    before multiprocess = 0m18.063s
    """
    soup = BeautifulSoup(html)
    for link in soup.findAll('a'):
        oldlink = link
        if link and len(link.get('href', '')) > 90 and options.use_short_links:
            #make into goo.gl link
            short_link = shorten_link(soup, link)
            if short_link is not None:
                link = short_link

        if validate_link(link) and link.get('href', None):
            if not link.text:
                oldlink.replaceWith(link.get('href', "No href link to replace with"))
            else:
                div = Tag(soup, 'div')
                div.setString(link.text)
                br = Tag(soup, 'br')
                new_link = Tag(soup, 'a')
                new_link.setString("(%s)" % (link.get('href')) )
                div.append(br)
                div.append(new_link)
                oldlink.replaceWith(div)

    return soup
Example #18
def format_title_link(title, title_link):
    """Format the title header"""
    soup = BeautifulSoup('')
    tag = Tag(soup, 'a')
    tag['href'] = 'http://en.wikipedia.org/wiki/%s' % title_link
    tag.string = title
    return str(tag)
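For example, assuming BeautifulSoup 3:

print format_title_link('Monty Python', 'Monty_Python')
# -> <a href="http://en.wikipedia.org/wiki/Monty_Python">Monty Python</a>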
Example #19
    def soup_filter_zz_fold_etymology( self, content ):
        heads = content.findAll( 'h2', {'class':'head'} ) + content.findAll( 'h3', {'class':'head'} ) + content.findAll( 'h4', {'class':'head'} )
        etymologys = []
        for h in heads:
#            print "Head, ", h
            if h.next and h.next.lower().startswith('etymology'):
#                print "found", h.content[0]
                etymologys.append( h )
#                print 'Etymology found: ', h

        etymology_index = 1
        for e in etymologys:
            div = Tag( content, 'div' )
            div['id'] = u'etymology_'+str(etymology_index)
            div['style'] = u'display:none'
            linkSoup = BeautifulSoup( u''' <a href="javascript:f('%s',this)">[show]</a>''' % (div['id']) )
            e.append( linkSoup )

            paragraphs = []
            
            n = e.nextSibling
            first = 1
            while n and (n.__class__.__name__ == 'NavigableString' or  (n.__dict__.has_key('name') and n.name == 'p') ):
                paragraphs.append( n )
                n = n.nextSibling
                
            [div.append(p) for p in paragraphs]
            
            eIndex = e.parent.contents.index( e )
            e.parent.insert( eIndex + 1, div )
 
            etymology_index = etymology_index + 1
Example #20
def save():
	json_data = request.json
	status = False
	data={}
	with open(ret_url(json_data["doc"],"/papers"), "r+") as inf:
		txt = inf.read()
		soup = BeautifulSoup(txt)
		#Only if it is a review do we make these changes; a decision is inserted directly into the head
		if json_data["type"] == "review": 
			#Check whether the script already exists; if it does, delete it
			for script in soup.findAll("script",{"type":"application/ld+json"}):
				data = json.loads(script.text.strip())
				if data[0]["@type"] == "review":
					if data[0]["article"]["eval"]["author"] == "mailto:"+json_data["author"]:
						script.extract()
						break
			#Remove the body content and rewrite it
			for section in soup.findAll("section"):
				section.extract()
			for section in json_data["sections"]:
				beauty = BeautifulSoup(section)
				soup.body.insert(len(soup.body.contents), beauty)
		#Create the script and insert it
		new = Tag(soup, "script")
		new.attrs.append(("type", "application/ld+json"))
		new.string = json.dumps(json_data["script"])
		soup.head.insert(len(soup.head.contents), new)
		#Save the file
		html = soup.prettify("utf_8")
		inf.seek(0)
		inf.write(html)
		inf.truncate()
	status=True 
	return jsonify({"result": status})
Example #21
 def fix_heading(heading, tags):
     '''
     Remove paragraphs with no strings.
     Remove non-special headings that don't start with a paragraph.
     Remove lists from non-special headings.
     '''
     SPECIAL = ['Books', 'Works', 'Bibliography', 'External links',
                'Further reading']
     tags = [tag for tag in tags if tag is not None and
             (tag.name != 'p' or tag.renderContents(None).strip())]
     special = False
     heading_text = tagtext(heading)
     for word in SPECIAL:
         if word.lower() in heading_text.lower():
             special = True
     if heading_text == 'External links and references':
         set_heading_text(heading, 'External links')
     # Shorten lists (even special ones).
     # The motivation is that some pages like to list reams of crap,
     # usually in bibliographies, but in other things too.
     found_lis = 0
     MAX_ITEMS = 10  # per headed section
     for tag in list(tags):
         if tag.name in ('ul', 'ol'):
             for li in tag.findAll('li', recursive=False):
                 found_lis += 1
                 if found_lis > MAX_ITEMS:
                     li.extract()
     # Remove any now-empty uls and ols.
     # Harder than it sounds, due to nested lists.
     temp = Tag(soup, 'p')
     for tag in tags:
         temp.append(tag)
     for tag in temp.findAll(('ul', 'ol')):
         if not tag.findAll(('ul', 'ol', 'li')):
             tag.extract()
     tags = temp.contents
     if found_lis > MAX_ITEMS:
         # Add " (some omitted)" to heading
         if heading_text:
             heading_text = heading_text.replace(' (incomplete)', '')
             if context['srcurl'].startswith('http:'):
                 heading_text += ' (some <a href="%s">omitted</a>)' % context['srcurl']
             else:
                 heading_text += ' (some omitted)'  # no "relative" links
             set_heading_text(heading, heading_text)
     if not special:
         if heading is not None:
             # Remove non-special headings which don't start with a paragraph.
             if not tags or tags[0].name != 'p':
                 return drop_heading(heading)
             # Remove non-special headings containing lists.
             for tag in tags:
                 if tag.name in ('ul', 'ol'):
                     return drop_heading(heading)
         else:
             # Remove lists from None (before first heading, if any).
             tags = [tag for tag in tags if tag.name not in ('ul', 'ol')]
     return (heading, tags)
Example #22
 def _number_sections(self, soup):
     count = 1
     for para in soup.find("div", "md").findAll(["p"], recursive=False):
         a = Tag(soup, "a", [("class", "p-anchor"), ("id", "p_%d" % count), ("href", "#p_%d" % count)])
         a.append(str(count))
         para.insert(0, a)
         para.insert(1, " ")
         count += 1
def _tag_generator(soup, name, attrs=None, contents=None):
    # build a new Tag, optionally with attributes and initial contents
    if attrs:
        new_tag = Tag(soup, name, attrs)
    else:
        new_tag = Tag(soup, name)
    if contents is not None:
        new_tag.insert(0, contents)
    return new_tag
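_tag_generator can be exercised on its own; a small sketch (BeautifulSoup 3 assumed):

soup = BeautifulSoup('<div></div>')
a = _tag_generator(soup, 'a', attrs=[('href', '#top')], contents='back to top')
soup.div.append(a)
print soup
# -> <div><a href="#top">back to top</a></div>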
Example #24
def code_colorizer(entry):
    """
    Uses BeautifulSoup to find and parse the code in the entry
    that will be colorized and changes it according to the syntax
    specs using pygments.

    The HTML code should include the colorized code wrapped into a
    div which has language (e.g. python) as id and "code" as class
    attributes.

    The best part of using a filter is that we don't have to change the
    real post containing the code. The worst part is that we have to
    search for the code layer in each post.

    """

    if settings.COLORIZE_CODE:
        try:
            from BeautifulSoup import BeautifulSoup, Tag
            from pygments import highlight
            from pygments.lexers import get_lexer_by_name
            from pygments.formatters import HtmlFormatter
        except ImportError:
            return entry

        try:
            parser = BeautifulSoup(entry, convertEntities=BeautifulSoup.ALL_ENTITIES)
        except HTMLParser.HTMLParseError:
            return entry

        # searching for code blocks in the blog entry
        code_blocks = parser.findAll("div", attrs={"class": "code"})

        if len(code_blocks) > 0:
            for block in code_blocks:
                # if the code block's wrapper div doesn't have an id
                # attribute don't colorize the code
                if "id" in block.attrMap:
                    language = block.attrMap["id"]
                else:
                    continue

                # finding the exact place of the code
                layer = block.div if block.div else block
                # removing any html tags inside the code block
                [tag.extract() for tag in layer.findAll()]
                # getting the original code in the block
                code = "".join(layer.contents)
                # colorizing the code
                lexer = get_lexer_by_name(language)
                formatter = HtmlFormatter(linenos="table", style="tango", cssclass="code")
                colorized_code = Tag(parser, "div") if block.div else Tag(parser, "div", attrs=(("id", language), ("class", "code")))
                colorized_code.insert(0, highlight(code, lexer, formatter))
                layer.replaceWith(colorized_code)

            return parser.renderContents()

    return entry
Example #25
def neighborhood_kml(request,neighborhood):
    neighborhood = Neighborhood.objects.get(name=neighborhood)
    soup = BeautifulSoup(neighborhood.geom.kml)
    tag = Tag(soup, "extrude")
    soup.polygon.insert(0, tag)
    text = "1"
    tag.insert(0, text)
    xml = str(soup)
    return render_to_response("restaurants/kml_template.html",{'neighborhood': neighborhood,"xml": xml}, context_instance=RequestContext(request))
def linearize_cols_3(soup, table):
    if table.get('id') == "linearize-cols-3":
        div = Tag(soup, "ul")
        div["class"] = "div-container"
        ul_last = get_last_3(soup, table)
        ul_first = get_first_3(soup, table)        
        div.append(ul_first)
        div.append(ul_last)
        table.replaceWith(div)
def linearize_cols_2(soup, table):
    if table.get('id') == "linearize-cols-2":
        ul = Tag(soup, "ul")
        ul["class"] = "ul-container"
        ul_last = get_last_two(soup, table)
        ul_first = get_first_two(soup, table)        
        ul.append(ul_first)
        ul.append(ul_last)
        table.replaceWith(ul)
Example #28
def shorten_link(soup, link):
    api = googl.Googl(API_KEY)
    googl_link = api.shorten(link.get('href'))
    new_link = Tag(soup, 'a')
    new_link['href'] = googl_link.get('id', None)
    if new_link.get('href', None):
        new_link.setString(link.text)
        return new_link
    else:
        return None
Example #30
 def CreateSidebar(self, tag):
     '''Create the sidebar'''
     h3 = Tag(self.soup, 'h3')
     h3.string = self.textShortCap
     tag.append(h3)
     if self._Probability(20):
         tag.append(self.CreateParagraph())
     if self._Probability(90):
         tag.append(self.CreateList(0))
     else:
         tag.append(self.CreateSelect())
Example #32
def replaceJavascript(base_url, soup):
    for js in soup.findAll('script', {'src': re.compile('.+')}):
        try:
            real_js = get_content(resolve_path(base_url, js['src']))
            real_js = real_js.replace('</', '\\u003c/')  # escape '</' so inline JS cannot close the <script> tag early
            js_tag = Tag(soup, 'script')
            js_tag.insert(0, NavigableString(real_js))
            js.replaceWith(js_tag)
        except Exception, e:
            print 'failed to load javascript from %s' % js['src']
            print e
def createParentUlTag(targetSoup):
    parentUlTag = Tag(targetSoup, 'ul', attrs={'class' : 'xbreadcrumbs', 'id' : 'breadcrumbs'})
    topListTag = Tag(targetSoup, 'li')
    topAnchorTag = Tag(targetSoup, 'a', attrs={'href' : SITE_DOMAIN})
    topAnchorTag.append(NavigableString('TOP'))
    topListTag.append(topAnchorTag)
    parentUlTag.append(topListTag)
    return parentUlTag
Example #34
  def FixTableHeadings(self):
    '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Putting the "name" attribute into the "id" attribute of the <tr> tag.
      - Splitting up tables into multiple separate tables if a table
        heading appears in the middle of a table.

    For example, this html:
     <table>
      <tr><td colspan="2"><h2><a name="pub-attribs"></a>
      Data Fields List</h2></td></tr>
      ...
     </table>

    would be converted to this:
     <h2>Data Fields List</h2>
     <table>
      ...
     </table>
    '''

    table_headers = []
    for tag in self.soup.findAll('tr'):
      if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a['name']:
        #tag['id'] = tag.td.h2.a['name']
        tag.string = tag.td.h2.a.next
        tag.name = 'h2'
        table_headers.append(tag)

    # reverse the list so that earlier tags don't delete later tags
    table_headers.reverse()
    # Split up tables that have multiple table header (th) rows
    for tag in table_headers:
      print "Header tag: %s is %s" % (tag.name, tag.string.strip())
      # Is this a heading in the middle of a table?
      if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
        print "Splitting Table named %s" % tag.string.strip()
        table = tag.parent
        table_parent = table.parent
        table_index = table_parent.contents.index(table)
        new_table = Tag(self.soup, name='table', attrs=table.attrs)
        table_parent.insert(table_index + 1, new_table)
        tag_index = table.contents.index(tag)
        for index, row in enumerate(table.contents[tag_index:]):
          new_table.insert(index, row)
      # Now move the <h2> tag to be in front of the <table> tag
      assert tag.parent.name == 'table'
      table = tag.parent
      table_parent = table.parent
      table_index = table_parent.contents.index(table)
      table_parent.insert(table_index, tag)
Example #35
def replace_courier(soup):
    """Lacking a better option, I use courier font to mark <code>
    within tinyMCE. And I want to turn that into real code tags.

    Most users won't be needing this(?), so this code is not called anywhere
    but kept for reference
    """
    for t in soup.findAll(lambda s:s.has_key('style') and 'courier' in s['style']):
        tag = Tag(soup, 'code')
        while t.contents:
            tag.append(t.contents[0])
        t.replaceWith(tag)
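A quick demonstration of replace_courier (BeautifulSoup 3; whitespace may differ):

soup = BeautifulSoup('<p><span style="font-family: courier;">x = 1</span></p>')
replace_courier(soup)
print soup
# -> <p><code>x = 1</code></p>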
Example #37
    def FixTableHeadings(self):
        '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Putting the "name" attribute into the "id" attribute of the <tr> tag.
      - Splitting up tables into multiple separate tables if a table
        heading appears in the middle of a table.

    For example, this html:
     <table>
      <tr><td colspan="2"><h2><a name="pub-attribs"></a>
      Data Fields List</h2></td></tr>
      ...
     </table>

    would be converted to this:
     <h2>Data Fields List</h2>
     <table>
      ...
     </table>
    '''

        table_headers = []
        for tag in self.soup.findAll('tr'):
            if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a['name']:
                #tag['id'] = tag.td.h2.a['name']
                tag.string = tag.td.h2.a.next
                tag.name = 'h2'
                table_headers.append(tag)

        # reverse the list so that earlier tags don't delete later tags
        table_headers.reverse()
        # Split up tables that have multiple table header (th) rows
        for tag in table_headers:
            print("Header tag: %s is %s" % (tag.name, tag.string.strip()))
            # Is this a heading in the middle of a table?
            if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
                print("Splitting Table named %s" % tag.string.strip())
                table = tag.parent
                table_parent = table.parent
                table_index = table_parent.contents.index(table)
                new_table = Tag(self.soup, name='table', attrs=table.attrs)
                table_parent.insert(table_index + 1, new_table)
                tag_index = table.contents.index(tag)
                for index, row in enumerate(table.contents[tag_index:]):
                    new_table.insert(index, row)
            # Now move the <h2> tag to be in front of the <table> tag
            assert tag.parent.name == 'table'
            table = tag.parent
            table_parent = table.parent
            table_index = table_parent.contents.index(table)
            table_parent.insert(table_index, tag)
Example #38
    def parse_content(self, content, attachments, tags):

        soup = BeautifulSoup(content)
        pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')

        # images
        for match in soup.findAll('img'):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    importedname = self.import_file(filename)
                    match.replaceWith(Tag(soup, 'img', [('src', importedname)]))


        # pdfs
        for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    # convert pdf -> image
                    images = pdf2image(filename)

                    # import each jpg
                    imageTags = Tag(soup, "span")
                    for image in images:
                        importedname = self.import_file(image)
                        # add new image tag
                        imageTags.insert(images.index(image), Tag(soup, 'img', [('src', importedname)]))

                    # replace embed with <img src...> for each image
                    match.replaceWith(imageTags)

        # TODO: audio
        # TODO: video

        # plugins
        # TODO: qa-format as in Supermemo
        #for match in soup.find(string=re.compile("A:")):
        #    match['class'] = match.get('class', []) + ['Evernote2Anki-Highlight']

        return str(soup).decode('utf-8')
Example #39
def replace_courier(soup):
    """Lacking a better option, I use courier font to mark <code>
    within tinyMCE. And I want to turn that into real code tags.

    Most users won't be needing this(?), so this code is not called anywhere
    but kept for reference
    """
    # note: "'style' in s" would search the tag's contents, not its attributes,
    # so test the attribute with has_key instead
    for t in soup.findAll(lambda s: s.has_key('style') and 'courier' in s['style']):
        tag = Tag(soup, 'code')
        while t.contents:
            tag.append(t.contents[0])
        t.replaceWith(tag)
Example #40
def generate_heatmap(intensities):
    # Load the SVG map
    svg = open('counties.svg', 'r').read()
    # Load into Beautiful Soup
    soup = BeautifulSoup(svg, selfClosingTags=['defs', 'sodipodi:namedview'])
    # Find counties
    paths = soup.findAll('path')
    colors = [
        "#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5",
        "#08519C", "#08306B"
    ]
    min_value = min(intensities.values())
    max_value = max(intensities.values())
    scalefactor = (len(colors) - 1) / (log(max_value + 1) - log(min_value + 1))
    # County style
    path_style = 'font-size:12px;fill-rule:nonzero;stroke:#FFFFFF;stroke-opacity:1;stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none;stroke-linecap:butt;marker-start:none;stroke-linejoin:bevel;fill:'
    # we will append this hover tooltip after each county path
    hover_text = '''<text id="popup-%s" x="%s" y="%s" font-size="10" fill="black" visibility="hidden">%s (%s)<set attributeName="visibility" from="hidden" to="visible" begin="%s.mouseover" end="%s.mouseout"/></text>'''
    for p in paths:
        if p['id'] not in ["State_Lines", "separator"]:
            try:
                count = intensities[p['id']]
            except KeyError:
                count = 0  # no data for this county
            x, y = (p['d'].split()[1]).split(',')
            # insert a new text tag for the county hover tooltip...
            p.parent.insert(0, Tag(soup, 'text', [("id", 'popup-' + p['id'])]))
            hover = soup.find("text", {"id": 'popup-' + p['id']})
            hover.insert(1, "%s (%s)" % (p['inkscape:label'], str(count)))
            # add attributes to that text tag...
            hover['x'] = 250
            hover['y'] = 20
            hover['font-size'] = "20"
            hover['fill'] = "black"
            hover['visibility'] = "hidden"
            hover.insert(0,
                         Tag(soup, 'set', [("begin", p['id'] + '.mouseover')]))
            set_tag = soup.find("set", {"begin": p['id'] + '.mouseover'})
            set_tag['attributeName'] = "visibility"
            set_tag['from'] = "hidden"
            set_tag['to'] = "visible"
            set_tag['end'] = p['id'] + '.mouseout'
            color_class = min(int(scalefactor * log(count + 1)),
                              len(colors) - 1)
            # color_class = int((float(len(colors)-1) * float(count - min_value)) / float(max_value - min_value))
            # if count > 0:
            #   print color_class
            color = colors[color_class]
            p['style'] = path_style + color
    print soup.prettify()
def linearize_rows_1(soup, table):
    if table.get('id') == "linearize-rows-1":
        div = Tag(soup, "div")
        div["class"] = "center"
        for tr in table.findAll("tr"):
            lista = tr.findAll("td")
            for td in lista:
                for p in td.findAll("p"):
                    p.name = "span"
                td.name = "span"
                if td == lista[-1]:
                    td = BeautifulSoup(td.prettify())
                else:
                    td = BeautifulSoup(td.prettify() + '<span> | </span>')
                div.append(td)
        table.replaceWith(div)
Example #42
def makeHTMLQuestion(fn, htmldata):
    soup = BeautifulSoup(htmldata)
    #add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    soup.find('head').insert(0, SUBMIT_JS)
    #replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                if testmode:
                    form['action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form['action'] = 'http://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    mainurl = uploadfile(fn, str(soup))
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
Example #43
def content_absolute_links(content, image=None):
    from django.contrib.sites.models import Site
    current_site = Site.objects.get(pk=settings.SITE_ID)

    def abs_url(url):

        parsed = urlparse.urlparse(url)
        if parsed.netloc == parsed.scheme == '':
            url = urlparse.urljoin('http://{0}'.format(current_site.domain),
                                   url)
        return url

    soup = BeautifulSoup(content)

    if image:
        img = Tag(soup, 'img', [('src', image)])
        soup.insert(0, img)

    for link in soup.findAll('a'):
        link['href'] = abs_url(link['href'])

    for link in soup.findAll('img'):
        link['src'] = abs_url(link['src'])

    return unicode(soup)
Example #44
    def get_mobile_content(self, obj):
        if obj.mobile_content:
            content = obj.mobile_content
        else:
            content = obj.content

        if not self.host_det:  # apps only
            content = content.replace("\n<br />\n", "\n")
        elif self.host_det == "android":
            content = content.replace("\n<br />\n", "\n")
            soup = BeautifulSoup(content)
            # if soup.findAll('iframe'):
            #     gh = soup.findAll('iframe')[0]['src']
            #     hh = soup.findAll('iframe')
            for p in soup.findAll("iframe"):
                if "youtube" in p['src']:
                    newTag = Tag(soup, "a")
                    newTag.attrs.append(("src", p.get('src')))
                    p.append(newTag)
            content = unicode(soup)
        if obj.source is not None and obj.source != '':
            content = content + "<p>Sources: " + obj.source.replace(
                "<p>", "").replace("</p>", "") + "</p>"

        content = obj.get_modified_content(content, content_type='mobile')
        return content
def linearize_cols_1(soup, table):
    if table.get('id') == "linearize-cols-1":
        ul = Tag(soup, "ul")
        ul["class"] = "linearized"
        for td in table.findAll("td"):
            for p in td.findAll("p"):
                p.name = "span"
            try:
                text = ''.join(td.findAll(text=True))
                text = text.strip()
                if text != '' and text != '&nbsp;':
                    td.name = "li"
                    ul.append(td)
            except:
                pass
        table.replaceWith(ul)
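A small before/after sketch for linearize_cols_1 (BeautifulSoup 3; output whitespace may differ):

html = '<table id="linearize-cols-1"><tr><td><p>Alpha</p></td><td><p>Beta</p></td></tr></table>'
soup = BeautifulSoup(html)
linearize_cols_1(soup, soup.table)
print soup
# -> <ul class="linearized"><li><span>Alpha</span></li><li><span>Beta</span></li></ul>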
Example #46
def select_calendar(month=None, year=None):
    now = datetime.now()
    day = now.day
    cal = calendar.HTMLCalendar()
    cal.setfirstweekday(6)
    month_table = cal.formatmonth(year, month)
    soup = BeautifulSoup(month_table)
    outfile = open("myHTML.html", 'w')

    for data in soup.findAll('td'):
        if data['class'] != "noday":
            days = data.findAll(text=True)
            for oneday in days:
                day = NavigableString(oneday)
                oneday.extract()
                addatag = Tag(soup, 'input')
                addatag['type'] = "submit"
                addatag['name'] = "meetingday"
                addatag['value'] = day
                data.insert(0, addatag)

    outfile.write(soup.prettify())
    outfile.close()
    infile = open("myHTML.html", 'r')
    calfile = ""
    for line in infile:
        calfile = calfile + line
    infile.close()

    return calfile
Example #47
    def _set(self, topic, key, value, topic_attr=None):
        """Set key and value at topic
        
        :return: success status
        :rtype: bool"""

        # In case it is an empty document
        if not unicode(self._soup).strip().startswith("<?xml"):
            self._soup.insert(0, NavigableString(self.HEADER))

        # In case settings root is not defined
        settings = self._soup.find(self.root)
        if settings is None:
            self._soup.insert(1, Tag(self._soup, self.root))
            settings = self._soup.find(self.root)

        # Add Topic
        topic_tag = self._set_element(settings, topic.lower(), attr=topic_attr)

        if topic_tag is None:
            return False

        # Add key and value
        key_tag = self._set_element(topic_tag, key.lower(), escape(value))
        # Add "" since XML may introduce whitespaces.
        #key_tag = self._set_element(topic_tag, key, '"{0}"'.format(value))

        return key_tag is not None
Example #48
 def sanitize_story(self, story_content):
     soup = BeautifulSoup(story_content.strip())
     fqdn = Site.objects.get_current().domain
     
     for iframe in soup("iframe"):
         url = dict(iframe.attrs).get('src', "")
         youtube_id = self.extract_youtube_id(url)
         if youtube_id:
             a = Tag(soup, 'a', [('href', url)])
             img = Tag(soup, 'img', [('style', "display: block; 'background-image': \"url(https://%s/img/reader/youtube_play.png), url(http://img.youtube.com/vi/%s/0.jpg)\"" % (fqdn, youtube_id)), ('src', 'http://img.youtube.com/vi/%s/0.jpg' % youtube_id)])
             a.insert(0, img)
             iframe.replaceWith(a)
         else:
             iframe.extract()
     
     return unicode(soup)
Example #49
def initServerInfoBase(fileName):
    """
    @description: Intializes soup for the Beautiful soup parser. Reads the exisitng Data from the fileName paramter.
    @todo:None
    @param xml: String, Name of file to be loaded in soup.
    @return: Boolean, True if a successful, else False
    """
    if os.path.exists(fileName):
        try:
            f = open(fileName, "r")
        except:
            return None, False
        xml = f.read()
        f.close()
        soup = BeautifulStoneSoup(xml)
        serverinfolist = soup.findAll("serverinfo")
    else:
        serverinfolist = []
        soup = BeautifulSoup()
        xml = "null"

    if len(serverinfolist) == 0:
        serverinfo = Tag(soup, "serverinfo")
        soup.insert(0, serverinfo)

    return soup, True
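Typical use (the filename is illustrative; the file need not exist yet):

soup, ok = initServerInfoBase("serverinfo.xml")
if ok:
    print soup.prettify()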
Example #50
def add_noindex_to_a(text):
    doc = BeautifulSoup(text)
    host_orig = urlparse(settings.SITE_URL)[1]

    for a in doc.findAll('a'):
        try:
            host = urlparse(a['href'])[1]
        except KeyError:
            continue  # anchor without an href; skip it so 'host' is never stale or undefined

        if a.findParent('noindex') is None:
            if host != host_orig:
                noindex = Tag(doc, "noindex")
                a.replaceWith(noindex)
                a['rel'] = 'nofollow'
                noindex.insert(0, a)
    return unicode(doc)
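A rough sketch of the effect, assuming settings.SITE_URL points at http://example.com/ (hypothetical value):

text = 'see <a href="http://other.org/p">this</a>'
print add_noindex_to_a(text)
# roughly: see <noindex><a href="http://other.org/p" rel="nofollow">this</a></noindex>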
Example #51
 def CreateMetaKeywords(self):
     '''Create the keywords meta tag'''
     meta = Tag(self.soup, 'meta')
     meta['name'] = 'keywords'
     meta['content'] = '%s%s' % (
         random.choice(self._GetFileLines('keywords1.txt')).strip(),
         random.choice(self._GetFileLines('keywords2.txt')).strip())
     return meta
Example #52
 def CreateImage(self):
     '''Create an img tag'''
     img = Tag(self.soup, 'img')
     img['src'] = self.urlImage
     img['alt'] = self.textShort.replace('|]', '||]')
     if self._Probability(30):
         img['title'] = self.textShort
     return img
Example #53
 def CreateListItem(self, liClass=''):
     '''Create an li tag'''
     li = Tag(self.soup, 'li')
     if liClass != '':
         li['class'] = liClass
     else:
         self.AppendIds(li, 0, 50)
     return li
Example #54
 def startElement(self, name, attrs):
     #print("startElement", name, attrs, dir(attrs))
     self.endData()
     tag = Tag(name, attrs.items(), self.currentTag, self.previous)
     if self.previous:
         self.previous.next = tag
     self.previous = tag
     self.pushTag(tag)
def linearize_states(soup, table):
    if table.get('id') == "linearize-states":
        ul = Tag(soup, "ul")
        ul["class"] = "text-level3"
        tag = None
        for tr in table.findAll("tr"):
            tr.name = "span"
            tr["class"] = "spaced"
            for td in tr.findAll("td"):
                if td["width"] == "40%":
                    td.name = "li"
                    tag = td
                else:
                    tag.append(td)
                    td.name = "ul"
            ul.append(tr)
        table.replaceWith(ul)
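linearize_states assumes every cell carries a width attribute, with "40%" marking the leading cell of each row; a toy input (BeautifulSoup 3):

html = '<table id="linearize-states"><tr><td width="40%">Alabama</td><td width="60%">Montgomery</td></tr></table>'
soup = BeautifulSoup(html)
linearize_states(soup, soup.table)
print soup
# roughly: <ul class="text-level3"><span class="spaced"><li>Alabama<ul>Montgomery</ul></li></span></ul>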
Example #56
def get_list_for_key(name, children):
    """
    Takes a key and a dictionary containing its children and recursively
    generates HTML lists items. Each item will contain the name and, if it has
    children, an unordered list containing those child items.
    """

    li = Tag(SOUP, "li")
    li.append(NavigableString(name))

    if children:
        ul = Tag(SOUP, "ul")
        for k, v in children.items():
            ul.append(get_list_for_key(k, v))
        li.append(ul)

    return li
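A hypothetical driver, with SOUP as the module-level soup the function expects:

SOUP = BeautifulSoup()
print get_list_for_key('animals', {'cats': {}, 'dogs': {}})
# roughly (child order may vary): <li>animals<ul><li>cats</li><li>dogs</li></ul></li>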