Example #1
def sanitize_html(value, valid_tags=VALID_TAGS):
    """
    Strips unwanted markup out of HTML.
    """
    # TODO: This function needs unit tests.
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while 1:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                tag.attrs = [(attr, value) for attr, value in tag.attrs
                             if attr in valid_tags[tag.name]]
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    warn("This function is deprecated. Please use the bleach library",
         DeprecationWarning)
    return unicode(newoutput, 'utf-8')
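A minimal usage sketch for the example above. VALID_TAGS is not shown in the snippet; here it is assumed (hypothetically) to be a dict mapping each allowed tag name to its list of allowed attributes, since the loop indexes valid_tags[tag.name]:

from BeautifulSoup import BeautifulSoup, Comment
from warnings import warn

VALID_TAGS = {'p': [], 'b': [], 'a': ['href']}  # hypothetical whitelist

dirty = '<p onclick="evil()">Hi <script>alert(1)</script><a href="/home">home</a></p>'
print sanitize_html(dirty, VALID_TAGS)
# roughly: <p>Hi alert(1)<a href="/home">home</a></p>
# the <script> tag is hidden (its text survives) and onclick is dropped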
Example #2
def sanitize_html(value, valid_tags=VALID_TAGS):
    """
    Strips unwanted markup out of HTML.

    .. deprecated:: 0.2.5
       Use the bleach library instead.

    """
    # TODO: This function needs unit tests.
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while 1:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                tag.attrs = [(a, v) for a, v in tag.attrs if a in valid_tags[tag.name]]
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    warn("This function is deprecated. Please use the bleach library", DeprecationWarning)
    return unicode(newoutput, 'utf-8')
Example #3
def striptags(data):
	soup = BeautifulSoup(data)
	for tag in soup.findAll(True):
		tag.hidden = True
	data = soup.renderContents()
	soup = BeautifulSoup(data)
	comments = soup.findAll(text=lambda text:isinstance(text, Comment))
	[comment.extract() for comment in comments]
	return soup.renderContents()
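For reference, a quick check of what the two-pass strip above produces: the first pass hides every tag, the second removes comments (assuming the BeautifulSoup 3 imports used throughout these examples):

from BeautifulSoup import BeautifulSoup, Comment

print striptags('<div>keep <b>this text</b><!-- drop this comment --></div>')
# roughly: keep this text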
Example #4
def unwrap_html_body(html, css_class=None):
    """Return the content of the body tag for inline display in another
    html document.
    """
    soup = BeautifulSoup(html, fromEncoding='utf8')
    if soup.body:
        soup = soup.body
    body_soup = BeautifulSoup('<div>%s</div>' % soup.renderContents(), fromEncoding='utf8')
    if css_class:
        body_soup.div['class'] = css_class
    body_style = soup.get('style')
    if body_style:
        body_soup.div['style'] = body_style
    return body_soup.renderContents()
Example #5
def sanitize(value, allowed_tags=None):
    """
    Jacked from: http://www.djangosnippets.org/snippets/1655/

    Argument should be in the form 'tag1:attr1:attr2 tag2:attr1 tag3', where tags
    are allowed HTML tags, and attrs are the allowed attributes for that tag.
    """
    js_regex = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript')))

    WIKI_ALLOWED_TAGS = getattr(settings, 'WIKI_ALLOWED_TAGS', '')
    if allowed_tags is None:
        allowed_tags = WIKI_ALLOWED_TAGS
    else:
        allowed_tags = '%s %s'%(allowed_tags, WIKI_ALLOWED_TAGS)

    allowed_tags = [tag.split(':') for tag in allowed_tags.split()]
    allowed_tags = dict((tag[0], tag[1:]) for tag in allowed_tags)

    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    for tag in soup.findAll(True):
        if tag.name not in allowed_tags:
            tag.hidden = True
        else:
            tag.attrs = [(attr, js_regex.sub('', val)) for attr, val \
                in tag.attrs
                         if attr in allowed_tags[tag.name]]

    return soup.renderContents().decode('utf8')
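The 'tag:attr1:attr2' spec parsing above is compact; a sketch of what it yields for a hypothetical spec string:

spec = 'a:href:title img:src p'
allowed = [tag.split(':') for tag in spec.split()]
allowed = dict((tag[0], tag[1:]) for tag in allowed)
print allowed
# -> {'a': ['href', 'title'], 'img': ['src'], 'p': []}  (key order may vary)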
Example #6
def check_html(value, environ):
    """
    This function does the actual validation.
    environ must have 'allowed_tags' and
    'allowed_attributes' in it.
    
    Removes unwanted tags, attributes and
    comments.
    
    Value should be the string to be validated.
    """
    if type(value) != unicode:
        try:
            value = unicode(value)
        except UnicodeDecodeError:
            raise InvalidTiddlerError(
                'HTML Validation Failed: contents of tiddler not a valid string.'
            )

    url_regex = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript:')))

    soup = BeautifulSoup(value)

    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    for tag in soup.findAll(True):
        if tag.name not in environ['tiddlyweb.config']['allowed_tags']:
            tag.hidden = True
        tag.attrs = [
            (attr, url_regex.sub('', val)) for attr, val in tag.attrs
            if attr in environ['tiddlyweb.config']['allowed_attributes']
        ]

    return soup.renderContents().decode('utf8')
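A hypothetical environ for the validator above, matching the two configuration keys it reads:

environ = {
    'tiddlyweb.config': {
        'allowed_tags': ['p', 'a', 'em'],
        'allowed_attributes': ['href', 'title'],
    }
}
print check_html(u'<p onclick="x()">hi <a href="/t">t</a></p>', environ)
# roughly: <p>hi <a href="/t">t</a></p>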
Example #7
    def get(self):
        self.response.headers["Content-Type"] = "text/html"

        url = self.request.get("url")

        result = urlfetch.fetch(url=url, allow_truncated=True, deadline=10)
        if result.status_code == 200:
            soup = BeautifulSoup(result.content)
            if soup.base == None:
                base = Tag(soup, "base")
                base["href"] = url
                soup.head.insert(0, base)

            script = Tag(soup, "iframe")
            script["src"] = "http://backplanejs.appspot.com/rdfa-viewer.html"
            script[
                "style"
            ] = "background-color: transparent; width: 100%; height: 300px; padding: 0; margin: 0; overflow: hidden; position: fixed; bottom: 0; left: 0; border: 0; z-index: 999; "
            script["frameborder"] = "0"
            script["hspace"] = "0"
            script["vspace"] = "0"
            script["scrolling"] = "no"
            script["allowtransparency"] = "yes"
            soup.body.insert(0, script)

            self.response.out.write(soup.renderContents())
Example #8
def removeReadmoreDivs():
    wc = 0
    coll = getCollDrops().find()
    wc = coll.count()
    for d in coll:
        wc = wc - 1
        if wc % 1000 == 0:
            print(wc)
        for l in d["followed_links"]:
            html = ""
            if "simplehtml" in l:
                soup = BeautifulSoup(l["simplehtml"])
                for tag in soup.findAll(True):
                    if "id" in tag:
                        if "hide_" in str(tag["id"]) and "nr_hide_" not in str(
                                tag["id"]):
                            print(wc, tag["id"])
                            tag.hidden = True
                        if "readmore_" in str(
                                tag["id"]) and "nr_readmore_" not in str(
                                    tag["id"]):
                            tag.extract()
                html = soup.renderContents()
                l["simplehtml"] = html
                getCollDrops().save(d, safe=True)
Example #9
def get_ceo(url):
    page = urllib.urlopen(url)
    data_boxen = SoupStrainer('div', {'class': 'snapUniqueData'})
    soup = BeautifulSoup(page, parseOnlyThese=data_boxen)
    for tag in soup.findAll():
        tag.hidden = True
    ceo_table = soup.renderContents().decode('utf8')
    ceo_table = re.sub(r'(\t+?|Rank:.*?\))', '',
                       ceo_table).replace(' vs. Top 10',
                                          '').replace('Compare tool:',
                                                      'Company:')
    ceo_table = re.sub(r'(\n)', '|', ceo_table).replace('||', "")
    ceo_array = ceo_table.split('|')
    ceo_array = [re.sub('^\s', '', field).split(': ') for field in ceo_array]
    ceo_dict = dict()
    for field in ceo_array:
        if field[0]:
            if len(field):
                if 1 < len(field) <= 2:
                    ceo_dict[str(field[0])] = field[1]
                elif len(field) == 1:
                    ceo_dict['Address1'] = field[0]
    if ceo_dict.keys():
        #print ceo_dict
        save_ceo(**ceo_dict)
Example #10
def sanitize(value, cfg=None, base_url=None):
    if cfg is None:
        cfg = config.sanitizer
    if value is None:
        return value
    urlAttrs = 'href src'.split()  # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()  # Get rid of comments
    for tag in soup.findAll(True):
        if tag.name in cfg["replacements"]:
            tag.name = cfg["replacements"][tag.name]
        elif tag.name not in cfg["allowed_elements"]:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            is_allowed1 = attr in cfg["allowed_attributes"].get(tag.name, [])
            is_allowed2 = attr in cfg["allowed_attributes"].get("*", [])
            if is_allowed1 or is_allowed2:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))
    return soup.renderContents().decode('utf8')
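A hypothetical cfg for the sanitizer above, matching the keys it reads; the '*' entry in allowed_attributes applies to every element, and re_scripts and urljoin are assumed to be the module-level pattern and import seen in similar examples below:

cfg = {
    'replacements': {'b': 'strong', 'i': 'em'},      # tags renamed in place
    'allowed_elements': ['p', 'strong', 'em', 'a'],  # anything else is hidden
    'allowed_attributes': {'a': ['href'], '*': ['class']},
}
print sanitize('<b>hi</b> <a href="x.html">x</a>', cfg=cfg, base_url='http://example.com/')
# roughly: <strong>hi</strong> <a href="http://example.com/x.html">x</a>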
Example #11
def AI(jid, query, querer, group):
    taken = ["wiki", "google", "image", "auth code"]
    for x in taken:
        if query.lower().startswith(x):
            return
    #global therapist
    #reply = therapist.respond(query)
    global botsessions
    global bot1
    if jid not in botsessions:
        botsessions[jid] = bot1.create_session()
    reply = botsessions[jid].think(query)

    VALID_TAGS = ['br']
    soup = BeautifulSoup(reply)

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True

    reply = soup.renderContents()
    reply = reply.replace('<br />', '\n')

    if group:
        if reply != ".":
            modules.sender.message_queue(jid, reply)
    else:
        modules.sender.message_queue(jid, reply)
Example #12
def sanitize_html(value): #TODO execute at topic.save. 
    valid_tags = [
                    'br', 'strong', 'b', 'p', 'div', 'em', 'u', 'strike', 'ul',
                    'li', 'ol', 'a', 'img', 'highlight', 'sup', 'sub', 'span',
                    'big', 'small', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7',
                    'h8', 'pre', 'address', 'code', 'kbd', 'samp', 'var',
                    'del', 'ins', 'cite', 'q', 'bdo','embed', 'object'
                 ] # embed and object are unsafe...

    valid_attrs = 'href src width name height'.split() # only src and href, and only if they start with http://
    soup = BeautifulSoup(value)
    for comment in soup.findAll(
        text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        
        if tag.name not in valid_tags:

            tag.hidden = True

        def passes(attr, val): # It's important that these two attrs start with either www or http imo.
            if not attr == "src" and not attr == "href":
                return True
            else: 
                if str(val).startswith("http") or str(val).startswith("www"):
                    return True
            return False
            
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs and passes(attr, val)]
                   
    return re.sub(RE_EXXX, '', soup.renderContents().decode('utf8'))
Example #13
    def text_with_abs_url(self,request=None):

        page = BeautifulSoup(self.text.rendered)
        for image in page.findAll('img'):
            src_image = image.get('src')
            url_validate = URLValidator()
            try:
                url_validate(src_image)
            except Exception as e:
                print e
                src_image = request.build_absolute_uri(src_image)

            image['src'] = src_image

        for a in page.findAll('a'):
            href = a.get('href')
            url_validate = URLValidator()
            try:
                url_validate(href)

            except Exception as e:
                href = request.build_absolute_uri(href)
            a['href'] = href


        return page.renderContents()
Example #14
def removecut(string):
    soup = BeautifulSoup(string, selfClosingTags=['img','br'])
    tag = soup.find('yvcut')
    if not tag: return string
    tag.extract()
    string = soup.renderContents()
    return string    
Example #15
def sanitize_html(value):
    r = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript:')), re.IGNORECASE|re.MULTILINE)
    validTags = 'p span strong em u br a ul ol li blockquote'.split()
    validAttrs = 'href style'.split()
    validStyles = 'font-family font-size'.split()
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text:isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        # tag.attrs = [(attr, r.sub('', val)) for attr, val in tag.attrs if attr in validAttrs]
        attrs = []
        for attr, val in tag.attrs:
            if attr in validAttrs:
                v = r.sub('', val)
                if attr == 'style':
                    try:
                        # Remove unsupported styles
                        vx = [x.strip() for x in v.strip().lower().split(';') if x.strip()]
                        for a in vx:
                            if a:
                                x, y = a.split(':')
                                if x.strip() not in validStyles:
                                    v = ''
                                    break
                    except:
                        v = ''
                        break
                attrs.append((attr, v))
        tag.attrs = attrs
    return soup.renderContents().decode('utf8')
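The style whitelist above is all-or-nothing per attribute: one disallowed property empties the whole style value. A quick sketch:

print sanitize_html('<p style="font-size: 12px">ok</p>')
# roughly: <p style="font-size: 12px">ok</p>
print sanitize_html('<p style="font-size: 12px; color: red">not ok</p>')
# roughly: <p style="">not ok</p> -- "color" is not in validStyles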
Example #16
def clean_html(value, allowed_tags=VALID_TAGS, allowed_style=VALID_STYLE):
    """Argument should be in form 'tag2:attr1:attr2 tag2:attr1 tag3', where tags
    are allowed HTML tags, and attrs are the allowed attributes for that tag.
    """
    js_regex = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript')))
    if not isinstance(allowed_tags, dict):
        allowed_tags = [tag.split(':') for tag in allowed_tags.split()]
        allowed_tags = dict((tag[0], tag[1:]) for tag in allowed_tags)

    try:
        soup = BeautifulSoup(value)
        for comment in soup.findAll(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()

        for tag in soup.findAll(True):
            if tag.name not in allowed_tags:
                tag.hidden = True
            else:
                tag.attrs = [(attr, js_regex.sub('', val))
                             for attr, val in tag.attrs
                             if attr in allowed_tags[tag.name]]
                if tag.has_key('style'):
                    css_properties = cssutils.css.CSSStyleDeclaration(
                        cssText=tag['style'])
                    for x in css_properties:
                        if not (x.wellformed\
                                and x.name.lower() in allowed_style\
                                and x.valid):
                            css_properties.removeProperty(x.name)
                    tag['style'] = css_properties.cssText
        return soup.renderContents().decode('utf8')
    except:
        return ''
Example #17
def sanitize(value, invalid=""):
    try:
        from BeautifulSoup import BeautifulSoup, Comment
    
    except:
        return value
     
    valid_tags = 'a em code br p img i u h1 h2 h3 h4 h5 h6 pre'.split()
    valid_attrs = 'href src title'.split()
    [valid_tags.remove(item) for item in invalid.split()]
    soup = BeautifulSoup(value)
    for comment in soup.findAll(
        text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        else:
            for attr, val in tag.attrs:
                if re.match('javascript:', val, re.I) is not None:
                    tag.hidden = True
            tag.attrs = [(attr, val) for attr, val in tag.attrs if attr in valid_attrs]

    cleanup = {
               "<br/><br/>":"<br/>",
               "<p></p>":"",
              }
    s = soup.renderContents().decode('utf8')
    for (old, new) in cleanup.items(): s = s.replace(old, new)

    return s
Example #18
def track_links(content, context):
    """Convert all links in the template for the user
    to track his navigation"""
    if not context.get('uidb36'):
        return content

    soup = BeautifulSoup(content)
    for link_markup in soup('a'):
        if link_markup.get('href') and \
               'no-track' not in link_markup.get('rel', ''):

            # --- tracking ignore anchor --- start ----------------------------
            if TRACKING_IGNORE_ANCHOR:
                if link_markup.get('href').startswith('#'):
                    continue
            # --- tracking ignore anchor --- end ------------------------------

            link_href = link_markup['href']
            link_title = link_markup.get('title', link_href)
            link, created = Link.objects.get_or_create(url=link_href,
                                                       defaults={'title': link_title})
            link_markup['href'] = 'http://%s%s' % (context['domain'], reverse('newsletter_newsletter_tracking_link',
                                                                              args=[context['newsletter'].slug,
                                                                                    context['uidb36'], context['token'],
                                                                                    link.pk]))
    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
Example #19
def sanitize_html(value, clear=False, remove_pre=False, base_url=None):
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    if not clear:
        taglist = 'p i strong b ul ol li a h1 h2 h3 br img blockquote div'
        if not remove_pre:
            taglist += ' pre'
        validTags = taglist.split()
        validAttrs = 'href src width height lang class'.split()
        urlAttrs = 'href src'.split()  # Attributes which should have a URL
    else:
        validTags = ''
        validAttrs = ''
        urlAttrs = []
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))
    return soup.renderContents().decode('utf8')
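The rjs/rvb patterns above interleave optional whitespace and hex entities between the letters of javascript:/vbscript:, so split or entity-obfuscated scheme names are still caught. A standalone sketch:

import re

rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
re_scripts = re.compile(rjs, re.IGNORECASE)
print re_scripts.sub('', 'j a v a s c r i p t:alert(1)')  # roughly: alert(1)
print re_scripts.sub('', 'java&#x09;script:alert(1)')     # roughly: alert(1)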
Example #20
def sanitizeHtml(value, base_url=None):
    """
    Utility method for sanitizing user entered html.
    """
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'img br b blockquote code del dd dl dt em h1 h2 h3 i kbd li ol p pre s sup sub strong strike ul'.split(
    )
    validAttrs = 'href src width height'.split()
    urlAttrs = 'href src'.split()  # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))

    return soup.renderContents().decode('utf8')
Example #21
def sanitizeHtml(value, base_url=None):
    """
    Utility method for sanitizing user entered html.
    """
    rjs = r"[\s]*(&#x.{1,7})?".join(list("javascript:"))
    rvb = r"[\s]*(&#x.{1,7})?".join(list("vbscript:"))
    re_scripts = re.compile("(%s)|(%s)" % (rjs, rvb), re.IGNORECASE)
    validTags = "img br b blockquote code del dd dl dt em h1 h2 h3 i kbd li ol p pre s sup sub strong strike ul".split()
    validAttrs = "href src width height".split()
    urlAttrs = "href src".split()  # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub("", val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))

    return soup.renderContents().decode("utf8")
Example #22
def clean_html(html) :

        from BeautifulSoup import BeautifulSoup, Tag

        html = double_br_to_P.sub('<p>', html)
        #import codecs
        #OFD = codecs.open('/dev/stdout','w','utf-8')
        #print >>OFD, html

        soup = BeautifulSoup('<div>%s</div>'%html)

        for tag in soup.findAll('span', style='font-weight: bold;') :
            tag.name = 'b'
        for tag in soup.findAll('span', style='font-style: italic;') :
            tag.name = 'i'
        for tag in soup.findAll('span', style='text-decoration: underline;') :
            tag.name = 'u'

        for tag in soup.findAll(True) :
            if tag.name not in VALID_TAGS :
                tag.hidden = True
            else :
                for attr in tag._getAttrMap().keys() :
                    if attr not in VALID_ATTR :
                        del tag[attr]
            if tag.name == 'a' :
                tag['rel'] = 'nofollow'
                href = tag['href']
                if not href.startswith('http://') :
                    tag['href'] = 'http://' + href
        
        text = soup.renderContents()
        return remove_empty_P.sub('', text)
Example #23
def sanitize_html(value, clear=False, remove_pre=False, base_url=None):
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    if not clear:
        taglist = 'p i strong b ul ol li a h1 h2 h3 br img blockquote div'
        if not remove_pre:
            taglist+=' pre'
        validTags = taglist.split()
        validAttrs = 'href src width height lang class'.split()
        urlAttrs = 'href src'.split() # Attributes which should have a URL
    else:
        validTags = ''
        validAttrs = ''
        urlAttrs = []
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val) # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val) # Calculate the absolute url
                tag.attrs.append((attr, val))
    return soup.renderContents().decode('utf8')
Example #24
def clean_html_style(data, element, remove_comments=True, remove_empty=True):
    """removes the style information associated with html element

    >>> t = '<!--  /* Style Definitions */ table.MsoNormalTable	{mso-style-name:"Table Normal";	mso-tstyle-rowband-size:0;	mso-tstyle-colband-size:0;	mso-style-noshow:yes;	mso-style-priority:99;	mso-style-qformat:yes;	mso-style-parent:"";	mso-padding-alt:0in 5.4pt 0in 5.4pt;	mso-para-margin-top:0in;	mso-para-margin-right:0in;	mso-para-margin-bottom:10.0pt;	mso-para-margin-left:0in;	line-height:115%;	mso-pagination:widow-orphan;	font-size:11.0pt;	font-family:"Calibri","sans-serif";	mso-ascii-font-family:Calibri;	mso-ascii-theme-font:minor-latin;	mso-hansi-font-family:Calibri;	mso-hansi-theme-font:minor-latin;} --><p>  </p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">?</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p')
    '<p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_empty=False)
    '<p> </p><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_comments=False)
    '<!--  /* Style Definitions */ table.MsoNormalTable\t{mso-style-name:"Table Normal";\tmso-tstyle-rowband-size:0;\tmso-tstyle-colband-size:0;\tmso-style-noshow:yes;\tmso-style-priority:99;\tmso-style-qformat:yes;\tmso-style-parent:"";\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\tmso-para-margin-top:0in;\tmso-para-margin-right:0in;\tmso-para-margin-bottom:10.0pt;\tmso-para-margin-left:0in;\tline-height:115%;\tmso-pagination:widow-orphan;\tfont-size:11.0pt;\tfont-family:"Calibri","sans-serif";\tmso-ascii-font-family:Calibri;\tmso-ascii-theme-font:minor-latin;\tmso-hansi-font-family:Calibri;\tmso-hansi-theme-font:minor-latin;} --><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    """
    try:
        soup = BeautifulSoup(data)
    except:
        soup = BeautifulSoup(data)
    # remove all comments in this html block
    if remove_comments:
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

    # remove all occurrences of tags like sup, script
    [i.extract() for i in soup.findAll(re.compile('sup|script'))]

    # find all occurrences of the "element" tag
    for i in soup.findAll(element):
        text = i.renderContents().strip()
        if text:
            new_tag = Tag(soup, element)
            new_tag.insert(0, text)
            i.replaceWith(new_tag)
        elif remove_empty:
            i.extract()
    return smart_unicode(soup.renderContents())
Example #25
def sanitize(html, allowed_tags=ALLOWED_TAGS, allowed_attributes=ALLOWED_ATTRIBUTES, allowed_values=ALLOWED_VALUES, add_attributes=ADD_ATTRIBUTES, remove_comments=True):

    soup = BeautifulSoup(html)

    if remove_comments:
        for comment in soup.findAll(text=lambda text:isinstance(text, Comment)):
            comment.extract()

    for node in soup.findAll(True):
        # remove disallowed tags
        if node.name not in allowed_tags:
            node.extract()
            continue

        # clean attributes
        attrs = []
        if node.name in allowed_attributes:
            for attr, val in node.attrs:
                if attr in allowed_values:
                    if val in allowed_values[attr]:
                        attrs.append((attr, val))

        node.attrs = attrs

        # add attributes
        if node.name in add_attributes:
            node.attrs.extend(add_attributes[node.name])

    return soup.renderContents().decode('utf8')
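Hypothetical constants matching the lookups in the sanitizer above: allowed_attributes lists the tags whose attributes survive at all, allowed_values maps an attribute to its permitted values, and add_attributes maps a tag to (name, value) pairs that are always appended:

ALLOWED_TAGS = ['p', 'em', 'a']
ALLOWED_ATTRIBUTES = ['a']                      # only <a> keeps any attributes
ALLOWED_VALUES = {'href': ['/home', '/about']}  # and only with these exact values
ADD_ATTRIBUTES = {'a': [('rel', 'nofollow')]}   # appended to every surviving <a>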
Example #26
def unparse(value):
    """Revert parser activity

    Keyword arguments:
    value -- String

    Returns: String
    """
    value = value.replace('<br />', '\n')
    soup = BeautifulSoup(value)
    for code in soup.findAll('table', {'class': 'highlighttable'}):
        try:
            new_code = Code.objects.get(id=int(code['id']))
            code.replaceWith('<code lang="%s">%s</code>' % (new_code.lang, new_code.code))
        except Code.DoesNotExist:
            pass
    for user in soup.findAll('a'):
        try:
            if 'user_tag' in user['class'].split(' '):
                user.replaceWith("<user>%s</user>" % user.string)
        except Exception:
            pass
    for quote in soup.findAll('div', {'class': 'quote'}):
        text = quote.__unicode__().replace('<div class="quote">', '')
        text = ''.join(text.split('</div>')[:-1])
        quote.replaceWith("<quote>%s</quote>" % text)
    for quote in soup.findAll('div', {'class': 'spoiler'}):
        text = quote.__unicode__().replace('<div class="spoiler">', '')
        text = ''.join(text.split('</div>')[:-1])
        quote.replaceWith("<spoiler>%s</spoiler>" % text)
    return soup.renderContents().decode('utf8').replace('</fcut>', '').replace('</cut>', '')
Example #27
class Transformer(object):

    def __init__(self, html, destdir, context):
        self.destdir = destdir
        self.soup = BeautifulSoup(html)
        self.context = context
        self.images = self._collectImages()

    def __call__(self, transformations):
        for transform in transformations:
            method = TRANSFORMATIONS.get(transform)
            params = dict(context=self.context,
                          request=self.context.REQUEST,
                          destdir=self.destdir,
                          images=self.images)
            if method is None:
                raise ValueError('No transformation "%s" registered' % transform)

            method(self.soup, params)

    def render(self):
        return self.soup.renderContents()

    def _collectImages(self):
        """ Collect paths of all images within subtree """
        images = list()
        for brain in self.context.portal_catalog(portal_type='Image',
                                                 path='/'.join(self.context.getPhysicalPath())):
            images.append(brain.getPath())
        return images
Example #28
def track_links(content, context):
    """Convert all links in the template for the user
    to track his navigation"""
    if not context.get('uidb36'):
        return content

    soup = BeautifulSoup(content)
    for link_markup in soup('a'):
        if link_markup.get('href') and \
               'no-track' not in link_markup.get('rel', ''):
            link_href = link_markup['href']
            link_title = link_markup.get('title', link_href)
            link, created = Link.objects.get_or_create(
                url=link_href, defaults={'title': link_title})
            link_markup['href'] = 'http://%s%s' % (
                context['domain'],
                reverse('newsletter_newsletter_tracking_link',
                        args=[
                            context['newsletter'].slug, context['uidb36'],
                            context['token'], link.pk
                        ]))
    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
Example #29
def sanitize_html(value):
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor produces newer
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),  # when loading them as I's the editor leaves them
        ("b", "b"),  # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        if element != replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    return (
        fragment.replace("&nbsp;", " ")
        .replace("&rsquo;", "'")
        .replace("&lsquo;", "'")
        .replace("&quot;", '"')
        .replace("&ldquo;", '"')
        .replace("&rdquo;", '"')
        .replace("&bull;", "- ")
        .replace("&eacute;", "e")
        .replace("&Eacute;", "E")
        .replace("&ndash;", "-")
    )
Example #30
    def __getData(self, post):
        page = {}
        try:
            post_tag = BeautifulSoup(post.__str__().replace('/>>', '/>'))
            table_tag = post_tag.find('table')
            if table_tag:
                table_tag.extract()
            try:
                page['data'] = stripHtml(post_tag.renderContents())
                page['title'] = ''
            except:
                log.exception(self.log_msg('Data not found for the url %s' % self.currenturi))
                return

            try:
                date_str = stripHtml(table_tag.findAll('strong')[-1].renderContents())
                page['posted_date'] = datetime.strftime(datetime.\
                                        strptime(re.sub("(\d+)(st|nd|rd|th)", r"\1", date_str).\
                                        strip(), "%d %B %Y"), "%Y-%m-%dT%H:%M:%SZ")
            except:
                log.exception(self.log_msg('Posted date not found'))
                page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
            try:
                page['et_author_name'] = stripHtml(table_tag.findAll('strong')[0].renderContents())
            except:
                log.exception(self.log_msg('author name not found'))
        except:
            log.exception(self.log_msg('post tag not found'))

        return page
Example #31
def extractLinks(postSoup):
    linkSoup = BeautifulSoup()
    for tag in postSoup.findAll("a"):
        if "href" in tag:
            linkSoup.insert(len(linkSoup), tag["href"])
    
    return linkSoup.renderContents()
Example #32
def sanitize_html(value, allowed):
    """
    Strips all [X]HTML tags except the list of tags and attributes passed
    by the filter.

    {{ element_to_cleanse|sanitize:"strong em a p,href src" }}
    """
    tags, attrs = allowed.split(',')
    valid_tags = tags.split()
    valid_attrs = attrs.split()
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        if len(valid_attrs) != 0:
            tag.attrs = [(attr, val) for attr, val in tag.attrs
                         if attr in valid_attrs]
        else:
            tag.attrs = [(attr, val) for attr, val in tag.attrs]
    javascript_re = re.compile(
        'j[\s]*(&#x.{1,7})?a[\s]*(&#x.{1,7})?v[\s]*(&#x.{1,7})?a[\s]*(&#x.{1,7})?s[\s]*(&#x.{1,7})?c[\s]*(&#x.{1,7})?r[\s]*(&#x.{1,7})?i[\s]*(&#x.{1,7})?p[\s]*(&#x.{1,7})?t[\s]*(&#x.{1,7})?:',
        re.IGNORECASE)
    return javascript_re.sub('', soup.renderContents().decode('utf8'))
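A usage sketch for the filter above, using the argument format from its docstring (tag list and attribute list separated by a comma):

print sanitize_html('<p onclick="x()">hi <em>there</em> <script>bad()</script></p>',
                    'p em,href src')
# roughly: <p>hi <em>there</em> bad()</p> -- <script> is hidden, onclick dropped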
Example #33
def removecut(string):
    soup = BeautifulSoup(string, selfClosingTags=['img', 'br'])
    tag = soup.find('yvcut')
    if not tag: return string
    tag.extract()
    string = soup.renderContents()
    return string
Example #34
    def lyrics(self, mess, args):
        """Fetches lyrics from the given artist and song.
        !lyrics Rick Astley : Never Gonna Give You Up"""
        try:
            artist, title = args.split(':')
        except ValueError:
            return 'usage: !lyrics artist : title'

        self.send(mess.getFrom(), '/me is looking for your lyrics...', message_type='groupchat')

        artist = artist.strip().replace(' ', '_').title()
        title = title.strip().replace(' ', '_').title()

        artist = urllib.quote(artist)
        title = urllib.quote(title)

        lyrics = urllib.urlopen('http://lyricwiki.org/%s:%s' % (artist, title))
        text = lyrics.read()
        soup = BeautifulSoup(text)
        lyrics = soup.findAll(attrs={'class': 'lyricbox'})

        if lyrics:
            lyrics = lyrics[0].renderContents()
            lyrics = lyrics.replace('<br />', '\n')
            lyrics = re.sub('<[^<]*?/?>', '', lyrics)                        # strip html tags
            lyrics = re.sub('<!--.*-->', '', lyrics, flags=re.DOTALL)        # strip html comments
            lyrics = re.sub(' ?Send.*?Ringtone to your Cell ?', '', lyrics)  # strip ads

            # parse HTML entities
            entities = BeautifulSoup(lyrics, convertEntities=BeautifulSoup.HTML_ENTITIES)
            return entities.renderContents()
        else:
            return 'Lyrics not found.'
Example #35
class Transformer(object):
    def __init__(self, html, destdir, context):
        self.destdir = destdir
        self.soup = BeautifulSoup(html)
        self.context = context
        self.images = self._collectImages()

    def __call__(self, transformations):
        for transform in transformations:
            method = TRANSFORMATIONS.get(transform)
            params = dict(context=self.context,
                          request=self.context.REQUEST,
                          destdir=self.destdir,
                          images=self.images)
            if method is None:
                raise ValueError('No transformation "%s" registered' %
                                 transform)

            method(self.soup, params)

    def render(self):
        return self.soup.renderContents()

    def _collectImages(self):
        """ Collect paths of all images within subtree """
        images = list()
        for brain in self.context.portal_catalog(
                portal_type='Image',
                path='/'.join(self.context.getPhysicalPath())):
            images.append(brain.getPath())
        return images
Example #36
def linkify(text, word_dic):
    """
    take an HTML text and replace words that match a key in a dictionary with
    the associated value; return the changed text.
    This filter should run after the markup filter and the syntax highlight filter.
    """

    VALID_TAGS = ['p', 'li']

    def translate(match):
        key = match.group(0)
        value = word_dic[key.lower()]
        # only first keyword must be linkified
        if key in linkified_words:
            return key
        else:
            linkified_words.append(key)
            return value

    list_dict = map(re.escape, word_dic)
    list_dict2 = ['\\b%s\\b' % x for x in list_dict]
    re_str = '|'.join(list_dict2)
    rc = re.compile(re_str, re.IGNORECASE)
    soup = BeautifulSoup(text)
    textNodes = soup.findAll(text=True)

    linkified_words = []
    for textNode in textNodes:
        parent = textNode.findParent()
        if parent.name in VALID_TAGS:
            urlifiedtext = rc.sub(translate, textNode)
            textNode.replaceWith(urlifiedtext)
    return soup.renderContents()
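A quick sketch of the keyword linker above with a hypothetical dictionary; only text under the VALID_TAGS parents is rewritten, and only the first occurrence of each keyword is linkified:

word_dic = {'python': '<a href="/wiki/python">Python</a>'}
print linkify('<p>I like Python. Python is fun.</p>', word_dic)
# roughly: <p>I like <a href="/wiki/python">Python</a>. Python is fun.</p>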
Example #37
def crawl(url, prevLevel=0):
    if prevLevel > 1:
        return None
    try:
        page = urllib2.urlopen(url)
    except (urllib2.URLError, ValueError):
        return None

    try:
        soup = BeautifulSoup(page)
    except UnicodeEncodeError:
        return None
    root = {}
    root["url"] = url
    root["children"] = []

    anchors = soup.findAll('a')
    for a in anchors:
        link = a.get('href')
        if link is not None:
            child = crawl(a['href'], prevLevel + 1)
            if child is not None:
                print child["url"]
                root["children"].append(child)

    root["content"] = soup.renderContents()
    return root
Example #38
def sanitize_html(value, valid_tags=VALID_TAGS, base_url=None):
    """Remove all html tags except VALID_TAGS"""
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = valid_tags.split()
    validAttrs = 'href src width height'.split()
    urlAttrs = 'href src'.split()  # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))

    return soup.renderContents().decode('utf8')
Example #39
def extractLinks(postSoup):
    linkSoup = BeautifulSoup()
    for tag in postSoup.findAll("a"):
        if "href" in tag:
            linkSoup.insert(len(linkSoup), tag["href"])

    return linkSoup.renderContents()
Example #40
def linkify(text, word_dic):
    """
    take an HTML text and replace words that match a key in a dictionary with
    the associated value; return the changed text.
    This filter should run after the markup filter and the syntax highlight filter.
    """

    VALID_TAGS = ['p', 'li']

    def translate(match):
        key = match.group(0)
        value = word_dic[key.lower()]
        # only first keyword must be linkified
        if key in linkified_words:
            return key
        else:
            linkified_words.append(key)
            return value

    list_dict = map(re.escape, word_dic)
    list_dict2 = ['\\b%s\\b' % x for x in list_dict]
    re_str = '|'.join(list_dict2)
    rc = re.compile(re_str, re.IGNORECASE)
    soup = BeautifulSoup(text)
    textNodes = soup.findAll(text=True)

    linkified_words = []
    for textNode in textNodes:
        parent = textNode.findParent()
        if parent.name in VALID_TAGS:
            urlifiedtext = rc.sub(translate, textNode)
            textNode.replaceWith(urlifiedtext)
    return soup.renderContents()
Example #41
def remove_empty_html_tag(text, tag, extract=[]):
    """
    check if the tag is empty and, if so, remove it;
    spaces and &nbsp; are treated as empty.
    Additional elements to extract first can be passed in the extract list.
    
    >>> text = "<p> valid p tags</p><p> <br/> <br/> </p><p>&nbsp;    </p>"
    >>> remove_empty_html_tag(text,'p')
    u'<p> valid p tags</p>'
    >>>
    >>> text = "<p> valid p tags</p><p> <em>junk01</em><br/> <em>more junk</em><br/> <strong>%^$#@</strong></p><p>&nbsp;    </p>"
    >>> remove_empty_html_tag(text,'p', ['em'])
    u'<p> valid p tags</p><p> <br /> <br /> <strong>%^$#@</strong></p>'
    >>> remove_empty_html_tag(text,'p', ['strong'])
    u'<p> valid p tags</p><p> <em>junk01</em><br /> <em>more junk</em><br /> </p>'
    >>> remove_empty_html_tag(text,'p', ['strong', 'em'])
    u'<p> valid p tags</p>' 
    >>> text = "<table><tbody><tr><td>&nbsp;</td></tr><tr><td><em>Agni-III</em></td></tr></tbody></table>"
    >>> remove_empty_html_tag(text, 'table', ['em'])
    u''
    >>> text = "<table><tbody><tr><td>&nbsp;</td></tr><tr><td>Some other text<em>Agni-III</em></td></tr></tbody></table>"
    >>> remove_empty_html_tag(text, 'table', ['em'])
    u'<table><tbody><tr><td>&nbsp;</td></tr><tr><td>Some other text</td></tr></tbody></table>'
    >>> 

    """
    soup = BeautifulSoup(text)
    for s in soup.findAll(re.compile(tag)):
        tmp = s

        # extract the extra elements before doing a null check
        for i in extract:
            [x.extract() for x in tmp.findAll(i)]
        # if null check is true, extract from the original soup
        # (null check reconstructed from the doctests above: whitespace- and
        # &nbsp;-only content counts as empty)
        if not ''.join(tmp.findAll(text=True)).replace('&nbsp;', '').strip():
            s.extract()
    return smart_unicode(soup.renderContents())
Example #42
    def get_page(self, url, ignore_cache=False):
        if not isinstance(url, str):
            try: url = str(url)
            except: raise TypeError
        cached_content = ''
        if self.common.uses_cache() and not ignore_cache:
            cached_content = self.common.check_cache_expired(url)
        if cached_content:
            return cached_content
        else:
            try:
                cookies = self.net.set_cookies(self.common.cookie_file)
                content = self.net.http_GET(url, {'Referer': url}).content
                cookies = self.net.save_cookies(self.common.cookie_file)
                if isinstance(content, unicode):
                    content = content.encode('utf-8')
                soup = BeautifulSoup(content, convertEntities=BeautifulSoup.HTML_ENTITIES)
                return_content = soup.renderContents()
                if self.common.uses_cache() and not ignore_cache:
                    self.common.add_to_cache(url, return_content)
                return return_content
            except urllib2.HTTPError, e:
                self.error(e)
                return ''
            except (urllib2.URLError, httplib.HTTPException, AttributeError, ValueError, Exception), e:
                self.error(e)
                return ''
Example #43
def sanitizeHtml(value, base_url=None):
    """
    Allow only whitelisted tags. I have changed this method slightly to allow
    defining tag whitelist in project's settings module.
    
    @see: http://stackoverflow.com/questions/16861/sanitising-user-input-using-python/25136#25136
    """
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = settings.VALID_TAGS
    validAttrs = settings.VALID_ATTRS
    urlAttrs = settings.URL_ATTRS
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val) # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val) # Calculate the absolute url
                tag.attrs.append((attr, val))

    return soup.renderContents().decode('utf8')
Example #44
def sanitizeHtml(value, base_url=None):
    """ From an example on StackOverflow 
        http://stackoverflow.com/questions/16861/sanitising-user-input-using-python
    """
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'p i strong b u a h1 h2 h3 pre br img'.split()
    validAttrs = 'href src width height'.split()
    urlAttrs = 'href src'.split() # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val) # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val) # Calculate the absolute url
                tag.attrs.append((attr, val))

    return soup.renderContents().decode('utf8')
Example #45
def check_html(value, environ):
    """
    This function does the actual validation.
    environ must have 'allowed_tags' and
    'allowed_attributes' in it.
    
    Removes unwanted tags, attributes and
    comments.
    
    Value should be the string to be validated.
    """
    if type(value) != unicode:
        try:
            value = unicode(value)
        except UnicodeDecodeError:
            raise InvalidTiddlerError('HTML Validation Failed: contents of tiddler not a valid string.')
    
    url_regex = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))) 
    
    soup = BeautifulSoup(value)
    
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)): 
        comment.extract()                                        
    
    for tag in soup.findAll(True):
        if tag.name not in environ['tiddlyweb.config']['allowed_tags']:
            tag.hidden = True
        tag.attrs = [(attr, url_regex.sub('', val)) for attr, val in tag.attrs
            if attr in environ['tiddlyweb.config']['allowed_attributes']]
                     
    return soup.renderContents().decode('utf8')
Example #46
def isohunt_search(q):
    #Query the isohunt search engine and get the results HTML
    q = urllib.quote(q)
    soup = Soup(open_url('http://isohunt.com/torrents/?ihq=%s' % q),
                convertEntities='html',
                markupMassage=hexentityMassage)
    anchors = select(soup, 'a[id^link]')
    anchors = filter(lambda a: a.parent.name == 'td', anchors)
    results = {}
    for a in anchors:
        if str(a.contents[0]) != '0':
            a = Soup(a.renderContents().split("<br />").pop())
            result = ' '.join([
                unicode(node.renderContents())
                if type(node) != NavigableString else unicode(node)
                for node in a.contents
            ])
            result = scene_cleanup(result)
            if result not in results.keys():
                results[result] = 1
            else:
                results[result] += 1

    results = sorted(results.iteritems(), key=operator.itemgetter(1))
    res = []
    for r in results:
        res = [r[0]] + res
    return res
Example #47
def AI(jid,query,querer,group):
	taken=["wiki","google","image","auth code"]
	for x in taken:
		if query.lower().startswith(x):
			return
	#global therapist
	#reply = therapist.respond(query)
	global botsessions
	global bot1
	if jid not in botsessions:
		botsessions[jid]=bot1.create_session()
	reply = botsessions[jid].think(query)
	
	VALID_TAGS = ['br']
	soup = BeautifulSoup(reply)

	for tag in soup.findAll(True):
		if tag.name not in VALID_TAGS:
			tag.hidden = True

	reply=soup.renderContents()
	reply=reply.replace('<br />','\n')
	
	if group:
		if reply!=".":
			modules.sender.message_queue(jid,reply)
	else:
		modules.sender.message_queue(jid,reply)
Example #48
def sanitise(value):
    whitelist = [
        'a:title:href', 'abbr:title', 'acronym:title', 'address',
        'blockquote:cite', 'br', 'caption', 'center', 'cite:url', 'code',
        'dd', 'del:cite:datetime', 'dfn', 'dl', 'dt', 'em', 'h1:id', 'h2:id',
        'h3:id', 'h4:id', 'h5:id', 'h6:id', 'hr', 'img:src:alt:width:height',
        'ins:cite:datetime', 'kbd', 'li', 'ol', 'p', 'pre', 'q:cite', 'samp',
        'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead',
        'tr', 'ul', 'var',
    ]
    
    js_regex = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript')))
    allowed_tags = [tag.split(':') for tag in whitelist]
    allowed_tags = dict((tag[0], tag[1:]) for tag in allowed_tags)
    
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, BSComment)):
        comment.extract()
    
    for tag in soup.findAll(True):
        if tag.name not in allowed_tags:
            tag.hidden = True
        else:
            tag.attrs = [(attr, js_regex.sub('', val)) for attr, val in tag.attrs
                         if attr in allowed_tags[tag.name]]
    
    return soup.renderContents().decode('utf8')
Example #49
    def lyrics(self, mess, args):
        """Fetches lyrics from the given artist and song.
        !lyrics Rick Astley : Never Gonna Give You Up"""
        try:
            artist, title = args.split(':')
        except ValueError:
            return 'usage: !lyrics artist : title'

        artist = artist.strip().replace(' ', '_').title()
        title = title.strip().replace(' ', '_').title()

        artist = urllib.quote(artist)
        title = urllib.quote(title)

        lyrics = urllib.urlopen('http://lyricwiki.org/%s:%s' % (artist, title))
        text = lyrics.read()
        soup = BeautifulSoup(text)
        lyrics = soup.findAll(attrs={'class': 'lyricbox'})

        if lyrics:
            lyrics = lyrics[0].renderContents()
            lyrics = lyrics.replace('<br />', '\n')
            lyrics = re.sub('<[^<]*?/?>', '', lyrics)  # strip html tags
            lyrics = re.sub('<!--.*-->', '', lyrics,
                            flags=re.DOTALL)  # strip html comments
            lyrics = re.sub(' ?Send.*?Ringtone to your Cell ?', '',
                            lyrics)  # strip ads

            # parse HTML entities
            entities = BeautifulSoup(
                lyrics, convertEntities=BeautifulSoup.HTML_ENTITIES)
            return entities.renderContents()
        else:
            return 'Lyrics not found.'
Example #50
def sanitizeHtml(value, base_url=None):
    """
    Allow only whitelisted tags. I have changed this method slightly to allow
    defining tag whitelist in project's settings module.
    
    @see: http://stackoverflow.com/questions/16861/sanitising-user-input-using-python/25136#25136
    """
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = settings.VALID_TAGS
    validAttrs = settings.VALID_ATTRS
    urlAttrs = settings.URL_ATTRS
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val) # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val) # Calculate the absolute url
                tag.attrs.append((attr, val))

    return soup.renderContents().decode('utf8')
Example #51
def sanitize(value, extra_filters=None):
    """
    Sanitize the given HTML.

    Based on code from:
    * http://www.djangosnippets.org/snippets/1655/
    * http://www.djangosnippets.org/snippets/205/
    """
    if value is None:
        return u''

    if '<' not in value and '&#' not in value and \
            re.search(r'&\w+;', value) is None: # no HTML
        # convert plain-text links into HTML
        return mark_safe(urlize(value, nofollow=True, autoescape=True))

    js_regex = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript')),
                          re.IGNORECASE)
    allowed_tags = ('p i strong em b u a h1 h2 h3 h4 h5 h6 pre br img ul '
                    'ol li span').split()
    allowed_attributes = 'href src style'.split()

    whitelist = False
    extra_tags = ()
    extra_attributes = ()
    if isinstance(extra_filters, basestring):
        if '|' in extra_filters:
            parts = extra_filters.split('|')
        else:
            parts = [extra_filters]  # a single section: the whole string is one tag list
        if parts[0] == 'whitelist':
            whitelist = True
            parts = parts[1:]
        extra_tags = parts[0].split()
        if len(parts) > 1:
            extra_attributes = parts[1].split()
    elif extra_filters:
        extra_tags = extra_filters

    if whitelist:
        allowed_tags, allowed_attributes = extra_tags, extra_attributes
    else:
        allowed_tags = set(allowed_tags) - set(extra_tags)
        allowed_attributes = set(allowed_attributes) - set(extra_attributes)

    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # remove comments
        comment.extract()

    for tag in soup.findAll(True):
        if tag.name not in allowed_tags:
            tag.hidden = True
        else:
            tag.attrs = [(attr, js_regex.sub('', val))
                         for attr, val in tag.attrs
                         if attr in allowed_attributes]

    return mark_safe(soup.renderContents().decode('utf8'))
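The extra_filters string format above is terse; a sketch of the whitelist form, where '|'-separated sections select the mode, the tag list, and the attribute list:

print sanitize('<p onclick="x()">hi <b>there</b></p>', 'whitelist|p em|style')
# roughly: <p>hi there</p> -- only p/em survive and only the style attribute is kept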
Example #52
def code2tree_v2(code):
    soup = BeautifulSoup(code)
    #code = code.decode("utf-8")
    #code = unicode(soup.prettify(encoding="UTF-8"))
    code = unicode(soup.renderContents(), errors='ignore')
    f = io.StringIO(code)
    tree = html.parse(f)
    return tree
Example #53
def href_filter(soup):
    hrefs = soup.findAll('a')
    for href in hrefs:
        body = BeautifulSoup(href.renderContents())
        if len(body.contents) == 1:
            href.replaceWith(body.contents[0])
        else:
            href.replaceWith(body.renderContents())
Example #54
def sanitize_html(htmlSource,
                  encoding=None,
                  type="text/html",
                  valid_tags=None,
                  valid_styles=None):
    """
    Clean bad html content. Currently this simply strips tags that
    are not in the VALID_TAGS setting.
    
    This function is used as a replacement for feedparser's _sanitizeHTML
    and fixes problems like unclosed tags and gives finer grained control
    over what attributes can appear in what tags.

    Returns the sanitized html content.
    """
    if valid_tags is None:
        valid_tags = getattr(settings, "LIFESTREAM_VALID_TAGS", VALID_TAGS)
    if valid_styles is None:
        valid_styles = getattr(settings, "LIFESTREAM_VALID_STYLES",
                               VALID_STYLES)

    js_regex = re.compile(r'[\s]*(&#x.{1,7})?'.join(list('javascript')))
    css_regex = re.compile(r' *(%s): *([^;]*);?' % '|'.join(valid_styles),
                           re.IGNORECASE)

    # Sanitize html with BeautifulSoup
    if encoding:
        soup = BeautifulSoup(htmlSource, fromEncoding=encoding)
    else:
        soup = BeautifulSoup(htmlSource)

    # Remove html comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        else:
            tag.attrs = [(attr, js_regex.sub('', val))
                         for attr, val in tag.attrs
                         if attr in valid_tags[tag.name]]

    # Strip disallowed css tags.
    for tag in soup.findAll(attrs={"style": re.compile(".*")}):
        style = ""
        for key, val in css_regex.findall(tag["style"]):
            style += "%s:%s;" % (key, val.strip())
        tag["style"] = style

    # Sanitize html text by changing bad text to entities.
    # BeautifulSoup will do this for href and src attributes
    # on anchors and image tags but not for text.
    for text in soup.findAll(text=True):
        text.replaceWith(escape_entities(text))

    # Strip disallowed tags and attributes.
    return soup.renderContents().decode('utf8')
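The css_regex above keeps only whitelisted properties; a standalone sketch of what it extracts:

import re

valid_styles = ['color', 'font-size']
css_regex = re.compile(r' *(%s): *([^;]*);?' % '|'.join(valid_styles), re.IGNORECASE)
print css_regex.findall('color: red; position: absolute; font-size: 12px')
# roughly: [('color', 'red'), ('font-size', '12px')]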
Example #55
def markdownify(rendered_template):
    '''
    parse a rendered template for tags that have class="md" and replace the
    contents of the tag with the marked-up results.
    '''
    html = BeautifulSoup(rendered_template)
    for md in html.findAll('', 'md'):
        md.contents = BeautifulSoup(markdown(md.renderContents()))
    return html.renderContents()
Example #56
    def preview_html(self):
        html = self._render_output_html()
        html = self._personalize({}, html)
        for placeholder in PLACEHOLDERS:
            html = html.replace('[[' + placeholder + ']]', '')
        soup = BeautifulSoup(html)
        for node in soup.findAll('div', {'class': 'mailonly'}):
            node.extract()
        return soup.renderContents()
Example #57
def _remove_bold_italic(text):
    TAGS_OUT = ['em', 'i', 'u', 'b', 'strong']

    soup = BeautifulSoup(text)

    for tag in soup.findAll(True):
        if tag.name in TAGS_OUT:
            tag.hidden = True

    return soup.renderContents().decode('utf-8')