Beispiel #1
0
def pygmentify(html):
    """Replace every ``<pre>`` block in *html* with the output of ``code_markup``.

    For each ``<pre>`` element, the markup of the children of its first child
    (e.g. the body of a nested ``<code>`` element) is joined into one string
    and passed to ``code_markup``; the ``<pre>`` element is then replaced by
    the value ``code_markup`` returns.

    ``<pre>`` elements that are empty, or whose first child is a bare text
    node rather than a tag, do not have that nested structure and are left
    unchanged instead of raising.

    ``BeSoup`` and ``code_markup`` are defined outside this function
    (``BeSoup`` is presumably an alias of BeautifulSoup -- verify at the
    import site).

    params:
        html - HTML source to process
    return the processed document as unicode
    """
    soup = BeSoup(html, convertEntities=BeSoup.HTML_ENTITIES)
    for code in soup.findAll('pre'):
        # <pre></pre> has no children; indexing [0] would raise IndexError.
        if not code.contents:
            continue
        pre = code.contents[0]
        # Text nodes carry no ``contents`` list; only tags can be unwrapped.
        inner = getattr(pre, 'contents', None)
        if inner is None:
            continue
        replace_me = ''.join([k.__unicode__() for k in inner])
        code.replaceWith(code_markup(replace_me))
    return soup.__unicode__()
def getBeautyTitle(body):
    """Derive a plain-text title from *body* (e.g. the text of a tweet).

    Steps, in order:
      1. Strip markup with ``strip_tags``.
      2. Run ``urlize`` over the text and drop every resulting ``<a>``
         element, which removes the links together with their text.
      3. Replace every "." with a space and trim surrounding whitespace.
      4. Drop a single trailing hyphen.
      5. Drop words starting with "#" or "@" and the word "rt"
         (case-insensitive).
      6. Drop a single leading hyphen and trim again.

    ``strip_tags``, ``urlize`` and ``BeautifulSoup`` are defined outside
    this function.

    params:
        body - raw text/HTML to derive the title from
    return the cleaned title, or "Title missing" when nothing is left
    """
    text = strip_tags(body)

    # Wrap bare URLs in <a> elements so each link can be removed as a unit.
    soup = BeautifulSoup(urlize(text))
    for link in soup.findAll("a"):
        link.extract()

    newtext = soup.__unicode__().replace(".", " ").strip()

    # Remove a hyphen left dangling at the very end of the text.
    if newtext.endswith("-"):
        newtext = newtext[:-1]

    # Remove hashtags, mentions and the retweet marker.
    kept = [
        word for word in newtext.split()
        if not word.startswith(("#", "@")) and word.lower() != "rt"
    ]
    finaltext = " ".join(kept)

    # Remove a leading hyphen. Trim afterwards so "- foo" yields "foo" rather
    # than " foo", and a lone "-" falls through to the "missing" fallback
    # instead of producing an empty title.
    if finaltext.startswith("-"):
        finaltext = finaltext[1:].strip()

    if not finaltext:
        return "Title missing"
    return finaltext
Beispiel #3
0
def replace_img_alt(html, alt=""):
    """
    function: set the ``alt`` attribute of every ``<img>`` element in html
    params:
        html - the original HTML; a string (parsed with BeautifulSoup here)
               or an already-parsed object exposing ``findAll`` and
               ``__unicode__``
        alt - the value to write into every ``alt`` attribute
    return the modified HTML (unicode)
    """
    # isinstance rather than an exact type comparison, so that subclasses of
    # str/unicode are parsed too instead of being treated as a parsed tree.
    if isinstance(html, (str, unicode)):
        html = BeautifulSoup(html)
    for im in html.findAll("img"):
        im['alt'] = alt

    return html.__unicode__()
Beispiel #4
0
def article_clear(html):
    """Strip presentational styling and ``class`` attributes from *html*.

    params:
        html - HTML source of the article
    return the cleaned document as unicode
    """
    soup = BeautifulSoup(html)

    # Style declarations to remove: black text colour, font family, line
    # height, keyword font sizes (medium/small/large) and white-space.
    # Raw string: the pattern contains regex escapes (\s, \() that are not
    # valid Python string escapes.
    r_clear_style = re.compile(
        r'(\s?color: rgb\(0, 0, 0\)[^;]*;?|\s?font-family[^;]*;?|\s?line-height[^;]*;?|\s?font-size[^;]*(medium|small|large);?|\s?white-space[^;]*;?)'
    )
    # Only elements whose style attribute matches the pattern are rewritten.
    for s in soup.findAll(attrs={'style': r_clear_style}):
        s['style'] = r_clear_style.sub("", s['style'])

    # Remove every class attribute and trim the text of elements that hold
    # a single string.
    for s in soup.findAll():
        if s.string is not None:
            s.string = s.text.strip()
        del s['class']

    return soup.__unicode__()
Beispiel #5
0
class HtmlParser(object):
    """
    Clean incoming text from html tags and tag attributes.

    Usage as shown by the methods below: construct with the raw text, call
    ``config`` to set the tag/attribute whitelists, call ``clean`` and read
    the result with ``get_text`` (or ``unicode(parser)``).
    """

    def __init__(self, text, **kwargs):
        """
        Parse *text* and start with empty whitelists.

        Non-string input is treated as an empty string. Extra keyword
        arguments are accepted but ignored here; use ``config`` to change
        settings.
        """
        if not isinstance(text, basestring):
            text = u''
        self.text = text.strip()
        self.soup = BeautifulSoup(self.text)
        self.allowed_tags = []
        self.allowed_attrs = []
        self.max_word_length = 20

    def config(self, **kwargs):
        """
        Update settings; recognised keys are ``allowed_tags``,
        ``allowed_attrs`` and ``max_word_length``. Other keys are ignored.

        Returns ``self`` so calls can be chained.
        """
        if 'allowed_tags' in kwargs:
            self.allowed_tags = kwargs['allowed_tags']
        if 'allowed_attrs' in kwargs:
            self.allowed_attrs = kwargs['allowed_attrs']
        if 'max_word_length' in kwargs:
            self.max_word_length = kwargs['max_word_length']
        return self

    def clean_tags(self):
        """
        Remove comments, unwrap tags that are not in ``allowed_tags`` (their
        children are kept), drop attributes that are not in
        ``allowed_attrs`` and store the resulting markup in ``self.text``.
        """
        to_delete = []

        # Iterate over a snapshot: _clean_tag re-parents children into the
        # very list being walked, which would otherwise revisit nodes.
        for tag in list(self.soup.contents):
            self._clean_tag(tag, to_delete)

        for tag in to_delete:
            tag.extract()

        for tag in self.soup.findAll(True):
            if hasattr(tag, 'attrs'):
                tag.attrs = [attr for attr in tag.attrs if attr[0] in self.allowed_attrs]

        self.text = self.soup.__unicode__()

    def _clean_tag(self, tag, to_delete):
        """
        Recursively classify *tag* and its descendants.

        Nodes to be removed are appended to *to_delete*; for a
        'delete_safe' node the children are first moved in front of it in
        its parent so that they survive the removal.
        """
        operation = self._check_tag(tag)
        if operation != 'delete':
            # Snapshot for the same reason as in clean_tags.
            for child in list(tag.contents):
                self._clean_tag(child, to_delete)
            if operation == 'delete_safe':
                # Each child is inserted at the tag's current index, which
                # shifts the tag right by one and so keeps the child order.
                for child in list(tag.contents):
                    tag.parent.insert(self._find_position(tag), child)
        if operation in ('delete', 'delete_safe'):
            to_delete.append(tag)

    def _find_position(self, tag):
        """
        Return the index of *tag* in its parent's contents (0 if absent).
        """
        for index, sibling in enumerate(tag.parent.contents):
            # Identity, not ==: BeautifulSoup 3 compares tags structurally,
            # so an earlier look-alike sibling would match with ==.
            if sibling is tag:
                return index
        return 0

    def _check_tag(self, tag):
        """
        Classify *tag*: 'delete' for comments, 'delete_safe' for tags whose
        name is not in ``allowed_tags``, 'leave' otherwise.

        Nodes without a ``contents`` attribute get an empty list assigned so
        that callers can iterate ``tag.contents`` unconditionally.
        """
        if not hasattr(tag, 'contents'):
            tag.contents = []
        if isinstance(tag, Comment):
            return 'delete'
        if isinstance(tag, Tag) and tag.name not in self.allowed_tags:
            return 'delete_safe'
        return 'leave'

    def clean_trash(self):
        """
        Strip leading line breaks, non-breaking spaces and whitespace from
        ``self.text`` and turn every remaining ``&#160;`` into a space.
        """
        self.text = re.sub(r'^(<br type="_moz" />|&#160;|<br />|&nbsp;|\s)*', '', self.text)
        self.text = re.sub(r'&#160;', ' ', self.text)

    def clean_spaces(self):
        """
        Collapse every run of two or more whitespace characters in
        ``self.text`` into a single space.
        """
        reg = re.compile(r'\s{2,}', re.UNICODE)
        # Replace with one space, not the empty string, so that words
        # separated by several spaces or newlines are not glued together.
        self.text = reg.sub(' ', self.text)

    def clean_max_word_length(self):
        """
        Pass ``self.text`` and ``self.max_word_length`` to ``fill`` and
        store the result (``fill`` is defined outside this class --
        presumably ``textwrap.fill``; verify at the import site).
        """
        self.text = fill(self.text, self.max_word_length)

    def transform_urls(self):
        """
        Finds all URLs like http://domain/adress/?params
        and replaces it with
        <a href="http://domain/adress/?params">http://domain/adress/?params</a>

        Only http:// and https:// URLs are linked, and text that is already
        inside an ``<a>`` element is left alone. Returns the resulting
        markup; ``self.text`` is not updated by this method.
        """
        url_re = re.compile(
            r"(\b(http|https)://([-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]))"
        )

        def replace(text):
            """
            Replace URLS
            """
            return url_re.sub(r'<a rel="nofollow" target="_blank" href="\1">\1</a>', text)

        for tag in self.soup.findAll(text=True):
            if len(tag.findParents(name='a')) == 0:
                tag.replaceWith(NavigableString(replace(unicode(tag))))
        return self.soup.__unicode__()

    # TODO: add a ``cut`` method for html-safe cutting of the text (get a
    # part of the text of a defined length, where the length of html tags is
    # not included).

    def get_text(self):
        """Return the current text."""
        return self.text

    def clean(self):
        """
        Run ``clean_tags`` followed by ``clean_spaces``.

        Returns ``self`` so calls can be chained.
        """
        self.clean_tags()
        self.clean_spaces()
        return self

    def __unicode__(self):
        return unicode(self.text)

    def __str__(self):
        # Python 2: __str__ must return a byte string. Returning the unicode
        # object directly makes str(parser) fail on non-ASCII text.
        return self.__unicode__().encode('utf-8')