def html_sanitize(src):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
    try:
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
        cleaned = cleaner.clean_html(src)
    except TypeError:
        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
        # to remove in 2014
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, remove_tags=tags_to_kill + tags_to_remove)
        cleaned = cleaner.clean_html(src)
    except Exception, e:
        if isinstance(e, etree.ParserError) and 'empty' in str(e):
            return ""
        _logger.warning('html_sanitize failed to parse %s' % (src))
        cleaned = '<p>Impossible to parse</p>'
    return cleaned
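# A hedged sketch of the module context that html_sanitize above assumes;
# every value here is an illustrative guess (the real project defines its own
# ustr helper and tag lists), shown only so the function can run stand-alone
# (Python 2, to match the snippet).
import cgi
import logging
import re

from lxml import etree
from lxml.html import clean

_logger = logging.getLogger(__name__)
tags_to_kill = ['script', 'head', 'meta', 'title', 'link', 'base']  # assumed
tags_to_remove = ['html', 'body', 'font']                           # assumed

def ustr(value, errors='strict'):
    # naive stand-in for the project's unicode-coercion helper
    if isinstance(value, unicode):
        return value
    return str(value).decode('utf-8', errors)

print html_sanitize('<div>hi <script>alert(1)</script></div>')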
def parse(self, response):
    title = response.xpath('//head/title/text()').extract_first()
    title = self.pattern.sub(" ", title).strip() if title is not None else ""
    keywords = response.xpath('//head/meta[@name="keywords"]/@content').extract_first()
    keywords = self.pattern.sub(" ", keywords).strip() if keywords is not None else ""
    description = response.xpath('//head/meta[@name="description"]/@content').extract_first()
    description = self.pattern.sub(" ", description).strip() if description is not None else ""
    cleaner = clean.Cleaner(style=True, scripts=True, page_structure=False,
                            safe_attrs_only=False)
    html1 = cleaner.clean_html(response.text)
    # print(response.url)
    doc = html.fromstring(html1)
    content = self.pattern.sub(" ", doc.xpath('//body')[0].text_content())
    docset = dict()
    docset['title'] = title
    docset['keyword'] = keywords
    docset['description'] = description
    docset['content'] = content
    item = JsonSpiderItem()
    one = response.meta['one']
    item['name'] = one['name']
    item['nid'] = one['nid']
    item['keyword'] = one['keyword']
    item['kid'] = one['kid']
    item['requrl'] = one['requrl']
    item['rid'] = one['rid']
    item['title'] = one['title']
    item['baiduurl'] = one['baiduurl']
    item['realurl'] = one['realurl']
    item['abstract'] = one['abstract']
    item['doc'] = docset
    yield item
def pull_html_from_email_content(email_content):
    if email_content is None:
        return None
    idx = email_content.find('<html')
    if idx == -1:
        return None
    end_idx = email_content.find('</html>', idx + 5)
    if end_idx == -1:
        return None
    html_content = email_content[idx:end_idx + 7]
    # remove line breaks
    strings_to_remove = ['=\r\n', '<o:p>', '</o:p>']
    for to_remove in strings_to_remove:
        html_content = html_content.replace(to_remove, '')
    # remove <p>, <b>, <img> and <span> tags
    html_content = removetags(html_content, 'p b img span')
    # remove extra attributes, restoring the module-wide default afterwards
    # (older lxml read clean.defs.safe_attrs at clean time; on lxml >= 3.1 pass
    # safe_attrs=frozenset() to the Cleaner instead)
    safe_attrs = clean.defs.safe_attrs
    clean.defs.safe_attrs = frozenset()
    cleaner = clean.Cleaner(safe_attrs_only=True)
    html_content = cleaner.clean_html(html_content)
    clean.defs.safe_attrs = safe_attrs
    html_content = html_content.encode('ascii', 'ignore')
    return html_content
def preprocess_html(context):
    if isinstance(context, dict):
        for key, value in context.items():
            context[key] = preprocess_html(value)
        return context
    elif isinstance(context, list):
        return [preprocess_html(v) for v in context]
    elif isinstance(context, tuple):
        return tuple(preprocess_html(v) for v in context)
    elif isinstance(context, (str, unicode)):
        # clean html first
        cleaner = clean.Cleaner()
        cleaner.safe_attrs_only = True
        cleaner.safe_attrs = ('style', 'class')
        cleaner.allow_tags = ('p', 'a', 'br', 'span', 'strong', 'h1', 'h2',
                              'h3', 'h4', 'i', 'ul', 'li', 'pre')
        cleaner.remove_unknown_tags = False
        h = cleaner.clean_html(context)
        h = html.fromstring(h)
        # transform to docx code
        if h.find('p') is not None or h.find('span') is not None or \
                h.find('strong') is not None or h.find('a') is not None:
            value = transform_html(h, True)
        else:
            # remove enclosing tag
            roottag = h.tag
            value = etree.tostring(h)
            value = value[len(roottag) + 2:-(len(roottag) + 3)]
        return value
    else:
        return context
def remove_html_attributes(string):
    html = lxml.html.fromstring(string)
    # strip every attribute, then drop numeric character references
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=frozenset())
    cleansed = cleaner.clean_html(html)
    string = lxml.html.tostring(cleansed)
    return re.sub(r'&#\d+;', '', string)
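# Hedged usage sketch for remove_html_attributes (Python 2 era code: on
# Python 3, lxml.html.tostring returns bytes, which would need decoding before
# the re.sub). The sample markup is an invented assumption.
import re
import lxml.html
from lxml.html import clean

print remove_html_attributes('<p class="intro"><a href="/about" title="t">hi</a></p>')
# -> roughly '<p><a>hi</a></p>'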
def clean_descriptions(selectors):
    selectors = selectors or []
    safe_attrs = set(['src', 'alt', 'href', 'title', 'width', 'height'])
    kill_tags = ['object', 'iframe']
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=safe_attrs,
                            kill_tags=kill_tags)
    return [cleaner.clean_html(selector.extract()).strip()
            for selector in selectors]
def get_forwardlink_snapshots(parent_site):
    """
    @type parent_site: string
    @param parent_site: the archived index page from which to extract forward links
    """
    try:
        parsed_parent_site = html.parse(ARCHIVE_DOMAIN + parent_site)
    except IOError:
        print "Could not extract links from ", ARCHIVE_DOMAIN + parent_site
        return []
    #cleaner = html.clean.Cleaner(scripts=True, javascript=True, style=True, kill_tags=["img"])
    cleaner = clean.Cleaner(scripts=True, javascript=True, comments=True,
                            style=True, meta=True, processing_instructions=True,
                            embedded=True, frames=True, forms=True,
                            kill_tags=["noscript", "iframe", "img"])
    parsed_parent_site = cleaner.clean_html(parsed_parent_site)
    # check to see if the archival year of a forward link
    # is that of the parent (ie. 2000|2005|2010)
    all_forwardlinks = parsed_parent_site.xpath(
        '//a[starts-with(@href,"' + parent_site[:9] + '")]/@href')
    return all_forwardlinks
def sanitize_html(html):
    """Sanitize HTML

    :param str html: unsafe HTML markup
    :return str: sanitized HTML
    """
    if not html:
        return ""
    blacklist = ["script", "style", "head"]
    root_elem = lxml_html.fromstring(html)
    cleaner = clean.Cleaner(
        add_nofollow=False,
        kill_tags=blacklist
    )
    cleaned_xhtml = cleaner.clean_html(root_elem)
    safe_html = etree.tostring(cleaned_xhtml, encoding="unicode")
    # the following check is legacy (pre-lxml)
    if safe_html == ", -":
        return ""
    return safe_html
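# Hedged usage sketch for sanitize_html; the import aliases mirror what the
# function above appears to assume, and the sample markup is invented.
from lxml import etree
from lxml import html as lxml_html
from lxml.html import clean

print(sanitize_html('<div><script>alert(1)</script><p>ok</p></div>'))
# the <script> element and its text are killed, leaving '<div><p>ok</p></div>'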
def cleaner(file_name):
    content = open('E:/ZBSource/zhaobiao16/%s' % file_name).read()
    article_id = re.search(r'(\d+)\.html', file_name).group(1)
    html_cleaner = clean.Cleaner(style=True, scripts=True, comments=True,
                                 javascript=True, page_structure=False,
                                 safe_attrs_only=False)
    content = html_cleaner.clean_html(content.decode('utf-8')).encode('utf-8')
    # put block-level tags on their own lines so the stripped text keeps breaks
    content = content.replace('<p>', '\n<p>')
    content = content.replace('<tr>', '\n<tr>')
    content = content.replace('<div>', '\n<div>')
    content = content.replace('<span>', '\n<span>')
    content = content.replace('<td>', '<td> ')
    # drop all remaining tags
    reg = re.compile("<[^>]*>")
    content = reg.sub('', content)
    f = open('E:/ZBSource/zhaobiao16/clean/%s.txt' % article_id, 'wb+')
    lines = content.split('\n')
    for line in lines:
        line = line.strip()
        if len(line) != 0:
            f.write(line + '\n')
    f.close()
def parse_body(self, response):
    item = response.meta['item']
    body = response.xpath(".//*[@class='viewpointWrap']")
    tags = body.xpath(
        ".//*[@id='post_industry']/a[@class='post_industry_item']/text()"
    ).extract()  # tags
    review = body.xpath(".//div[@id='post_brief']/text()").extract()
    if len(review) > 0:
        item['review'] = review[0]
    thumb_url = body.xpath(
        ".//div[@id='post_thumbnail']/img/@src").extract()
    if len(thumb_url) > 0:
        item['thumb_url'] = thumb_url[0]
    if len(tags) > 0:
        item['tags'] = tags
    origin_html = body.xpath(".//div[@id='post_description']/p").extract()
    content = list()
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=self.__safe_attrs,
                            kill_tags=self.__kill_tags)
    for html_string in origin_html:
        cleaned_html = cleaner.clean_html(html_string)
        content.append(cleaned_html)
    item['content'] = content
    yield item
def pull_html_from_email_file(fname):
    content = open(fname, 'r').read()
    idx = content.find('<html')
    if idx == -1:
        return None
    end_idx = content.find('</html>', idx + 5)
    if end_idx == -1:
        return None
    html_content = content[idx:end_idx + 7]
    # remove line breaks
    strings_to_remove = ['=\r\n', '<o:p>', '</o:p>']
    for to_remove in strings_to_remove:
        html_content = html_content.replace(to_remove, '')
    # remove <p>, <b>, <img> and <span> tags
    html_content = removetags(html_content, 'p b img span')
    # remove extra attributes, restoring the module-wide default afterwards
    safe_attrs = clean.defs.safe_attrs
    clean.defs.safe_attrs = frozenset()
    cleaner = clean.Cleaner(safe_attrs_only=True)
    html_content = cleaner.clean_html(html_content)
    clean.defs.safe_attrs = safe_attrs
    return html_content
def __init__(self):
    self.headers = {
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN;q=1.0,*;q=0.5',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }
    self.configs = {
        'embedded': False,
        'safe_attrs_only': True,
        'safe_attrs': ['src', 'href', 'height', 'width', 'alt'],
        'remove_tags': ['span'],
    }
    # match emoji and regional-indicator code points
    self.pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'
        u'\U0001F300-\U0001F5FF'
        u'\U0001F680-\U0001F6FF'
        u'\U0001F1E0-\U0001F1FF'
        ']+',
        flags=re.UNICODE)
    self.hash = lambda x: format(zlib.adler32(x.encode('utf-8'), 0x00), 'x')
    self.session = requests.Session()
    self.cleaner = clean.Cleaner(**self.configs)
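# Hedged stand-alone sketch of the Cleaner configured above; the sample markup
# is an invented assumption.
from lxml.html import clean

configs = {
    'embedded': False,
    'safe_attrs_only': True,
    'safe_attrs': ['src', 'href', 'height', 'width', 'alt'],
    'remove_tags': ['span'],
}
cleaner = clean.Cleaner(**configs)
# <span> is unwrapped (tag dropped, text kept) and non-whitelisted attributes removed
print(cleaner.clean_html('<div><span data-x="1">text</span></div>'))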
def html_sanitize(src, silent=True, strict=False):
    if not src:
        return src
    src = ustr(src, errors='replace')
    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,              # do not remove style attributes
        'forms': True,               # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # safe_attrs can only be specified since lxml 3.1.0; older versions
            # keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        kwargs['frames'] = False  # do not remove frames (embedded video in CMS blogs)

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
    except etree.ParserError, e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    return cleaned
def remove_attributes_from_tags(text):
    """Removes attributes from e.g. MathML tags"""
    if text:
        try:
            cleaner = clean.Cleaner(safe_attrs_only=True,
                                    remove_unknown_tags=False)
            text = cleaner.clean_html(text)
        except lxml.etree.ParserError:
            return text
    return text
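# Hedged usage sketch for remove_attributes_from_tags; the imports are what the
# function above appears to require, and the MathML sample is invented.
import lxml.etree
from lxml.html import clean

print(remove_attributes_from_tags(
    '<math xmlns="http://www.w3.org/1998/Math/MathML"><mi mathvariant="bold">x</mi></math>'))
# unknown tags like <math>/<mi> survive (remove_unknown_tags=False) but lose
# attributes that are not in lxml's safe_attrs set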
def html_sanitize(src):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html encode email tags
    part = re.compile(r"(<[^<>]+@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
    try:
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
        cleaned = cleaner.clean_html(src)
    except TypeError:
        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
        # to remove in 2014
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, remove_tags=tags_to_kill + tags_to_remove)
        cleaned = cleaner.clean_html(src)
    return cleaned
def _org_ruby_convert(self):
    _, html = utils.shell_command('org-ruby {} --translate html'.format(self.path))
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=frozenset())
    html = cleaner.clean_html(html)
    # Fix horizontal rules: org-ruby converts dash-lines to '<hr>', which is
    # invalid ENML, so convert them to '<hr/>'
    html = html.replace('<hr>', '<hr/>')
    return html
def clean_highlighted_code(html):
    """strip html from syntax-highlighted code (pre and code tags)"""
    cleaner = clean.Cleaner(allow_tags=['pre'], remove_unknown_tags=False)
    for el in html.findall('.//pre'):
        p = el.getparent()
        cleaned = cleaner.clean_html(el)
        p.replace(el, cleaned)
def get_description(self):
    request_url = self.base_url + '/description'
    r = requests.get(request_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    description = soup.find('meta', {'name': 'description'})['content']
    doc = html.fromstring(description)
    cleaner = clean.Cleaner(style=True)
    doc = cleaner.clean_html(doc)
    return doc.text_content()
def clean_html(html):
    html = lxml.html.fromstring(html)
    #clean.defs.safe_attrs=frozenset(['href','src'])
    cleaner = clean.Cleaner(allow_tags=[
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'pre',
        'code', 'img', 'a', 'br'
    ], safe_attrs_only=True, remove_unknown_tags=False)
    cleaned = cleaner.clean_html(html)
    return lxml.html.tostring(cleaned)
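# Hedged usage sketch for clean_html above; the markup is invented, and on
# Python 3 lxml.html.tostring returns bytes, so the caller may need .decode().
import lxml.html
from lxml.html import clean

print(clean_html('<div><h1 id="t">Title</h1><script>x()</script>'
                 '<table><tr><td>cell</td></tr></table></div>'))
# <script> is killed outright; tags outside allow_tags (table/tr/td here) are
# stripped while their text content is kept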
def extract_upinfo(self, sel):
    upinfo_xpath = '//div[@class="upinfo"]'
    cleaner = clean.Cleaner(
        scripts=True,
        javascript=True,
        safe_attrs_only=True,
        safe_attrs=["class", "href", "src", "card", "mid", "title"],
        links=False)
    content = ''.join(sel.xpath(upinfo_xpath).extract())
    clean_data = cleaner.clean_html(lxml.html.fromstring(content))
    data = lxml.html.tostring(clean_data, encoding="utf8")
    return self.remove_space(data)
def __init__(self, html_file, removal_tag_list=[]):
    '''
    Initialize the Tag statistics
    '''
    # clean the html
    self.fptr = open(html_file, 'r')
    self.html = self.fptr.read()
    cleaner = clean.Cleaner()
    cleaner.remove_tags = removal_tag_list
    cleaned_html = cleaner.clean_html(self.html)
    self.html = cleaned_html
    wfptr = open('output.html', 'w')
    wfptr.writelines(self.html)
    self._create_final_html()
def _clean_js_and_styles(html):
    cleaner = clean.Cleaner(javascript=True, style=True)
    try:
        html = str(
            tostring(cleaner.clean_html(fromstring(html))),
            encoding='utf8'
        )
    except Exception:
        html = 'Failed to clean js and styles.'
    return html
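# The function above uses bare fromstring/tostring names; a hedged guess at the
# matching imports, plus an invented sample call.
from lxml.html import clean, fromstring, tostring

print(_clean_js_and_styles('<div onmouseover="evil()"><style>p {}</style><p>hi</p></div>'))
# -> '<div><p>hi</p></div>' (event-handler attribute and <style> block removed)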
def prepare(htmlstring, source_url=None, safe_attrs=['src', 'href']):
    """
    Cleanse an htmlstring of its attributes, absolutify images and
    links, ascii-dammify it, and clean whitespace.
    """
    if not htmlstring:
        return None
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=set(safe_attrs))
    cleansed = cleaner.clean_html(htmlstring)
    soup = make_abs(cleansed, source_url)
    cleansed = get_inner(soup)
    return text.prepare(cleansed)
def remove_tags(html):
    try:
        kill_tags = ['a', 'img', 'strong', 'em']
        allow_tags = ['p', 'span', 'br']
        # note: Cleaner's whitelist_tags controls which embedded tags (iframe,
        # embed) are allowed for whitelisted hosts; it is not a tag allow-list
        cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=frozenset(),
                                whitelist_tags=set(allow_tags),
                                remove_tags=kill_tags)
        results = cleaner.clean_html(html)
        return results
    except Exception as e:
        print e
def parse(self, params):
    item = {
        'parser': 'CSS',
        'title': '',
        'pdate': '',
        'content': '',
        'showcontent': ''
    }
    parserTable = self._parserTable
    html = params['html']
    url = params['url']
    try:
        url_ext = tldextract.extract(url).domain
        parser = getsafedictvalue(parserTable, url_ext + "/parser", None)
        cnname = getsafedictvalue(parserTable, url_ext + "/name", "")
        if parser is None:
            return item
        linkurl = url
        docrsp = doc(html, url)
        pubtimeint = 0
        pubtimetxt = ''
        for CSS in parser:
            contraw = docrsp(CSS["content"]).remove("a").remove("script").remove("style")
            if contraw is None:
                continue
            item['content'] = contraw.text()
            item['title'] = docrsp(CSS["title"]).text()
            pubtimetxt = docrsp(CSS["date"]).text()
            pubtimeint = parsedate(pubtimetxt)
            if (len(item['content']) > 0) and (len(item['title']) > 0) and (pubtimeint > 0):
                break
        if contraw:
            cleaner = clean.Cleaner(page_structure=True)
            showcont = cleaner.clean_html(
                contraw.remove_attr('id').remove_attr('class')
                       .wrapAll('<div></div>').html())
            showcont = re.sub(r'id=".*?"|class=".*?"', '', showcont)
            showcont = re.sub(r'[\s+]*?>', '>', showcont)
            showcont = showcont.replace("\n", "").replace("\t", "")\
                               .replace("<div>", "").replace("</div>", "")
            item['showcontent'] = showcont
        if pubtimeint > 0:
            item['pdate'] = ValidateTime(pubtimeint)
    except Exception:
        pass
    return item
def __init__(self, url):
    # Fetch the page
    self.pagina = html.fromstring(urllib.urlopen(url).read())
    # Fix links
    self.pagina.make_links_absolute(base_url=url, resolve_base_href=True)
    # Collect the links into a list
    self.links = []
    for link in self.pagina.xpath("//a"):
        test = link.get("href")
        if test is not None and test[0:4] == 'http':
            self.links.append(link.get("href"))
    # Get clean text (no html, javascript)
    cls = clean.Cleaner(links=False, page_structure=False)
    self.pagina = cls.clean_html(self.pagina)
    self.texto = lxml.html.tostring(self.pagina, encoding='utf-8',
                                    pretty_print=True, method='text').split()
    # Count words
    global total
    self.palabras = {}
    self.tam = 0
    for pala in self.texto:
        pala = re.sub(r'\W+', '', pala)  # drop non-alphanumeric characters
        pala = pala.lower()
        if not pala == '' and pala not in irrelev:
            if pala not in self.palabras:
                self.palabras[pala] = 1.0
            else:
                self.palabras[pala] += 1.0
            self.tam += 1
            total += 1
            if pala not in palabras:
                palabras[pala] = 1.0
            else:
                palabras[pala] += 1.0
    # Ratio of each word to this document's total
    for i in self.palabras:
        self.palabras[i] = self.palabras[i] / self.tam
    # Save results
    global count
    f = open("crawl" + str(count), 'w')
    count += 1
    print >>f, url
    for item in sorted(self.palabras.iteritems(), key=operator.itemgetter(1), reverse=True):
        print >>f, item
    f.close()
def clean_html(value):
    """
    Clean html value and strip potentially dangerous code using
    :class:`lxml.html.clean.Cleaner`
    """
    cleaned = ''
    if value and value.strip():
        cleaner = clean.Cleaner(safe_attrs_only=True,
                                safe_attrs=frozenset(['href']))
        cleaned = cleaner.clean_html(value)
        # Cleaner wraps with p tag, it should be removed
        if cleaned.startswith('<p>') and cleaned.endswith('</p>'):
            cleaned = cleaned[3:-4]
    return cleaned
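# Hedged usage sketch for clean_html above; the sample value is invented.
from lxml.html import clean

print(clean_html('plain text with a <a href="/x" onclick="no()">link</a>'))
# -> roughly 'plain text with a <a href="/x">link</a>' (only href survives, and
#    the <p> wrapper added by the parser is stripped back off)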
def get_info(text: str):
    # 'a' is left out of kill_tags here: killing anchors would empty the
    # //h3[@class="t"]/a queries below
    cleaner = clean.Cleaner(scripts=True, javascript=True, page_structure=False,
                            safe_attrs_only=False, style=True,
                            kill_tags=['span', 'font'])
    text = cleaner.clean_html(text)
    html = etree.HTML(text)
    print(html.xpath('//h3[@class="t"]/a/em')[0].text)
    print(html.xpath('//h3[@class="t"]/a/text()'))
    print(html.xpath('string(//h3[@class="t"]/a)'))
    for i in html.xpath('//h3[@class="t"]/a'):
        print(i.xpath('string()'))
def html_sanitize(src, silent=True):
    if not src:
        return src
    src = ustr(src, errors='replace')
    logger = _logger.getChild('html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,   # do not remove style attributes
        'forms': True,    # remove form tags
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove
    if etree.LXML_VERSION >= (3, 1, 0):
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': clean.defs.safe_attrs | set(['style']),
        })
    else:
        # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
        kwargs['safe_attrs_only'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'
    return cleaned
def download(url):
    browser.get(url)
    content = browser.page_source
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    soup = BeautifulSoup(content, 'html.parser')
    img = soup.find("img", {"id": "landingImage"})['src']
    print(img)
    name = str(uuid.uuid4()) + ".jpg"
    urllib.request.urlretrieve(img, "product_images/" + name)
    return "UploadedContent/" + name