def cleanpage(html):
    """Strip active content and boilerplate tags from an HTML page.

    Parameters
    ----------
    html : str or bytes
        Raw page markup.

    Returns
    -------
    str
        Cleaned markup, or ``u""`` when lxml rejects the input
        (e.g. ``ValueError: Unicode strings with encoding declaration
        are not supported``).
    """
    # cleaner setup — keep document structure and attributes intact,
    # drop scripts, styles, frames, embeds, comments and images.
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True       # activate the javascript filter
    cleaner.style = True            # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner; was a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit — narrowed to Exception, keeping the
    # original best-effort "return empty on failure" contract.
    try:
        content = cleaner.clean_html(html)
    except Exception:
        # Typical failure: ValueError for unicode input that still
        # carries an XML encoding declaration.
        content = u""
    return content
def get_text(self, html_content: str):
    """Print the long text runs (>128 chars after strip) found in *html_content*.

    The page is first cleaned of styles, then every text node is taken in
    document order; consecutive non-whitespace nodes are merged into one
    "run", with whitespace-only nodes acting as separators.

    Parameters
    ----------
    html_content : str
        Raw HTML markup.
    """
    cleaner = Cleaner()
    cleaner.style = True
    cleaner.inline_style = True
    cleaned = cleaner.clean_html(html_content)
    soup = BeautifulSoup(cleaned, 'lxml')
    text_lines = soup.findAll(text=True)

    text_lines_merged = []
    merge_str = ''
    text_lines_merged.append(text_lines[0])
    for line in text_lines[1:]:
        if '\n' == line or '' == line or ' ' == line:
            # Whitespace-only node closes the current run.
            # Bug fix: was `merge_str is not ''` — an identity comparison
            # against a literal, which is not a reliable emptiness test.
            if merge_str != '':
                text_lines_merged.append(merge_str)
                merge_str = ''
        else:
            merge_str += (' ' + line)
    # Bug fix: flush the trailing run, which was silently dropped when the
    # document did not end with a whitespace-only node.
    if merge_str != '':
        text_lines_merged.append(merge_str)

    text_lines_merged = [
        self.strip(line) for line in text_lines_merged
        if len(self.strip(line)) > 128
    ]
    print(' '.join(text_lines_merged))
#!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function import re import os import lxml from bs4 import BeautifulSoup from lxml.html.clean import Cleaner from lxml.etree import XMLSyntaxError from store_helper import StoreHelper from text_helper import TextHelper cleaner = Cleaner() cleaner.javascript = True # This is True because we want to activate the javascript filter cleaner.style = True # This is True because we want to activate the styles & stylesheet filter cleaner.inline_style = True cleaner.whitelist_tags = set([]) cleaner.remove_tags = [ 'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'span' ] cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label'] class HTMLHelper(object): @staticmethod def remove_tag(web_source): text = re.sub(r'<[^>]+>', '', web_source) return text @staticmethod
def scrape(lineHashDB, html, encoding):
    """Reduce an HTML page to its main article text.

    Cleans the markup down to <div>/<p> containers, strips attributes and
    short inter-tag fragments, then extracts long (>100 char) text nodes
    via XPath and concatenates them one sentence per line.

    Parameters
    ----------
    lineHashDB : object
        Unused in this function body — kept for interface compatibility.
    html : str or bytes
        Raw page markup.
    encoding : str
        Unused in this function body — kept for interface compatibility.

    Returns
    -------
    str
        Extracted article text (newline-separated), or ``u""`` when the
        cleaner rejects the input.
    """
    # cleaner setup: keep only <div> and <p> containers
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True       # activate the javascript filter
    cleaner.style = True            # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    # invoke cleaner; was a bare `except:` — narrowed to Exception while
    # keeping the original best-effort "return empty on failure" contract.
    try:
        page = cleaner.clean_html(html)
    except Exception:
        # Typical failure: ValueError for unicode input that still
        # carries an XML encoding declaration.
        content = u""
        return content

    page8 = page
    page8 = re.sub(u'\n', ' ', page8)  # remove NL
    # NOTE(review): the pattern below is a literal space in the source —
    # the "remove CR" comment suggests it may once have been '\r'; confirm.
    page8 = re.sub(u' ', ' ', page8)  # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(u'\[an error occurred while processing this directive\]', ' ', page8)
    page8 = re.sub(u'>\s*?<', '><', page8)  # remove blanks between tags

    # cycle to remove spurious divs
    for count in range(1, 20):
        page8 = re.sub(u'>.{0,10}<', '><', page8)  # drop <=10-char runs between tags
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)
    page8 = re.sub(u'\s+', ' ', page8)  # collapse repeated blanks

    # XPath: text nodes longer than 100 chars inside <p>/<div> containers.
    # (A stricter 300-char variant was dead code — assigned and immediately
    # overwritten — and has been removed.)
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'
    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()

    content = u""
    if text:
        for s in text:
            # squash duplicate whitespaces
            # (bug fix: the join result was computed and discarded)
            s = ' '.join(s.split())
            # remove short lines: on empirical analysis, no unfrequent
            # sentence under 40 chars is a relevant part of the article
            # text (titles, authors, dates, etc.)
            if len(s) < 40:
                continue  # bug fix: was a bare `next` expression (a no-op)
            # remove leading whitespace
            if s.startswith(" "):
                s = s[1:]
            content += s
            content += "\n"
    return content