Example 1
from lxml.html.clean import Cleaner

def cleanpage(html):
	# cleaner setup
	cleaner = Cleaner()
	cleaner.page_structure = False
	cleaner.meta = False
	cleaner.safe_attrs_only = False
	cleaner.links = False
	cleaner.javascript = True # strip javascript (script tags, on* attributes)
	cleaner.style = True      # strip styles & stylesheets
	cleaner.frames = True
	cleaner.embedded = True
	cleaner.comments = True
	cleaner.annoying_tags = True
	cleaner.inline_style = True
#	cleaner.remove_tags = ['b','img','h']
	cleaner.kill_tags = ['img','script']

	# invoke cleaner
	try:
		content = cleaner.clean_html(html)
	except ValueError:
		# raised for "Unicode strings with encoding declaration are not
		# supported. Please use bytes input or XML fragments without declaration."
		content = u""
	return content
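A minimal usage sketch for cleanpage() (not from the original; the markup is invented). Cleaner.clean_html() returns the same type it is given, so a str in yields a cleaned str out:

raw = "<html><body><script>alert(1)</script><p style='color:red'>Hello</p></body></html>"
print(cleanpage(raw))
# the <script> element is killed and inline styles are stripped;
# only the harmless paragraph remains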
Example 2
    def clearTag_old(self, text: str) -> str:
        import lxml.html
        from lxml.html.clean import Cleaner

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.links = True
        cleaner.meta = True
        cleaner.forms = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.remove_unknown_tags = True
        cleaner.kill_tags = ["img"]
        cleaner.remove_tags = [
            "strong",
            "div",
            "body",
            "br",
            "a",
            "p",
            "blockquote",
            "h3",
            "ol",
            "li",
            "font",
        ]
        # clean_html() on an Element returns an Element, so serialize it
        # back to markup before decoding
        return lxml.html.tostring(
            cleaner.clean_html(lxml.html.document_fromstring(text))).decode("utf-8")
Example 3
import lxml.html
from lxml.html.clean import Cleaner

def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True # strip javascript (script tags, on* attributes)
    cleaner.style = True      # strip styles & stylesheets
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    cleaner.allow_tags = POSITIVE_K       # the option is 'allow_tags', not 'allow_tag'
    cleaner.remove_unknown_tags = False   # must be False whenever allow_tags is set
    cleaner.safe_attrs_only = True
    try:
        html = lxml.html.fromstring(html, base_url=url)
    
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    
    return doc
Example 4
import lxml.html
from lxml.html.clean import Cleaner

def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # This is True because we want to activate the javascript filter
    cleaner.scripts = scripts  # This is True because we want to activate the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    # encoding="unicode" makes tostring() return str rather than bytes
    return lxml.html.tostring(clean_content, encoding="unicode")
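A brief usage sketch for the wrapper above (not part of the original; the sample markup is invented). All filters default to on, and any can be switched off per call:

sample = '<html><body onload="init()"><p style="color:red">Hello</p></body></html>'
print(clean_html(sample))               # onload and the inline style are both removed
print(clean_html(sample, style=False))  # keeps the style attribute, still drops onload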
Example 5
from lxml.html.clean import Cleaner

def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    # remove_tags unwraps these tags but keeps their text content;
    # kill_tags drops the reject_list elements together with their content
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
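A hypothetical driver for cleaner_parameters() (markup invented) makes the two lists' roles visible:

cleaner = cleaner_parameters()
html = '<div><ul><li>menu</li></ul><p>Keep <b>this</b>.</p></div>'
print(cleaner.clean_html(html))
# prints roughly '<div>Keep this.</div>': the <ul>/<li> block is killed
# outright, while <p> and <b> are unwrapped so only their text survives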
Example 6
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0
        return data
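The pattern used above, clean_html() followed by document_fromstring(...).text_content(), first removes dangerous elements and then flattens whatever markup survives into plain text. A hypothetical check (invented input):

import lxml.html
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True
cleaner.scripts = True
dirty = '<div>widget<script>steal()</script></div>'
print(lxml.html.document_fromstring(cleaner.clean_html(dirty)).text_content())
# -> 'widget': the script is gone and the remaining tags are stripped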
Example 7
from lxml.html.clean import Cleaner

def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta', 'label', 'li', 'ul',
                         'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
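A hypothetical call (markup invented) showing the returned cleaner in action:

html = '<div><nav>menu</nav><p>An <i>inline</i> tag is unwrapped.</p></div>'
print(get_cleaner().clean_html(html))
# <nav> is killed together with its 'menu' text; <p> and <i> are unwrapped,
# leaving only their text (lxml always preserves a root element, so the
# outer <div> stays even though 'div' is listed in remove_tags)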
Example 8
	def clean(self):
		cleaner = Cleaner(page_structure=False)
		cleaner.javascript = True
		cleaner.scripts = True
		cleaner.frames = True
		cleaner.remove_tags = ['p', 'div', 'a']
		# strip markup from every text field, keeping only the text content
		for field in ('name', 'price', 'discountcode', 'categorycode',
		              'orderdate', 'selldate', 'page'):
			cleaned = cleaner.clean_html(getattr(self, field))
			setattr(self, field, lxml.html.document_fromstring(cleaned).text_content())
Example 9
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["username"] = (lxml.html.document_fromstring(cleaner.clean_html(data["username"]))).text_content()
        data["storename"] = (lxml.html.document_fromstring(cleaner.clean_html(data["storename"]))).text_content()
        data["email"] = (lxml.html.document_fromstring(cleaner.clean_html(data["email"]))).text_content()

        return data
Example 10
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]

        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()
        data["price"] = (lxml.html.document_fromstring(cleaner.clean_html(data["price"]))).text_content()
        data["itemid"] = (lxml.html.document_fromstring(cleaner.clean_html(data["itemid"]))).text_content()
        data["discountcode"] = (lxml.html.document_fromstring(cleaner.clean_html(data["discountcode"]))).text_content()
        data["orderdate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["orderdate"]))).text_content()
        data["selldate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["selldate"]))).text_content()
        data["page"] = (lxml.html.document_fromstring(cleaner.clean_html(data["page"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0

        return data
Example 11
	def clean(self):
		cleaner = Cleaner()
		cleaner.javascript = True
		cleaner.scripts = True
		cleaner.frames = True
		self.name = lxml.html.document_fromstring(cleaner.clean_html(self.name)).text_content()
Example 12
import lxml
from lxml import etree
from lxml.html.clean import autolink_html
from lxml.html.clean import Cleaner

#LXML PARSER
cleaner = Cleaner()
cleaner.javascript = True # strip javascript (script tags, on* attributes)
cleaner.style = True      # strip styles & stylesheets
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = True
cleaner.frames = True
#cleaner.safe_attrs_only = True

import re
from urllib.parse import urljoin
notalpha = re.compile('[^a-zA-Z]')

#BS PARSER
from bs4 import BeautifulSoup as bs
from bs4 import Comment

def make_links_absolute(soup, url):
    return [urljoin(url, tag['href']) for tag in soup.findAll('a', href=True)]
                        
def clean_html(soup):
    # drop script/iframe/form/embed/style elements from the tree in place;
    # extract() removes a tag (with its content) from the soup
    for tag_name in ('script', 'iframe', 'form', 'embed', 'style'):
        for tag in soup(tag_name):
            tag.extract()
    return soup
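A quick round trip through the BeautifulSoup-based clean_html() above (input invented):

soup = bs('<body><style>p {color: red}</style><p>text</p></body>', 'html.parser')
print(clean_html(soup).get_text(strip=True))
# -> 'text': the <style> element and its CSS are gone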
Example 13
from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED
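Note that this configuration leaves most filters off: judging by the commented-out remove_tags/kill_tags lines and the MANUALLY_STRIPPED/MANUALLY_CLEANED imports above, the tag lists are applied manually by tree_cleaning() below rather than by the Cleaner itself.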


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
Example 14
    "//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]",
    "//*[contains(@class, 'postmeta') or contains(@class, 'post-meta') or contains(@class, 'entry-meta') or contains(@class, 'postMeta') or contains(@class, 'post_meta') or contains(@class, 'post__meta')]",
    "//*[@class='meta' or @class='meta-before' or @class='asset-meta' or contains(@id, 'article-metadata') or contains(@class, 'article-metadata') or contains(@class, 'byline') or contains(@class, 'subline')]",
    "//*[contains(@class, 'published') or contains(@class, 'posted') or contains(@class, 'submitted') or contains(@class, 'created-post')]",
    "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]",
    "//footer",
    "//*[@class='post-footer' or @class='footer' or @id='footer']",
    "//small",
    "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]",
]

CLEANER = Cleaner()
CLEANER.comments = False
CLEANER.embedded = True
CLEANER.forms = False
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.links = False
CLEANER.meta = False
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.style = True
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
]  # 'embed', 'figure', 'img', 'table'

## REGEX cache
Example 15
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process
from django.http import HttpResponse  # used by index() below

import re
from lxml.html.clean import Cleaner
cleaner = Cleaner()
cleaner.javascript = True  # strip javascript (script tags, on* attributes)
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read().decode('utf-8', errors='replace')  # bytes -> str
    except Exception as e:
        return "Couldn't load url"
    return text


def index(request):
    return HttpResponse("Hello, world. You're at the homepage.")
Example 16
                ]

COMMENTS_DISCARD_XPATH = [
    './/*[(self::div or self::section)][starts-with(@id, "respond")]',
    './/cite',
    './/quote',
    './/*[starts-with(@id, "reply-") or starts-with(@class, "reply-title")]',
    './/*[contains(@id, "akismet") or contains(@class, "akismet")]',
]

# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
    'small', 'sub', 'sup', 'wbr'
]  #  'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',