Example #1
import re

from lxml.html.clean import Cleaner


def clear_html(in_str: str, tag_whitelist: list = None):
    """Strip HTML and JavaScript, keeping only whitelisted tags."""
    if not in_str:
        return in_str

    # Avoid the mutable-default pitfall for the whitelist argument.
    tag_whitelist = tag_whitelist or []
    cleaner = Cleaner(allow_tags=tag_whitelist, remove_unknown_tags=False)
    cleaner.javascript = True
    cleaner.html = True
    out_str = cleaner.clean_html(in_str)

    # The cleaned fragment may come back wrapped in <p>...</p>; strip that wrapper.
    return re.sub(r'</p>$', '', re.sub(r'^<p>', '', out_str))
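
A minimal usage sketch for clear_html() above, assuming the same imports; the sample markup is made up and the exact wrapper tags in the output depend on lxml's fragment handling:

dirty = '<p>Hello <script>alert("x")</script><b>world</b></p>'

print(clear_html(dirty))                       # <script> stripped, other tags kept
print(clear_html(dirty, tag_whitelist=['p']))  # only <p> survives as a tag; other markup is dropped but its text kept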
Example #2
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # lxml raises ValueError for unicode strings with an encoding
        # declaration: use bytes input or strip the declaration first.
        content = u""
    return content
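
A quick check of both paths in cleanpage() above might look like the following; the sample strings are made up, and cleanpage() plus the Cleaner import are assumed to be in scope:

plain = "<html><body><p>Hello</p><script>alert('x')</script></body></html>"
declared = "<?xml version='1.0' encoding='utf-8'?><html><body><p>Hi</p></body></html>"

print(cleanpage(plain))           # <script> removed via kill_tags, the paragraph text survives
print(repr(cleanpage(declared)))  # the encoding declaration trips the ValueError branch, so u"" comes back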
Example #3

    def get_clean_html(self, html_text, text_only=True):
        try:
            etree = lxml.html.document_fromstring(html_text)

            self._is_etree(etree)
            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(etree)
            if text_only:
                return ' '.join(html.text_content().split())
                # return html.text_content()

            res = lxml.html.tostring(html)
        except Exception as e:
            logger.error(f"While parsing email in get_clean_html {e}")
            res = "junk"

        return res
Example #4

def url2count(title):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True

    r = requests.get(makeurl(title), timeout=5)  # r.text
    lxclean = cleaner.clean_html(r.text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' '))
    text = nltk.clean_html(lxclean)  # nltk.clean_html() only works on NLTK versions before 3.0
    collapsewhitespace = re.sub(r'\s{2,}', ' ', text)
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    article_list = [w for w in collapsewhitespace.split(' ') if nonPunct.match(w)]
    article_length = len(article_list)
    return article_length
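
nltk.clean_html() no longer works in NLTK 3.0+, so on a current stack the same word count can be sketched with lxml alone; the function name below is hypothetical and it takes a full URL rather than going through makeurl():

import re

import requests
from lxml.html import document_fromstring
from lxml.html.clean import Cleaner


def url2count_lxml(url):
    """Rough word count of a page, using text_content() instead of nltk.clean_html()."""
    cleaner = Cleaner()
    cleaner.javascript = True  # drop <script> blocks
    cleaner.style = True       # drop inline styles and stylesheets

    r = requests.get(url, timeout=5)
    tree = cleaner.clean_html(document_fromstring(r.text))
    text = tree.text_content()

    non_punct = re.compile(r'.*[A-Za-z0-9].*')
    words = [w for w in text.split() if non_punct.match(w)]
    return len(words)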
Example #5
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()

    return lxml.html.tostring(html)
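
A self-contained sketch of driving the get_clean_html() pattern above end to end, with made-up markup and without the _is_etree() validation helper from the source project:

import lxml.html
from lxml.html.clean import Cleaner

sample = ("<html><head><style>p {color: red}</style></head>"
          "<body><p>Hello <script>alert('x')</script>world</p></body></html>")

tree = lxml.html.document_fromstring(sample)

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.page_structure = False
cleaner.meta = False
cleaner.safe_attrs_only = False
cleaner.links = False

cleaned = cleaner.clean_html(tree)
print(cleaned.text_content())       # the text_only=True path: plain text
print(lxml.html.tostring(cleaned))  # the text_only=False path: cleaned markup as bytes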
Example #6
def get_clean_text(filename):
    utf8_parser = html.HTMLParser(encoding='utf-8')
    htmltxt = lxml.html.parse(filename, parser=utf8_parser)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    htmltxt = cleaner.clean_html(htmltxt)

    txt = etree.tostring(htmltxt, encoding='unicode')
    txtresub = re.sub(r'<.+?>', ' ', txt)
    txtresub = re.sub(r'(\s|&?(amp;|apos;|quot;|gt;|lt;|nbsp;))+', ' ',
                      txtresub)

    return txtresub
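
One way to exercise get_clean_text() above is against a throwaway file; the path and markup below are purely illustrative, and the function plus its lxml imports are assumed to be in scope:

import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.html', delete=False, encoding='utf-8') as fh:
    fh.write('<html><body><p>Tea &amp; biscuits <script>alert(1)</script>menu</p></body></html>')
    path = fh.name

print(get_clean_text(path))  # markup stripped, scripts removed, whitespace and entities collapsed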
Example #7
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                print("Detected new tasks ".format(len(changes)))
                print("=== changes ===")
                pprint(changes)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            #####
            # raw, html, text
            #####################
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join([title for title in tree.xpath("//title/text()")])

            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            lxml.html.tostring(html)
            description = " ".join(tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )

            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since
Example #8
try:
    readline.read_history_file(histfile)
except IOError:
    pass

try:
    from lxml.html.clean import Cleaner
    import lxml
    from lxml.html import document_fromstring
    import requests
    resp = requests.get('http://en.wikipedia.org/')
    tree = document_fromstring(resp.text)
    raw = resp.text
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(tree)
    text_content = html.text_content()
except ImportError:
    pass

atexit.register(readline.write_history_file, histfile)
del os, histfile