Example 1
import lxml.html
from lxml.html.clean import Cleaner  # in lxml >= 5.2 this needs the separate lxml_html_clean package

def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # True enables the JavaScript filter
    cleaner.scripts = scripts  # True enables the <script> tag filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
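
A minimal usage sketch (the sample markup is illustrative). Note that lxml.html.tostring() returns bytes unless it is called with encoding='unicode':

cleaned = clean_html('<html><head><script>alert(1)</script></head>'
                     '<body><p>Hello</p></body></html>')
print(cleaned)  # bytes; the <script> element is gone, the page structure is kept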
Example 2
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
Example 3
def html2content(html, allowed_tags=["a", "abbr", "article", "aside",
                                     "b", "base", "blockquote", "body",
                                     "br", "caption", "cite", "code", "col", "colgroup",
                                     "dd", "del", "dfn", "dl", "dt",
                                     "em", "embed", "figcaption", "figure", "footer",
                                     "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
                                     "i", "img",
                                     "li",
                                     "map", "mark", "math", "meta", "meter",
                                     "nav", "noscript",
                                     "object", "ol", "optgroup", "option", "output",
                                     "p", "param", "pre", "progress",
                                     "q", "rp", "rt", "ruby",
                                     "s", "samp", "section", "small", "source", "span", "strong", "sub", "sup", "svg",
                                     "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr", "track",
                                     "u", "ul",
                                     "var", "video",
                                     "wbr"]):
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    cleaner.embedded = False  # fixed typo: 'embeded' would silently create an unused attribute
    return cleaner.clean_html(html)
Example 4
    def clearTag_old(self, text: str) -> str:
        import lxml.html
        from lxml.html.clean import Cleaner

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.links = True
        cleaner.meta = True
        cleaner.forms = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.remove_unknown_tags = True
        cleaner.kill_tags = ["img"]
        cleaner.remove_tags = [
            "strong",
            "div",
            "body",
            "br",
            "a",
            "p",
            "blockquote",
            "h3",
            "ol",
            "li",
            "font",
        ]
        # clean_html() returns an element, not bytes; serialize it before decoding
        return lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.document_fromstring(text))).decode("utf-8")
Example 5
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]  # note: the original was missing the comma after 'h6', silently concatenating it into 'h6span'
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list  # unwrapped: the tag is dropped, its content is kept
    html_cleaner.kill_tags = reject_list  # removed together with their content
    return html_cleaner
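
The remove_tags / kill_tags distinction above matters: remove_tags unwraps an element and keeps its text, while kill_tags discards the element and everything inside it. A minimal sketch (the sample input is illustrative):

from lxml.html.clean import Cleaner

c = Cleaner()
c.remove_tags = ['b']     # unwrap: the markup goes, the text stays
c.kill_tags = ['span']    # drop the element and its content
print(c.clean_html('<p><b>kept text</b> <span>dropped entirely</span></p>'))
# roughly '<p>kept text </p>'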
Example 6
def html_strict_cleaning(html, allow_tags=['p', 'br', 'a', 'img', 'div']):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.allow_tags = allow_tags
    cleaner.remove_unknown_tags = False
    return lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(html)),
                              encoding='unicode')
Example 7
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b',
                                  'em', 'i', 'tt', 'code', 'pre', 'blockquote',
                                  'img', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
Example 8
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'ul', 'ol', 'li',
                                  'em', 'i', 'code', 'pre', 'blockquote', 'h1',
                                  'h2', 'h3', 'h4', 'h5', 'h6']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
Example 9
def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta', 'label', 'li', 'ul',
                         'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
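
get_cleaner() hands back a configured instance that can be reused across documents. A Cleaner is also callable: calling it on a parsed tree cleans the tree in place instead of returning a new string (a sketch; the sample markup is illustrative):

import lxml.html

cleaner = get_cleaner()
tree = lxml.html.fromstring('<html><body><p>kept text</p>'
                            '<table><tr><td>gone</td></tr></table></body></html>')
cleaner(tree)  # in-place: tags in remove_tags are unwrapped, kill_tags subtrees removed
print(lxml.html.tostring(tree, encoding='unicode'))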
Example 10
    def get_locations(self) -> list:
        locations = []
        page = requests.get(self.link)
        tree = html.fromstring(page.text)
        tag = tree.xpath('//ul[@class="dropdown-menu"]')[7]
        c = Cleaner()
        c.allow_tags = ['a']  # a list; the original bare string only worked because the tag is a single letter
        c.remove_unknown_tags = False
        doc = etree.tostring(tag)
        for s in c.clean_html(doc).decode("utf-8").replace(
                "<div>", "").replace("</div>", "").strip().split('\n'):
            locations.append(
                Location(
                    regexTagContent.findall(s)[0].replace("<",
                                                          "").replace(">", ""),
                    regexHref.findall(s)[0]))

        return locations
Example 11
    def get_menu(self, location: str) -> Menu:
        food = []
        c = Cleaner()
        c.allow_tags = ['img']
        c.remove_unknown_tags = False
        page = requests.get(self.link + location)
        tree = html.fromstring(page.text)
        f = tree.xpath(
            '//div[@style="background-color:#ecf0f1;border-radius: 4px 4px 0px 0px; padding: 8px;"]'
        )
        doc = etree.tostring(f[0], pretty_print=True)
        t = c.clean_html(doc).decode("utf-8").replace("<div>", "").replace(
            "</div>", "")
        # reuse the cleaned string instead of cleaning the same document twice
        dtime = self.__extract_date(t.strip().split("Essen "))
        food += self.__extract_food(regexFoodBlock.findall(t))
        return Menu(dtime, food)
Example 12
    def clean(self: T) -> str:
        cleaner = Cleaner()
        cleaner.style = self.__style
        cleaner.links = self.__links
        cleaner.page_structure = self.__page_structure
        cleaner.safe_attrs_only = self.__safe_attrs_only

        # allow_tags and remove_unknown_tags can't work together
        if self.__allow_tags is not None:
            cleaner.remove_unknown_tags = False
            cleaner.allow_tags = self.__allow_tags
        if self.__kill_tags is not None:
            cleaner.kill_tags = self.__kill_tags
        if self.__remove_tags is not None:
            cleaner.remove_tags = self.__remove_tags
        if self.__safe_attrs is not None:
            cleaner.safe_attrs = self.__safe_attrs

        self.__input = cleaner.clean_html(self.__input)
        return self.__input
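
The guard above reflects actual lxml behavior: Cleaner raises if allow_tags is set while remove_unknown_tags is still True, since the two options contradict each other. A quick illustration (hedged sketch):

from lxml.html.clean import Cleaner

c = Cleaner()
c.allow_tags = ['p']   # with the default remove_unknown_tags=True
try:
    c.clean_html('<p>hi</p>')
except ValueError as err:
    print(err)  # lxml refuses the allow_tags / remove_unknown_tags combination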
Example 13
def main():
    html = "<div>Bill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ... - Marylou Morano Kjelle - Google BooksA privacy reminder from Google<a class=\"gb_od gb_7c\" tabindex=\"0\">Review now</a><a class=\"gb_od gb_nd\" tabindex=\"0\">I'll read this later</a><a class=\"gb_b gb_5b\" href=\"https://www.google.co.uk/intl/en/options/\" title=\"Google apps\" tabindex=\"0\"></a><a class=\"gb_O\" href=\"https://myaccount.google.com/?utm_source=OGB&amp;utm_medium=app\" id=\"gb192\">My Account</a><a class=\"gb_O\" href=\"https://www.google.co.uk/webhp?tab=pw&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCAgoAQ\" id=\"gb1\">Search</a><a class=\"gb_O\" href=\"https://maps.google.co.uk/maps?hl=en&amp;tab=pl\" id=\"gb8\">Maps</a><a class=\"gb_O\" href=\"https://www.youtube.com/?gl=GB\" id=\"gb36\">YouTube</a><a class=\"gb_O\" href=\"https://play.google.com/?hl=en&amp;tab=p8\" id=\"gb78\">Play</a><a class=\"gb_O\" href=\"https://news.google.co.uk/nwshp?hl=en&amp;tab=pn&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCAwoBQ\" id=\"gb5\">News</a><a class=\"gb_O\" href=\"https://mail.google.com/mail/?tab=pm\" id=\"gb23\">Gmail</a><a class=\"gb_O\" href=\"https://drive.google.com/?tab=po\" id=\"gb49\">Drive</a><a class=\"gb_O\" href=\"https://www.google.com/calendar?tab=pc\" id=\"gb24\">Calendar</a><a class=\"gb_O\" href=\"https://plus.google.com/?gpsrc=ogpy0&amp;tab=pX\" id=\"gb119\">Google+</a><a class=\"gb_O\" href=\"https://translate.google.co.uk/?hl=en&amp;tab=pT\" id=\"gb51\">Translate</a><a class=\"gb_O\" href=\"https://photos.google.com/?tab=pq&amp;pageId=none\" id=\"gb31\">Photos</a><a class=\"gb_ka gb_xf\" href=\"https://www.google.co.uk/intl/en/options/\">More</a><a class=\"gb_O\" href=\"http://www.google.co.uk/shopping?hl=en&amp;tab=pf&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCBMoDA\" id=\"gb6\">Shopping</a><a class=\"gb_O\" href=\"https://www.google.co.uk/finance?tab=pe\" id=\"gb27\">Finance</a><a class=\"gb_O\" href=\"https://docs.google.com/document/?usp=docs_alc\" id=\"gb25\">Docs</a><a class=\"gb_O\" href=\"https://books.google.co.uk/bkshp?hl=en&amp;tab=pp&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCBYoDw\" id=\"gb10\">Books</a><a class=\"gb_O\" href=\"https://www.blogger.com/?tab=pj\" id=\"gb30\">Blogger</a><a class=\"gb_O\" href=\"https://www.google.com/contacts/?hl=en&amp;tab=pC\" id=\"gb53\">Contacts</a><a class=\"gb_O\" href=\"https://hangouts.google.com/\" id=\"gb300\">Hangouts</a><a class=\"gb_O\" href=\"https://keep.google.com/\" id=\"gb136\">Keep</a><a class=\"gb_la gb_sf\" href=\"https://www.google.co.uk/intl/en/options/\">Even more from Google</a><a class=\"gb_Bf gb_Ha gb_xb\" id=\"gb_70\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX&amp;hl=en\" target=\"_top\">Sign in</a><a class=\"gb_5d gb_3b\" href=\"https://books.google.co.uk/bkshp?hl=en&amp;tab=pp\" title=\"Books\"></a>Hidden fields<a class=\"consentBumpSlowLink\" href=\"//consent.google.com/?hl=en&amp;continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX%26ved%3D0ahUKEwi3oKipsPzUAhWcF8AKHSXwAc8Q6AEIkgEwFQ&amp;pc=ogb&amp;wp=71&amp;l=1&amp;if=1&amp;fld=2&amp;origin=https://books.google.co.uk\" target=\"_top\">Load basic HTML</a>\u00a0(for slow 
connections)<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA2&amp;lpg=PA2&amp;dq=billgatesmicrosoft&amp;source=bl&amp;ots=kOoeyqnrmG&amp;sig=O8vNTHW0AmC039_nJsnKiEucONQ&amp;hl=en&amp;sa=X&amp;output=html_text\" title=\"Screen reader users: click this link for accessible mode. Accessible mode has the same essential features but works better with your reader.\"></a><a href=\"/books\">Books</a><a id=\"appbar-write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&amp;hl=en\"></a><a id=\"appbar-view-print-sample-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;printsec=frontcover&amp;source=gbs_vpt_read\"></a><a id=\"appbar-view-ebook-sample-link\" href=\"https://play.google.com/books/reader?id=af3PBQAAQBAJ&amp;printsec=frontcover&amp;source=gbs_vpt_read\"></a><a id=\"appbar-patents-prior-art-finder-link\" href=\"\"></a><a id=\"appbar-patents-discuss-this-link\" href=\"\"></a><a id=\"appbar-read-patent-link\" href=\"\"></a><a id=\"appbar-download-pdf-link\" href=\"\"></a>books.google.co.uk - Learn who Bill Gates is, how Microsoft got its start, where it\u00e2\u0080\u0099s heading, and much more. Primary sources with accompanying questions, multiple prompts, timeline, index, and glossary also included. Core Library is an imprint of Abdo Publishing Company....https://books.google.co.uk/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJ&amp;utm_source=gb-gplus-shareBill Gates: Microsoft Founder and Philanthropist<a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fop%3Dlibrary&amp;hl=en\">My library</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"http://books.google.co.uk/support/topic/4359341?hl=en-GB\">Help</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://books.google.co.uk/advanced_book_search?q=billgatesmicrosoft\">Advanced Book Search</a><a href=\"https://play.google.com/store/books/details?id=af3PBQAAQBAJ&amp;rdid=book-af3PBQAAQBAJ&amp;rdot=1&amp;source=gbs_vpt_read&amp;pcampaignid=books_booksearch_viewport\" id=\"gb-get-book-content\">Buy eBook - \u00c2\u00a321.79</a><p id=\"gb-buy-options-trigger\" class=\"gb-buy-options-link\">Get this book in print</p><a name=\"buy_anchor\"></a><a href=\"http://abdopublishing.com/shop/show/6322\" dir=\"ltr\">ABDO</a><a href=\"http://www.amazon.co.uk/gp/search?index=books&amp;linkCode=qs&amp;keywords=9781629694603\" dir=\"ltr\">Amazon.co.uk</a><a href=\"http://www.bookdepository.com/book/9781629694603\" dir=\"ltr\">BookDepository</a><a href=\"http://www.waterstones.com/waterstonesweb/advancedSearch.do?buttonClicked=2&amp;isbn=1629694606\" dir=\"ltr\">Waterstone's</a><a href=\"http://www.whsmith.co.uk/CatalogAndSearch/SearchWithinCategory.aspx?as_ISBN=1629694606\" dir=\"ltr\">WHSmith</a><a href=\"http://bookshop.blackwell.co.uk/bobuk/scripts/home.jsp?action=search&amp;type=isbn&amp;term=1629694606\" dir=\"ltr\">Blackwell</a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&amp;pg=PA2&amp;q=http://worldcat.org/isbn/1629694606&amp;clientid=librarylink&amp;usg=AFQjCNF30N2K8V8cLKFtxjpcHRSt5RPkwg&amp;source=gbs_buy_r\">Find in a 
library</a><a class=\"secondary\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;sitesec=buy&amp;source=gbs_buy_r\" id=\"get-all-sellers-link\">All sellers\u00a0\u00bb</a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;printsec=frontcover\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;dq=billgatesmicrosoft&amp;sitesec=reviews\"></a> <a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;dq=billgatesmicrosoft&amp;sitesec=reviews\" class=\"sbs-count secondary\">0 Reviews</a><a id=\"write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&amp;hl=en\" class=\"secondary sbs-link\">Write review</a>https://books.google.com/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJBill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ...By Marylou Morano Kjelle \u00a0<p><a id=\"sidebar-atb-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;dq=billgatesmicrosoft&amp;source=gbs_navlinks_s\">About this book</a></p><a href=\"/intl/en/googlebooks/tos.html\" target=\"_blank\">Terms\u00a0of\u00a0Service</a><a name=\"pub_info_anchor\"></a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&amp;pg=PA2&amp;q=http://www.abdopublishing.com/index.html&amp;linkid=1&amp;usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&amp;source=gbs_pub_info_r\"></a>Pages displayed by permission of <a class=\"link_aux\" href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&amp;pg=PA2&amp;q=http://www.abdopublishing.com/index.html&amp;linkid=1&amp;usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&amp;source=gbs_pub_info_r\">ABDO</a>.\u00a0<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;printsec=copyright&amp;source=gbs_pub_info_r\">Copyright</a>.\u00a0Page 2<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA1&amp;lpg=PA2&amp;ots=kOoeyqnrmG&amp;focus=viewport&amp;dq=billgatesmicrosoft\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA3&amp;lpg=PA2&amp;ots=kOoeyqnrmG&amp;focus=viewport&amp;dq=billgatesmicrosoft\"></a>\u00a0\u00a0<a name=\"page\" accesskey=\"c\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA3&amp;lpg=PA2&amp;ots=kOoeyqnrmG&amp;focus=viewport&amp;dq=billgatesmicrosoft\"></a></div>"
    #print('Original HTML: ' + html[0:80])
    c = Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        style=True,
        links=True,
        meta=True,
        page_structure=True,
        processing_instructions=True,
        embedded=True,
        frames=True,
        forms=True,
        annoying_tags=True,
    )
    c.allow_tags = None
    c.remove_unknown_tags = True
    html = c.clean_html(html)
    print('Cleaned up HTML: ' + str(html))
Example 14
def html2content(
    html,
    allowed_tags=[
        "a", "abbr", "article", "aside", "b", "base", "blockquote", "body",
        "br", "caption", "cite", "code", "col", "colgroup", "dd", "del", "dfn",
        "dl", "dt", "em", "embed", "figcaption", "figure", "footer", "h1",
        "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
        "i", "img", "li", "map", "mark", "math", "meta", "meter", "nav",
        "noscript", "object", "ol", "optgroup", "option", "output", "p",
        "param", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp",
        "section", "small", "source", "span", "strong", "sub", "sup", "svg",
        "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr",
        "track", "u", "ul", "var", "video", "wbr"
    ]):
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    cleaner.embedded = False  # fixed typo: 'embeded' would silently create an unused attribute
    return cleaner.clean_html(html)
Example 15
    if nb_upper > nb_lower:
        return titlecase(title)
    else:
        return title


# HTML sanitizing for the title

overescaped_re = re.compile(r'&amp;#(\d+);')
unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])')
whitespace_re = re.compile(r'\s+')
ltgt_re = re.compile(r'.*[<>&]')

html_cleaner = Cleaner()
html_cleaner.allow_tags = ['sub', 'sup', 'b', 'span']
html_cleaner.remove_unknown_tags = False

html_killer = Cleaner()
html_killer.allow_tags = ['div']
html_killer.remove_unknown_tags = False
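
The two cleaners play different roles: html_cleaner whitelists a few inline-formatting tags for titles, while html_killer lets only <div> through, effectively flattening markup to text. A hedged sketch on illustrative input:

sample = 'H<sub>2</sub>O is <em>great</em>'
print(html_cleaner.clean_html(sample))  # keeps <sub>, unwraps the disallowed <em>
print(html_killer.clean_html(sample))   # text only (lxml may add a wrapper element)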

latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$')


def remove_latex_math_dollars(string):
    """
    Removes LaTeX dollar tags.

    >>> remove_latex_math_dollars(u'This is $\\\\beta$-reduction explained')
    u'This is \\\\beta-reduction explained'
    >>> remove_latex_math_dollars(u'Compare $\\\\frac{2}{3}$ to $\\\\pi$')
    u'Compare \\\\frac{2}{3} to \\\\pi'
    """
    return latexmath_re.sub(r'\1', string)
Example 16
        elif title[i].islower():
            nb_lower += 1

    if nb_upper > nb_lower:
        title = titlecase(title)
    return title

## HTML sanitizing for the title

overescaped_re = re.compile(r'&amp;#(\d+);')
unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])')
whitespace_re = re.compile(r'\s+')

html_cleaner = Cleaner()
html_cleaner.allow_tags = ['sub','sup','b','span']
html_cleaner.remove_unknown_tags = False

html_killer = Cleaner()
html_killer.allow_tags = ['div']
html_killer.remove_unknown_tags = False

latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$')
def remove_latex_math_dollars(string):
    return latexmath_re.sub(r'\1', string)

latex_command_re = re.compile(r'(\\([a-zA-Z]+|[.=\'"])({[^}]*})*)')
def unescape_latex(s):
    def conditional_replace(fragment):
        rep = unicode_tex.tex_to_unicode_map.get(fragment.group(0))
        return rep if rep is not None else fragment.group(0)
Example 17
# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
        cleaning_list.append('table')
    if include_images is True:
Example 18
}

logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try:
        dateobject = datetime.datetime.strptime(datestring, outputformat)
Example 19
def google_news_cut(link):
    cleaner = Cleaner()
    cleaner.javascript = True  # True enables the JavaScript filter
    cleaner.style = True  # True enables the style/stylesheet filter

    page = get_web_page(link)
    soup = BeautifulSoup(page, 'html.parser')
    # all_news = soup.find_all('a', 'nuEeue hzdq5d ME7ew')
    all_news = soup.find_all('a', 'ipQwMb Q7tWef')
    key_str = ""
    titles_link = []
    word_t_list = []
    documents = []
    for news in all_news:
        # print(news.string)
        # print(news['href'])
        if re.match(r'\./', news['href']) is None:
            link = news['href']
        else:
            link = 'https://news.google.com/' + re.sub(r'\./', "", news['href'])
        titles_link.append({'title': news.string, 'link': link})
        key_str = key_str + news.string + "\n"

    remove_words = [
        'mlb', 'nba', '新聞網', '中央社', '報紙', '聯合', '時報', '全網', '自己', '中時', '年月日',
        '直播', '三立', '聞網', '使用者', '中國時報', '自由時報', '關鍵字', '網站', '發表', '留言', '發言',
        '網小時', '自由'
    ]

    jieba.load_userdict("my_dict.txt")
    jieba.load_userdict("news_dict.txt")
    jieba.analyse.set_stop_words("stop_words.txt")
    jieba.analyse.set_stop_words("stop_words_sport.txt")

    for t_link in titles_link:

        print('get_web_page: ', t_link['title'], " ", t_link['link'])
        try:
            page = get_web_page_html(t_link['link'])
            # page = get_web_page(t_link['link'])
        except requests.exceptions.SSLError:
            continue
        except lxml.etree.ParserError:
            continue
        if page is None:
            continue
        cleaner.kill_tags = ['a', 'img']
        cleaner.remove_tags = ['div', 'p']
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = ['p']
        result = html.tostring(cleaner.clean_html(page),
                               encoding="utf-8",
                               pretty_print=True,
                               method="html")
        article_content = re.sub('&#13;', "", result.decode('utf-8'))

        #
        article_content = re.sub(u'[^\u4E00-\u9FA5]', " ", article_content)
        article_content = re.sub(r'[\n\xa0\W你妳我他她它們]', "", article_content)
        article_content = re.sub('自己', "", article_content)
        # print(article_content)
        words_t = jieba.cut(article_content, cut_all=False)
        word_t_list = [word for word in words_t if word not in remove_words]
        print(word_t_list)
        documents.append(word_t_list)
    return documents