Esempio n. 1
0
 def clean_article_html(cls, node):
     """Sanitize an article *node*, keeping only basic inline text markup.

     Scripts and styles are removed; tags outside the whitelist are
     stripped while their text content is preserved.
     """
     sanitizer = Cleaner()
     sanitizer.javascript = True   # drop <script> elements and JS attributes
     sanitizer.style = True        # drop <style> blocks and style attributes
     sanitizer.remove_unknown_tags = False  # required when allow_tags is set
     sanitizer.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
     return sanitizer.clean_html(node)
Esempio n. 2
0
def html2content(html, allowed_tags=("a", "abbr", "article", "aside",
                                     "b", "base", "blockquote", "body",
                                     "br", "caption", "cite", "code", "col", "colgroup",
                                     "dd", "del", "dfn", "dl", "dt",
                                     "em", "embed", "figcaption", "figure", "footer",
                                     "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
                                     "i", "img",
                                     "li",
                                     "map", "mark", "math", "meta", "meter",
                                     "nav", "noscript",
                                     "object", "ol", "optgroup", "option", "output",
                                     "p", "param", "pre", "progress",
                                     "q", "rp", "rt", "ruby",
                                     "s", "samp", "section", "small", "source", "span", "strong", "sub", "sup", "svg",
                                     "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr", "track",
                                     "u", "ul",
                                     "var", "video",
                                     "wbr")):
    """Reduce *html* to content markup, keeping only *allowed_tags*.

    The default whitelist (now an immutable tuple, avoiding the shared
    mutable-default pitfall) covers structural and inline content tags;
    scripts and styles are still removed by the cleaner.
    """
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False  # mandatory when allow_tags is set
    cleaner.page_structure = False       # keep <html>/<head>/<body>
    cleaner.meta = False                 # keep <meta> elements
    cleaner.style = True                 # drop styles
    # Fixed typo: the lxml option is spelled "embedded".  The misspelled
    # attribute was silently ignored, so embedded content (object/embed,
    # which the default whitelist includes) was being removed anyway.
    cleaner.embedded = False
    return cleaner.clean_html(html)
Esempio n. 3
0
def html_strict_cleaning(html, allow_tags=('p', 'br', 'a', 'img', 'div')):
    """Remove scripts/styles and any tag not in *allow_tags*.

    The default is now an immutable tuple rather than a shared mutable
    list (the classic mutable-default-argument pitfall).

    Returns the cleaned markup as a unicode string.
    """
    cleaner = Cleaner()
    cleaner.javascript = True  # drop <script> elements and JS attributes
    cleaner.style = True       # drop <style> blocks and style attributes
    cleaner.allow_tags = allow_tags
    cleaner.remove_unknown_tags = False  # required when allow_tags is set
    return lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(html)),
                              encoding='unicode')
Esempio n. 4
0
 def clean_article_html(cls, node):
     """Clean *node* for article display, keeping common formatting tags.

     Headings, code/pre blocks, quotes, images and basic inline markup
     survive; scripts, styles and every other tag are stripped.
     """
     keep = ['a', 'span', 'p', 'br', 'strong', 'b',
             'em', 'i', 'tt', 'code', 'pre', 'blockquote', 'img', 'h1',
             'h2', 'h3', 'h4', 'h5', 'h6']
     scrubber = Cleaner()
     scrubber.javascript = True
     scrubber.style = True
     scrubber.remove_unknown_tags = False  # needed alongside allow_tags
     scrubber.allow_tags = keep
     return scrubber.clean_html(node)
Esempio n. 5
0
def parse_content(content):
    """Escape *content* and parse it into an lxml element wrapped in <div>.

    Because the text is escaped before parsing, the result is a <div>
    element whose content is plain text, not markup.
    """
    # Removed dead code: an html_img_cleaner was constructed here (and its
    # allow_tags assigned twice) but never used by this function.
    xml_parser = lxml.etree.XMLParser(
        remove_blank_text=True, ns_clean=True, encoding="utf-8"
    )

    return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)
Esempio n. 6
0
def clean_article_html(cls, node):
    """Sanitize an article node, retaining text-level and list markup.

    Lists, headings, code/pre, quotes and basic inline tags are kept;
    scripts, styles and all other tags are removed (their text remains).
    """
    allowed = ['a', 'span', 'p', 'br', 'strong', 'b', 'ul', 'ol', 'li',
               'em', 'i', 'code', 'pre', 'blockquote', 'h1',
               'h2', 'h3', 'h4', 'h5', 'h6']
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    scrubber.allow_tags = allowed
    scrubber.remove_unknown_tags = False  # needed alongside allow_tags
    return scrubber.clean_html(node)
Esempio n. 7
0
	def clean(self):
		"""Strip all markup from every text field on this record, in place.

		Each field is cleaned with an aggressive Cleaner (no allowed tags,
		scripts/frames removed) and reduced to its plain text content.
		"""
		cleaner = Cleaner(page_structure=False)
		cleaner.javascript = True
		cleaner.scripts = True
		cleaner.frames = True
		cleaner.allow_tags = []
		cleaner.remove_tags = ['p', 'div', 'a']
		# One loop replaces seven copy-pasted clean-and-extract statements;
		# the field order matches the original statement order.
		for field in ('name', 'price', 'discountcode', 'categorycode',
				'orderdate', 'selldate', 'page'):
			cleaned = cleaner.clean_html(getattr(self, field))
			setattr(self, field,
				lxml.html.document_fromstring(cleaned).text_content())
Esempio n. 8
0
    def clearTag(self, text: str) -> str:
        """Return *text* with every HTML tag stripped and quotes escaped.

        Falls back to an empty string when the input cannot be parsed.
        """
        import lxml.html  # explicit: lxml.html is needed below, not just lxml
        from lxml.html.clean import Cleaner

        try:
            cleaner = Cleaner(remove_unknown_tags=False)
            # An empty-name whitelist keeps no real tags, so all markup
            # is removed while text content survives.
            cleaner.allow_tags = [""]
            return (lxml.html.tostring(
                cleaner.clean_html(lxml.html.document_fromstring(text)),
                encoding="unicode",
            ).replace("<div>", "").replace("</div>",
                                           "").replace("&#13;", "").replace(
                                               "'", "\\'").rstrip("\n\n"))
        except Exception:
            # Fixed: the bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; Exception keeps the best-effort fallback
            # without masking interpreter-level signals.
            return ""
Esempio n. 9
0
    def get_locations(self) -> list:
        """Scrape the locations dropdown from ``self.link``.

        Returns a list of Location objects built from the anchors found in
        the eighth ``ul.dropdown-menu`` on the page.
        """
        locations = []
        page = requests.get(self.link)
        tree = html.fromstring(page.text)
        # Index 7 hard-codes which dropdown holds the locations on this site.
        tag = tree.xpath('//ul[@class="dropdown-menu"]')[7]
        c = Cleaner()
        # Fixed: allow_tags must be a collection of tag names.  The original
        # assigned the bare string 'a', which only worked by accident because
        # iterating a one-character string yields that same character.
        c.allow_tags = ['a']
        c.remove_unknown_tags = False
        doc = etree.tostring(tag)
        for s in c.clean_html(doc).decode("utf-8").replace(
                "<div>", "").replace("</div>", "").strip().split('\n'):
            locations.append(
                Location(
                    regexTagContent.findall(s)[0].replace("<",
                                                          "").replace(">", ""),
                    regexHref.findall(s)[0]))

        return locations
Esempio n. 10
0
 def get_menu(self, location: str) -> Menu:
     """Fetch and parse the menu page for *location* into a Menu object."""
     food = []
     c = Cleaner()
     c.allow_tags = ['img']
     c.remove_unknown_tags = False
     page = requests.get(self.link + location)
     tree = html.fromstring(page.text)
     f = tree.xpath(
         '//div[@style="background-color:#ecf0f1;border-radius: 4px 4px 0px 0px; padding: 8px;"]'
     )
     doc = etree.tostring(f[0], pretty_print=True)
     # Clean once and reuse: the original ran the same clean/decode/replace
     # pipeline twice on the identical document.
     t = c.clean_html(doc).decode("utf-8").replace("<div>", "").replace(
         "</div>", "")
     dtime = self.__extract_date(t.strip().split("Essen "))
     food += self.__extract_food(regexFoodBlock.findall(t))
     return Menu(dtime, food)
Esempio n. 11
0
    def clean(self: T) -> str:
        """Build a Cleaner from the stored configuration, sanitize the
        stored input in place and return the cleaned markup.
        """
        cleaner = Cleaner()
        # Unconditional settings mirror the stored configuration directly.
        cleaner.style = self.__style
        cleaner.links = self.__links
        cleaner.page_structure = self.__page_structure
        cleaner.safe_attrs_only = self.__safe_attrs_only

        # lxml rejects allow_tags combined with remove_unknown_tags, so the
        # latter is switched off whenever a whitelist is supplied.
        if self.__allow_tags is not None:
            cleaner.remove_unknown_tags = False
            cleaner.allow_tags = self.__allow_tags

        # Optional settings are applied only when explicitly configured.
        for option, value in (("kill_tags", self.__kill_tags),
                              ("remove_tags", self.__remove_tags),
                              ("safe_attrs", self.__safe_attrs)):
            if value is not None:
                setattr(cleaner, option, value)

        self.__input = cleaner.clean_html(self.__input)
        return self.__input
Esempio n. 12
0
def main():
    """Demonstrate an aggressive lxml Cleaner pass on scraped markup.

    The sample below is one very long literal captured from a Google Books
    page; the cleaner is configured with every optional filter enabled.
    """
    # Sample input: a single large HTML fragment (link soup from a real page).
    html = "<div>Bill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ... - Marylou Morano Kjelle - Google BooksA privacy reminder from Google<a class=\"gb_od gb_7c\" tabindex=\"0\">Review now</a><a class=\"gb_od gb_nd\" tabindex=\"0\">I'll read this later</a><a class=\"gb_b gb_5b\" href=\"https://www.google.co.uk/intl/en/options/\" title=\"Google apps\" tabindex=\"0\"></a><a class=\"gb_O\" href=\"https://myaccount.google.com/?utm_source=OGB&amp;utm_medium=app\" id=\"gb192\">My Account</a><a class=\"gb_O\" href=\"https://www.google.co.uk/webhp?tab=pw&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCAgoAQ\" id=\"gb1\">Search</a><a class=\"gb_O\" href=\"https://maps.google.co.uk/maps?hl=en&amp;tab=pl\" id=\"gb8\">Maps</a><a class=\"gb_O\" href=\"https://www.youtube.com/?gl=GB\" id=\"gb36\">YouTube</a><a class=\"gb_O\" href=\"https://play.google.com/?hl=en&amp;tab=p8\" id=\"gb78\">Play</a><a class=\"gb_O\" href=\"https://news.google.co.uk/nwshp?hl=en&amp;tab=pn&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCAwoBQ\" id=\"gb5\">News</a><a class=\"gb_O\" href=\"https://mail.google.com/mail/?tab=pm\" id=\"gb23\">Gmail</a><a class=\"gb_O\" href=\"https://drive.google.com/?tab=po\" id=\"gb49\">Drive</a><a class=\"gb_O\" href=\"https://www.google.com/calendar?tab=pc\" id=\"gb24\">Calendar</a><a class=\"gb_O\" href=\"https://plus.google.com/?gpsrc=ogpy0&amp;tab=pX\" id=\"gb119\">Google+</a><a class=\"gb_O\" href=\"https://translate.google.co.uk/?hl=en&amp;tab=pT\" id=\"gb51\">Translate</a><a class=\"gb_O\" href=\"https://photos.google.com/?tab=pq&amp;pageId=none\" id=\"gb31\">Photos</a><a class=\"gb_ka gb_xf\" href=\"https://www.google.co.uk/intl/en/options/\">More</a><a class=\"gb_O\" href=\"http://www.google.co.uk/shopping?hl=en&amp;tab=pf&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCBMoDA\" id=\"gb6\">Shopping</a><a class=\"gb_O\" href=\"https://www.google.co.uk/finance?tab=pe\" id=\"gb27\">Finance</a><a class=\"gb_O\" 
href=\"https://docs.google.com/document/?usp=docs_alc\" id=\"gb25\">Docs</a><a class=\"gb_O\" href=\"https://books.google.co.uk/bkshp?hl=en&amp;tab=pp&amp;ei=MjliWbTaCcPq-AHcsbOYAQ&amp;ved=0EKkuCBYoDw\" id=\"gb10\">Books</a><a class=\"gb_O\" href=\"https://www.blogger.com/?tab=pj\" id=\"gb30\">Blogger</a><a class=\"gb_O\" href=\"https://www.google.com/contacts/?hl=en&amp;tab=pC\" id=\"gb53\">Contacts</a><a class=\"gb_O\" href=\"https://hangouts.google.com/\" id=\"gb300\">Hangouts</a><a class=\"gb_O\" href=\"https://keep.google.com/\" id=\"gb136\">Keep</a><a class=\"gb_la gb_sf\" href=\"https://www.google.co.uk/intl/en/options/\">Even more from Google</a><a class=\"gb_Bf gb_Ha gb_xb\" id=\"gb_70\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX&amp;hl=en\" target=\"_top\">Sign in</a><a class=\"gb_5d gb_3b\" href=\"https://books.google.co.uk/bkshp?hl=en&amp;tab=pp\" title=\"Books\"></a>Hidden fields<a class=\"consentBumpSlowLink\" href=\"//consent.google.com/?hl=en&amp;continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX%26ved%3D0ahUKEwi3oKipsPzUAhWcF8AKHSXwAc8Q6AEIkgEwFQ&amp;pc=ogb&amp;wp=71&amp;l=1&amp;if=1&amp;fld=2&amp;origin=https://books.google.co.uk\" target=\"_top\">Load basic HTML</a>\u00a0(for slow connections)<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA2&amp;lpg=PA2&amp;dq=billgatesmicrosoft&amp;source=bl&amp;ots=kOoeyqnrmG&amp;sig=O8vNTHW0AmC039_nJsnKiEucONQ&amp;hl=en&amp;sa=X&amp;output=html_text\" title=\"Screen reader users: click this link for accessible mode. 
Accessible mode has the same essential features but works better with your reader.\"></a><a href=\"/books\">Books</a><a id=\"appbar-write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&amp;hl=en\"></a><a id=\"appbar-view-print-sample-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;printsec=frontcover&amp;source=gbs_vpt_read\"></a><a id=\"appbar-view-ebook-sample-link\" href=\"https://play.google.com/books/reader?id=af3PBQAAQBAJ&amp;printsec=frontcover&amp;source=gbs_vpt_read\"></a><a id=\"appbar-patents-prior-art-finder-link\" href=\"\"></a><a id=\"appbar-patents-discuss-this-link\" href=\"\"></a><a id=\"appbar-read-patent-link\" href=\"\"></a><a id=\"appbar-download-pdf-link\" href=\"\"></a>books.google.co.uk - Learn who Bill Gates is, how Microsoft got its start, where it\u00e2\u0080\u0099s heading, and much more. Primary sources with accompanying questions, multiple prompts, timeline, index, and glossary also included. 
Core Library is an imprint of Abdo Publishing Company....https://books.google.co.uk/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJ&amp;utm_source=gb-gplus-shareBill Gates: Microsoft Founder and Philanthropist<a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fop%3Dlibrary&amp;hl=en\">My library</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"http://books.google.co.uk/support/topic/4359341?hl=en-GB\">Help</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://books.google.co.uk/advanced_book_search?q=billgatesmicrosoft\">Advanced Book Search</a><a href=\"https://play.google.com/store/books/details?id=af3PBQAAQBAJ&amp;rdid=book-af3PBQAAQBAJ&amp;rdot=1&amp;source=gbs_vpt_read&amp;pcampaignid=books_booksearch_viewport\" id=\"gb-get-book-content\">Buy eBook - \u00c2\u00a321.79</a><p id=\"gb-buy-options-trigger\" class=\"gb-buy-options-link\">Get this book in print</p><a name=\"buy_anchor\"></a><a href=\"http://abdopublishing.com/shop/show/6322\" dir=\"ltr\">ABDO</a><a href=\"http://www.amazon.co.uk/gp/search?index=books&amp;linkCode=qs&amp;keywords=9781629694603\" dir=\"ltr\">Amazon.co.uk</a><a href=\"http://www.bookdepository.com/book/9781629694603\" dir=\"ltr\">BookDepository</a><a href=\"http://www.waterstones.com/waterstonesweb/advancedSearch.do?buttonClicked=2&amp;isbn=1629694606\" dir=\"ltr\">Waterstone's</a><a href=\"http://www.whsmith.co.uk/CatalogAndSearch/SearchWithinCategory.aspx?as_ISBN=1629694606\" dir=\"ltr\">WHSmith</a><a href=\"http://bookshop.blackwell.co.uk/bobuk/scripts/home.jsp?action=search&amp;type=isbn&amp;term=1629694606\" dir=\"ltr\">Blackwell</a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&amp;pg=PA2&amp;q=http://worldcat.org/isbn/1629694606&amp;clientid=librarylink&amp;usg=AFQjCNF30N2K8V8cLKFtxjpcHRSt5RPkwg&amp;source=gbs_buy_r\">Find in a library</a><a class=\"secondary\" 
href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;sitesec=buy&amp;source=gbs_buy_r\" id=\"get-all-sellers-link\">All sellers\u00a0\u00bb</a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;printsec=frontcover\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;dq=billgatesmicrosoft&amp;sitesec=reviews\"></a> <a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;dq=billgatesmicrosoft&amp;sitesec=reviews\" class=\"sbs-count secondary\">0 Reviews</a><a id=\"write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&amp;continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&amp;hl=en\" class=\"secondary sbs-link\">Write review</a>https://books.google.com/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJBill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ...By Marylou Morano Kjelle \u00a0<p><a id=\"sidebar-atb-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;dq=billgatesmicrosoft&amp;source=gbs_navlinks_s\">About this book</a></p><a href=\"/intl/en/googlebooks/tos.html\" target=\"_blank\">Terms\u00a0of\u00a0Service</a><a name=\"pub_info_anchor\"></a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&amp;pg=PA2&amp;q=http://www.abdopublishing.com/index.html&amp;linkid=1&amp;usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&amp;source=gbs_pub_info_r\"></a>Pages displayed by permission of <a class=\"link_aux\" href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&amp;pg=PA2&amp;q=http://www.abdopublishing.com/index.html&amp;linkid=1&amp;usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&amp;source=gbs_pub_info_r\">ABDO</a>.\u00a0<a 
href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;printsec=copyright&amp;source=gbs_pub_info_r\">Copyright</a>.\u00a0Page 2<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA1&amp;lpg=PA2&amp;ots=kOoeyqnrmG&amp;focus=viewport&amp;dq=billgatesmicrosoft\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA3&amp;lpg=PA2&amp;ots=kOoeyqnrmG&amp;focus=viewport&amp;dq=billgatesmicrosoft\"></a>\u00a0\u00a0<a name=\"page\" accesskey=\"c\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&amp;pg=PA3&amp;lpg=PA2&amp;ots=kOoeyqnrmG&amp;focus=viewport&amp;dq=billgatesmicrosoft\"></a></div>"
    #print('Original HTML: ' + html[0:80])
    # Aggressive cleaning profile: every optional filter is switched on.
    c = Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        style=True,
        links=True,
        meta=True,
        page_structure=True,
        processing_instructions=True,
        embedded=True,
        frames=True,
        forms=True,
        annoying_tags=True,
    )
    # No whitelist: with remove_unknown_tags=True, tags lxml does not
    # recognise are dropped as well.
    c.allow_tags = None
    c.remove_unknown_tags = True
    html = c.clean_html(html)
    print('Cleaned up HTML: ' + str(html))
Esempio n. 13
0
def html2content(
    html,
    allowed_tags=(
        "a", "abbr", "article", "aside", "b", "base", "blockquote", "body",
        "br", "caption", "cite", "code", "col", "colgroup", "dd", "del", "dfn",
        "dl", "dt", "em", "embed", "figcaption", "figure", "footer", "h1",
        "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
        "i", "img", "li", "map", "mark", "math", "meta", "meter", "nav",
        "noscript", "object", "ol", "optgroup", "option", "output", "p",
        "param", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp",
        "section", "small", "source", "span", "strong", "sub", "sup", "svg",
        "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr",
        "track", "u", "ul", "var", "video", "wbr"
    )):
    """Reduce *html* to content markup, keeping only *allowed_tags*.

    The default whitelist (now an immutable tuple, avoiding the shared
    mutable-default pitfall) covers structural and inline content tags.
    """
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False  # mandatory when allow_tags is set
    cleaner.page_structure = False       # keep <html>/<head>/<body>
    cleaner.meta = False                 # keep <meta> elements
    cleaner.style = True                 # drop styles
    # Fixed typo: the lxml option is spelled "embedded".  The misspelled
    # attribute was silently ignored, so embedded content (object/embed,
    # which the whitelist includes) was being removed anyway.
    cleaner.embedded = False
    return cleaner.clean_html(html)
Esempio n. 14
0
def google_news_cut(link):
    """Fetch a Google News listing, download each linked article and return
    a list of jieba-tokenised word lists (one list per article).
    """
    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
    # These settings never change between articles, so configure them once
    # here instead of re-assigning them on every loop iteration.
    cleaner.kill_tags = ['a', 'img']
    cleaner.remove_tags = ['div', 'p']
    cleaner.remove_unknown_tags = False
    cleaner.allow_tags = ['p']

    page = get_web_page(link)
    soup = BeautifulSoup(page, 'html.parser')
    # all_news = soup.find_all('a', 'nuEeue hzdq5d ME7ew')
    all_news = soup.find_all('a', 'ipQwMb Q7tWef')
    titles_link = []
    documents = []
    for news in all_news:
        # Relative links ("./...") are resolved against the Google News host.
        # Raw strings fix the invalid "\." escape-sequence warnings.
        if re.match(r'\./', news['href']) is None:
            link = news['href']
        else:
            link = 'https://news.google.com/' + re.sub(r'\./', "", news['href'])
        titles_link.append({'title': news.string, 'link': link})
        # (a write-only key_str accumulator was removed here: it was built
        # from every title but never read afterwards)

    remove_words = [
        'mlb', 'nba', '新聞網', '中央社', '報紙', '聯合', '時報', '全網', '自己', '中時', '年月日',
        '直播', '三立', '聞網', '使用者', '中國時報', '自由時報', '關鍵字', '網站', '發表', '留言', '發言',
        '網小時', '自由'
    ]

    jieba.load_userdict("my_dict.txt")
    jieba.load_userdict("news_dict.txt")
    jieba.analyse.set_stop_words("stop_words.txt")
    jieba.analyse.set_stop_words("stop_words_sport.txt")

    for t_link in titles_link:

        print('get_web_page: ', t_link['title'], " ", t_link['link'])
        try:
            page = get_web_page_html(t_link['link'])
        except requests.exceptions.SSLError:
            continue
        except lxml.etree.ParserError:
            continue
        if page is None:
            continue
        result = html.tostring(cleaner.clean_html(page),
                               encoding="utf-8",
                               pretty_print=True,
                               method="html")
        article_content = re.sub('&#13;', "", result.decode('utf-8'))

        # Keep CJK ideographs only, then drop residual whitespace/pronouns.
        article_content = re.sub(u'[^\u4E00-\u9FA5]', " ", article_content)
        article_content = re.sub(r'[\n\xa0\W你妳我他她它們]', "", article_content)
        article_content = re.sub('自己', "", article_content)
        words_t = jieba.cut(article_content, cut_all=False)
        word_t_list = [word for word in words_t if word not in remove_words]
        print(word_t_list)
        documents.append(word_t_list)
    return documents
Esempio n. 15
0
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import lxml
from lxml.html.clean import Cleaner
#import json
#import codecs
from peewee import *
import datetime

# Module-level cleaner shared by the pipeline.
cleaner = Cleaner()
cleaner.javascript = True  # drop <script> elements and JS attributes
cleaner.style = True       # drop <style> blocks and style attributes
cleaner.comments = True    # drop HTML comments
# NOTE(review): allow_tags expects a list of tag names or None; False is
# falsy, so lxml treats it as "not set" -- presumably None was intended.
cleaner.allow_tags = False
cleaner.links = False           # keep <link> elements
cleaner.page_structure = False  # keep <html>/<head>/<body>

# NOTE(review): credentials look like redacted placeholders -- verify
# real values are injected via configuration before deploying.
db = MySQLDatabase('scrap', user='******', passwd='')

class BaseModel(Model):
    """Common ancestor for every peewee model in this pipeline.

    The inner ``Meta`` class binds all subclasses to the shared ``db``
    connection, so individual models never configure a database themselves.
    """

    class Meta:
        database = db


class Texts(BaseModel):
Esempio n. 16
0
    if nb_upper > nb_lower:
        return titlecase(title)
    else:
        return title


# HTML sanitizing for the title

# Doubly-escaped numeric character references, e.g. "&amp;#8212;".
overescaped_re = re.compile(r'&amp;#(\d+);')
# A literal \uXXXX escape not followed by a further hex-like character.
unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])')
# Runs of whitespace (for collapsing).
whitespace_re = re.compile(r'\s+')
# Any string still containing markup or entity characters.
ltgt_re = re.compile(r'.*[<>&]')

# Keeps only a few harmless inline tags; everything else is stripped
# (remove_unknown_tags must be off when allow_tags is used).
html_cleaner = Cleaner()
html_cleaner.allow_tags = ['sub', 'sup', 'b', 'span']
html_cleaner.remove_unknown_tags = False

# Strips every tag: only <div> (the wrapper lxml adds) is allowed.
html_killer = Cleaner()
html_killer.allow_tags = ['div']
html_killer.remove_unknown_tags = False

# Inline LaTeX math delimited by dollar signs, e.g. "$x+y$".
latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$')


def remove_latex_math_dollars(string):
    """
    Removes LaTeX dollar tags.

    >>> remove_latex_math_dollars(u'This is $\\\\beta$-reduction explained')
    u'This is \\\\beta-reduction explained'
Esempio n. 17
0
            nb_upper += 1
        elif title[i].islower():
            nb_lower += 1

    if nb_upper > nb_lower:
        title = titlecase(title)
    return title

## HTML sanitizing for the title

# Doubly-escaped numeric character references, e.g. "&amp;#8212;".
overescaped_re = re.compile(r'&amp;#(\d+);')
# A literal \uXXXX escape not followed by a further hex-like character.
unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])')
# Runs of whitespace (for collapsing).
whitespace_re = re.compile(r'\s+')

# Keeps only a few harmless inline tags; everything else is stripped
# (remove_unknown_tags must be off when allow_tags is used).
html_cleaner = Cleaner()
html_cleaner.allow_tags = ['sub','sup','b','span']
html_cleaner.remove_unknown_tags = False

# Strips every tag: only <div> (the wrapper lxml adds) is allowed.
html_killer = Cleaner()
html_killer.allow_tags = ['div']
html_killer.remove_unknown_tags = False

# Inline LaTeX math delimited by dollar signs, e.g. "$x+y$".
latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$')


def remove_latex_math_dollars(string):
    """Drop the $...$ delimiters around inline math, keeping the body."""
    return latexmath_re.sub(r'\1', string)

latex_command_re = re.compile(r'(\\([a-zA-Z]+|[.=\'"])({[^}]*})*)')
def unescape_latex(s):
    def conditional_replace(fragment):
        rep = unicode_tex.tex_to_unicode_map.get(fragment.group(0))
        return rep if rep is not None else fragment.group(0)
# NOTE: Python 2 script (print-statement syntax).
# Usage: python <script>.py <flag_separate_files> <read_path> <write_path>
if len(sys.argv) != 4:
    print "EXEC: python " + WRITE_FILE_NAME + ".py 'flag_separate_files' 'read_path' 'write_path'"
    sys.exit()

# Any other argv[1] value means "write everything to one output file".
TRUTH = ["true", "TRUE", "1"]
FLAG_SEPARATE_FILES = sys.argv[1] in TRUTH
READ_PATH = sys.argv[2]
WRITE_PATH = sys.argv[3]

if not os.path.exists(WRITE_PATH):
    os.makedirs(WRITE_PATH)

# allow_tags=[''] whitelists no real tag name, so effectively all markup
# is removed; scripts and styles are stripped explicitly as well.
cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.allow_tags = ['']
cleaner.remove_unknown_tags = False
i = 0
# NOTE(review): writer handles are opened but never closed/flushed here --
# and no write call is visible in this excerpt; confirm against the full
# script before relying on its output.
if not FLAG_SEPARATE_FILES:
    writer = open(WRITE_PATH + "/file.txt", 'w')
for path, dirs, files in os.walk(READ_PATH):
    for filename in files:
        fullpath = os.path.join(path, filename)
        with open(fullpath, 'r') as f:
            i += 1
            if FLAG_SEPARATE_FILES:
                writer = open(WRITE_PATH + "/file" + str(i) + ".txt", 'w')
            data = f.read()
            massive_clean = cleaner.clean_html(data)
            # Strip any tags the cleaner left behind, then collapse the
            # whitespace runs that tag removal leaves in the text.
            remaining_tags = re.sub('<[^>]*>', '', massive_clean)
            spaces = re.sub('\s{2,}', '', remaining_tags)
Esempio n. 19
0
		#print("cap response is " + str(the_page));
		#print(the_page['success']);
		#print("what");
		if ( the_page['success'] != True):
			return jsonify(success=False);#return empty object		
	except urllib2.URLError, e:
		return jsonify(success=False); #return empty object
		

	#so the captcha is valid. Now clean the user data
	cleaner = Cleaner()
	cleaner.javascript = True # This is True because we want to activate the javascript filter
	cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
	cleaner.scripts = True
	cleaner.links = True
	cleaner.allow_tags = None
	
	
	name = cleaner.clean_html(name)
	phone = cleaner.clean_html(phone)
	email = cleaner.clean_html(email)
	message = cleaner.clean_html(message)
	
	#build the email
	newMess = mail.EmailMessage();
	newMess.sender ="pizzaoptimization <*****@*****.**>"
	newMess.subject = escape(strip_tags("Website Contact for tutoring:  "+ name))
	newMess.to = "pizzaoptimization <*****@*****.**>"
	newMess.body = escape(strip_tags("Name: " + name + "\nemail: " + email + "\nphone: " + phone + "\nmessage: " + message))
	
	#send the email