def clean_article_html(cls, node):
    """Return *node* with scripts and styles removed, keeping only basic
    inline markup (links, spans, paragraphs, breaks, bold/emphasis)."""
    cleaner = Cleaner()
    cleaner.javascript = True            # strip <script> elements / JS attributes
    cleaner.style = True                 # strip <style> elements / style attributes
    cleaner.remove_unknown_tags = False  # required whenever allow_tags is set
    cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
    return cleaner.clean_html(node)
def html2content(
    html,
    # Immutable tuple default: the original used a mutable list default,
    # a classic Python pitfall (shared across calls).
    allowed_tags=(
        "a", "abbr", "article", "aside", "b", "base", "blockquote", "body",
        "br", "caption", "cite", "code", "col", "colgroup", "dd", "del",
        "dfn", "dl", "dt", "em", "embed", "figcaption", "figure", "footer",
        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup",
        "hr", "html", "i", "img", "li", "map", "mark", "math", "meta",
        "meter", "nav", "noscript", "object", "ol", "optgroup", "option",
        "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby",
        "s", "samp", "section", "small", "source", "span", "strong", "sub",
        "sup", "svg", "table", "tbody", "td", "th", "thead", "tfoot",
        "time", "title", "tr", "track", "u", "ul", "var", "video", "wbr",
    ),
):
    """Clean *html*, keeping only the tags listed in *allowed_tags*.

    Page structure (<html>/<head>/<body>) and <meta> tags are preserved;
    styles are stripped.  Returns the cleaned markup.
    """
    cleaner = Cleaner()
    cleaner.allow_tags = list(allowed_tags)
    cleaner.remove_unknown_tags = False  # mandatory when allow_tags is used
    cleaner.page_structure = False       # keep structural tags
    cleaner.meta = False                 # keep <meta>
    cleaner.style = True                 # drop styles
    # BUG FIX: the original set ``cleaner.embeded`` (a typo).  lxml silently
    # accepts the unknown attribute, so ``embedded`` kept its default of
    # True and embedded content was removed even though "embed", "object"
    # and "video" are in the allowed tags.  The correct attribute is
    # ``embedded``.
    cleaner.embedded = False
    return cleaner.clean_html(html)
def html_strict_cleaning(html, allow_tags=['p', 'br', 'a', 'img', 'div']):
    """Parse *html*, drop scripts and styles, keep only *allow_tags*, and
    return the sanitized markup as a unicode string."""
    tree = lxml.html.fromstring(html)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.remove_unknown_tags = False
    cleaner.allow_tags = allow_tags
    cleaned_tree = cleaner.clean_html(tree)
    return lxml.html.tostring(cleaned_tree, encoding='unicode')
def clean_article_html(cls, node):
    """Sanitize an article node: remove scripts and styles while keeping a
    whitelist of inline, heading, code and image tags."""
    allowed = [
        'a', 'span', 'p', 'br', 'strong', 'b', 'em', 'i', 'tt',
        'code', 'pre', 'blockquote', 'img',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    ]
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.allow_tags = allowed
    cleaner.remove_unknown_tags = False
    return cleaner.clean_html(node)
def parse_content(content):
    """Escape *content* and parse it as XML under a synthetic ``<div>`` root.

    Escaping first means any markup inside *content* survives as literal
    text rather than being parsed into child elements.  Returns the root
    lxml element.
    """
    # NOTE(review): the original also built an img-only lxml Cleaner here
    # (constructed and configured, but never applied to anything); that
    # dead code has been removed.
    xml_parser = lxml.etree.XMLParser(
        remove_blank_text=True, ns_clean=True, encoding="utf-8"
    )
    return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)
def clean_article_html(cls, node):
    """Scrub *node* of scripts and styles, keeping list, heading, code and
    basic inline markup."""
    keep = [
        'a', 'span', 'p', 'br', 'strong', 'b', 'ul', 'ol', 'li',
        'em', 'i', 'code', 'pre', 'blockquote',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    ]
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.remove_unknown_tags = False
    cleaner.allow_tags = keep
    return cleaner.clean_html(node)
def clean(self):
    """Strip all HTML from every text field of this object, in place.

    Each field is run through an lxml Cleaner (scripts/frames removed,
    <p>/<div>/<a> tags unwrapped) and then reduced to its text content.
    """
    cleaner = Cleaner(page_structure=False)
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.allow_tags = []
    cleaner.remove_tags = ['p', 'div', 'a']
    # REFACTOR: the original repeated the identical clean-then-text_content
    # pipeline once per field; loop over the field names instead.  Order
    # matches the original statement order.
    for field in ('name', 'price', 'discountcode', 'categorycode',
                  'orderdate', 'selldate', 'page'):
        cleaned = cleaner.clean_html(getattr(self, field))
        setattr(self, field,
                lxml.html.document_fromstring(cleaned).text_content())
def clearTag(self, text: str) -> str:
    """Strip every HTML tag from *text*, remove spaces, escape single
    quotes, and trim trailing newlines.  Returns "" on any failure.

    :param text: raw HTML fragment/document.
    :return: cleaned plain text (possibly empty).
    """
    # Import the submodule explicitly: the original's bare ``import lxml``
    # only worked because the ``from lxml.html.clean`` import below happens
    # to load ``lxml.html`` as a side effect.
    import lxml.html
    from lxml.html.clean import Cleaner
    try:
        cleaner = Cleaner(remove_unknown_tags=False)
        cleaner.allow_tags = [""]  # allow no tags: everything is unwrapped
        cleaned = lxml.html.tostring(
            cleaner.clean_html(lxml.html.document_fromstring(text)),
            encoding="unicode",
        )
        return (cleaned.replace("<div>", "").replace("</div>", "")
                .replace(" ", "").replace("'", "\\'").rstrip("\n\n"))
    except Exception:
        # BUG FIX: was a bare ``except:``, which also swallowed
        # SystemExit/KeyboardInterrupt.  Parse failures still yield "".
        return ""
def get_locations(self) -> list:
    """Scrape the location dropdown from ``self.link`` and return a list of
    ``Location`` objects (name, href) parsed from its <a> entries."""
    locations = []
    page = requests.get(self.link)
    tree = html.fromstring(page.text)
    # The 8th dropdown-menu <ul> on the page holds the locations.
    tag = tree.xpath('//ul[@class="dropdown-menu"]')[7]
    c = Cleaner()
    # BUG FIX: allow_tags was the bare string 'a', which only worked by
    # accident (lxml's membership test on a string does substring checks);
    # use a proper list of tag names.
    c.allow_tags = ['a']
    c.remove_unknown_tags = False
    doc = etree.tostring(tag)
    cleaned = c.clean_html(doc).decode("utf-8")
    for s in cleaned.replace("<div>", "").replace("</div>", "").strip().split('\n'):
        locations.append(
            Location(
                regexTagContent.findall(s)[0].replace("<", "").replace(">", ""),
                regexHref.findall(s)[0]))
    return locations
def get_menu(self, location: str) -> Menu:
    """Fetch the menu page for *location* and parse it into a ``Menu``.

    :param location: path suffix appended to ``self.link``.
    :return: Menu built from the extracted date and food entries.
    """
    food = []
    c = Cleaner()
    c.allow_tags = ['img']        # keep only images; unwrap everything else
    c.remove_unknown_tags = False
    page = requests.get(self.link + location)
    tree = html.fromstring(page.text)
    f = tree.xpath(
        '//div[@style="background-color:#ecf0f1;border-radius: 4px 4px 0px 0px; padding: 8px;"]'
    )
    doc = etree.tostring(f[0], pretty_print=True)
    # PERF FIX: the original ran clean_html over the same document twice
    # with identical settings; clean once and reuse the result.
    t = c.clean_html(doc).decode("utf-8").replace("<div>", "").replace(
        "</div>", "")
    dtime = self.__extract_date(t.strip().split("Essen "))
    food += self.__extract_food(regexFoodBlock.findall(t))
    return Menu(dtime, food)
def clean(self: T) -> str:
    """Build an lxml Cleaner from the configured options, run it over the
    stored input, cache the result back on the instance, and return it."""
    cleaner = Cleaner()
    cleaner.style = self.__style
    cleaner.links = self.__links
    cleaner.page_structure = self.__page_structure
    cleaner.safe_attrs_only = self.__safe_attrs_only
    if self.__allow_tags is not None:
        # allow_tags and remove_unknown_tags can't work together
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = self.__allow_tags
    # Optional tag/attribute overrides, applied only when configured.
    for option, value in (("kill_tags", self.__kill_tags),
                          ("remove_tags", self.__remove_tags),
                          ("safe_attrs", self.__safe_attrs)):
        if value is not None:
            setattr(cleaner, option, value)
    self.__input = cleaner.clean_html(self.__input)
    return self.__input
# Demo script: run lxml's Cleaner with nearly every filter enabled
# (scripts, javascript, comments, style, links, meta, page_structure,
# processing instructions, embedded, frames, forms, annoying_tags) over a
# captured Google Books results page, stored inline as one large HTML
# string, then print the sanitized result.
# NOTE(review): with allow_tags = None and remove_unknown_tags = True the
# Cleaner presumably falls back to lxml's built-in default tag whitelist —
# confirm against the lxml.html.clean docs for the installed version.
def main(): html = "<div>Bill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ... - Marylou Morano Kjelle - Google BooksA privacy reminder from Google<a class=\"gb_od gb_7c\" tabindex=\"0\">Review now</a><a class=\"gb_od gb_nd\" tabindex=\"0\">I'll read this later</a><a class=\"gb_b gb_5b\" href=\"https://www.google.co.uk/intl/en/options/\" title=\"Google apps\" tabindex=\"0\"></a><a class=\"gb_O\" href=\"https://myaccount.google.com/?utm_source=OGB&utm_medium=app\" id=\"gb192\">My Account</a><a class=\"gb_O\" href=\"https://www.google.co.uk/webhp?tab=pw&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCAgoAQ\" id=\"gb1\">Search</a><a class=\"gb_O\" href=\"https://maps.google.co.uk/maps?hl=en&tab=pl\" id=\"gb8\">Maps</a><a class=\"gb_O\" href=\"https://www.youtube.com/?gl=GB\" id=\"gb36\">YouTube</a><a class=\"gb_O\" href=\"https://play.google.com/?hl=en&tab=p8\" id=\"gb78\">Play</a><a class=\"gb_O\" href=\"https://news.google.co.uk/nwshp?hl=en&tab=pn&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCAwoBQ\" id=\"gb5\">News</a><a class=\"gb_O\" href=\"https://mail.google.com/mail/?tab=pm\" id=\"gb23\">Gmail</a><a class=\"gb_O\" href=\"https://drive.google.com/?tab=po\" id=\"gb49\">Drive</a><a class=\"gb_O\" href=\"https://www.google.com/calendar?tab=pc\" id=\"gb24\">Calendar</a><a class=\"gb_O\" href=\"https://plus.google.com/?gpsrc=ogpy0&tab=pX\" id=\"gb119\">Google+</a><a class=\"gb_O\" href=\"https://translate.google.co.uk/?hl=en&tab=pT\" id=\"gb51\">Translate</a><a class=\"gb_O\" href=\"https://photos.google.com/?tab=pq&pageId=none\" id=\"gb31\">Photos</a><a class=\"gb_ka gb_xf\" href=\"https://www.google.co.uk/intl/en/options/\">More</a><a class=\"gb_O\" href=\"http://www.google.co.uk/shopping?hl=en&tab=pf&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCBMoDA\" id=\"gb6\">Shopping</a><a class=\"gb_O\" href=\"https://www.google.co.uk/finance?tab=pe\" id=\"gb27\">Finance</a><a class=\"gb_O\" href=\"https://docs.google.com/document/?usp=docs_alc\" id=\"gb25\">Docs</a><a 
class=\"gb_O\" href=\"https://books.google.co.uk/bkshp?hl=en&tab=pp&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCBYoDw\" id=\"gb10\">Books</a><a class=\"gb_O\" href=\"https://www.blogger.com/?tab=pj\" id=\"gb30\">Blogger</a><a class=\"gb_O\" href=\"https://www.google.com/contacts/?hl=en&tab=pC\" id=\"gb53\">Contacts</a><a class=\"gb_O\" href=\"https://hangouts.google.com/\" id=\"gb300\">Hangouts</a><a class=\"gb_O\" href=\"https://keep.google.com/\" id=\"gb136\">Keep</a><a class=\"gb_la gb_sf\" href=\"https://www.google.co.uk/intl/en/options/\">Even more from Google</a><a class=\"gb_Bf gb_Ha gb_xb\" id=\"gb_70\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX&hl=en\" target=\"_top\">Sign in</a><a class=\"gb_5d gb_3b\" href=\"https://books.google.co.uk/bkshp?hl=en&tab=pp\" title=\"Books\"></a>Hidden fields<a class=\"consentBumpSlowLink\" href=\"//consent.google.com/?hl=en&continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX%26ved%3D0ahUKEwi3oKipsPzUAhWcF8AKHSXwAc8Q6AEIkgEwFQ&pc=ogb&wp=71&l=1&if=1&fld=2&origin=https://books.google.co.uk\" target=\"_top\">Load basic HTML</a>\u00a0(for slow connections)<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA2&lpg=PA2&dq=billgatesmicrosoft&source=bl&ots=kOoeyqnrmG&sig=O8vNTHW0AmC039_nJsnKiEucONQ&hl=en&sa=X&output=html_text\" title=\"Screen reader users: click this link for accessible mode. 
Accessible mode has the same essential features but works better with your reader.\"></a><a href=\"/books\">Books</a><a id=\"appbar-write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&hl=en\"></a><a id=\"appbar-view-print-sample-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&printsec=frontcover&source=gbs_vpt_read\"></a><a id=\"appbar-view-ebook-sample-link\" href=\"https://play.google.com/books/reader?id=af3PBQAAQBAJ&printsec=frontcover&source=gbs_vpt_read\"></a><a id=\"appbar-patents-prior-art-finder-link\" href=\"\"></a><a id=\"appbar-patents-discuss-this-link\" href=\"\"></a><a id=\"appbar-read-patent-link\" href=\"\"></a><a id=\"appbar-download-pdf-link\" href=\"\"></a>books.google.co.uk - Learn who Bill Gates is, how Microsoft got its start, where it\u00e2\u0080\u0099s heading, and much more. Primary sources with accompanying questions, multiple prompts, timeline, index, and glossary also included. 
Core Library is an imprint of Abdo Publishing Company....https://books.google.co.uk/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJ&utm_source=gb-gplus-shareBill Gates: Microsoft Founder and Philanthropist<a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fop%3Dlibrary&hl=en\">My library</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"http://books.google.co.uk/support/topic/4359341?hl=en-GB\">Help</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://books.google.co.uk/advanced_book_search?q=billgatesmicrosoft\">Advanced Book Search</a><a href=\"https://play.google.com/store/books/details?id=af3PBQAAQBAJ&rdid=book-af3PBQAAQBAJ&rdot=1&source=gbs_vpt_read&pcampaignid=books_booksearch_viewport\" id=\"gb-get-book-content\">Buy eBook - \u00c2\u00a321.79</a><p id=\"gb-buy-options-trigger\" class=\"gb-buy-options-link\">Get this book in print</p><a name=\"buy_anchor\"></a><a href=\"http://abdopublishing.com/shop/show/6322\" dir=\"ltr\">ABDO</a><a href=\"http://www.amazon.co.uk/gp/search?index=books&linkCode=qs&keywords=9781629694603\" dir=\"ltr\">Amazon.co.uk</a><a href=\"http://www.bookdepository.com/book/9781629694603\" dir=\"ltr\">BookDepository</a><a href=\"http://www.waterstones.com/waterstonesweb/advancedSearch.do?buttonClicked=2&isbn=1629694606\" dir=\"ltr\">Waterstone's</a><a href=\"http://www.whsmith.co.uk/CatalogAndSearch/SearchWithinCategory.aspx?as_ISBN=1629694606\" dir=\"ltr\">WHSmith</a><a href=\"http://bookshop.blackwell.co.uk/bobuk/scripts/home.jsp?action=search&type=isbn&term=1629694606\" dir=\"ltr\">Blackwell</a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&pg=PA2&q=http://worldcat.org/isbn/1629694606&clientid=librarylink&usg=AFQjCNF30N2K8V8cLKFtxjpcHRSt5RPkwg&source=gbs_buy_r\">Find in a library</a><a class=\"secondary\" 
href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&sitesec=buy&source=gbs_buy_r\" id=\"get-all-sellers-link\">All sellers\u00a0\u00bb</a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&printsec=frontcover\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&dq=billgatesmicrosoft&sitesec=reviews\"></a> <a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&dq=billgatesmicrosoft&sitesec=reviews\" class=\"sbs-count secondary\">0 Reviews</a><a id=\"write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&hl=en\" class=\"secondary sbs-link\">Write review</a>https://books.google.com/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJBill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ...By Marylou Morano Kjelle \u00a0<p><a id=\"sidebar-atb-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&dq=billgatesmicrosoft&source=gbs_navlinks_s\">About this book</a></p><a href=\"/intl/en/googlebooks/tos.html\" target=\"_blank\">Terms\u00a0of\u00a0Service</a><a name=\"pub_info_anchor\"></a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&pg=PA2&q=http://www.abdopublishing.com/index.html&linkid=1&usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&source=gbs_pub_info_r\"></a>Pages displayed by permission of <a class=\"link_aux\" href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&pg=PA2&q=http://www.abdopublishing.com/index.html&linkid=1&usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&source=gbs_pub_info_r\">ABDO</a>.\u00a0<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&printsec=copyright&source=gbs_pub_info_r\">Copyright</a>.\u00a0Page 2<a 
href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA1&lpg=PA2&ots=kOoeyqnrmG&focus=viewport&dq=billgatesmicrosoft\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA3&lpg=PA2&ots=kOoeyqnrmG&focus=viewport&dq=billgatesmicrosoft\"></a>\u00a0\u00a0<a name=\"page\" accesskey=\"c\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA3&lpg=PA2&ots=kOoeyqnrmG&focus=viewport&dq=billgatesmicrosoft\"></a></div>" #print('Original HTML: ' + html[0:80]) c = Cleaner( scripts=True, javascript=True, comments=True, style=True, links=True, meta=True, page_structure=True, processing_instructions=True, embedded=True, frames=True, forms=True, annoying_tags=True, ) c.allow_tags = None c.remove_unknown_tags = True html = c.clean_html(html) print('Cleaned up HTML: ' + str(html))  # prints the sanitized markup
def html2content(
    html,
    # Immutable tuple default: the original used a mutable list default,
    # a classic Python pitfall (shared across calls).
    allowed_tags=(
        "a", "abbr", "article", "aside", "b", "base", "blockquote", "body",
        "br", "caption", "cite", "code", "col", "colgroup", "dd", "del",
        "dfn", "dl", "dt", "em", "embed", "figcaption", "figure", "footer",
        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup",
        "hr", "html", "i", "img", "li", "map", "mark", "math", "meta",
        "meter", "nav", "noscript", "object", "ol", "optgroup", "option",
        "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby",
        "s", "samp", "section", "small", "source", "span", "strong", "sub",
        "sup", "svg", "table", "tbody", "td", "th", "thead", "tfoot",
        "time", "title", "tr", "track", "u", "ul", "var", "video", "wbr",
    ),
):
    """Clean *html*, keeping only the tags listed in *allowed_tags*.

    Page structure (<html>/<head>/<body>) and <meta> tags are preserved;
    styles are stripped.  Returns the cleaned markup.
    """
    cleaner = Cleaner()
    cleaner.allow_tags = list(allowed_tags)
    cleaner.remove_unknown_tags = False  # mandatory when allow_tags is used
    cleaner.page_structure = False       # keep structural tags
    cleaner.meta = False                 # keep <meta>
    cleaner.style = True                 # drop styles
    # BUG FIX: the original set ``cleaner.embeded`` (a typo).  lxml silently
    # accepts the unknown attribute, so ``embedded`` kept its default of
    # True and embedded content was removed even though "embed", "object"
    # and "video" are in the allowed tags.  The correct attribute is
    # ``embedded``.
    cleaner.embedded = False
    return cleaner.clean_html(html)
# Scrapes Google News headlines with BeautifulSoup, follows each article
# link, strips the article markup down with an lxml Cleaner
# (kill a/img, unwrap div/p, keep p), then segments the remaining Chinese
# text with jieba (custom dictionaries + stop-word lists) and filters a
# hand-maintained noise-word list.  Returns one token list per article.
# NOTE(review): cleaner.kill_tags / remove_tags / remove_unknown_tags /
# allow_tags are reassigned with constant values on every loop iteration —
# they could be hoisted above the loop; left untouched here.
def google_news_cut(link): cleaner = Cleaner() cleaner.javascript = True # This is True because we want to activate the javascript filter cleaner.style = True # This is True because we want to activate the styles & stylesheet filter page = get_web_page(link) soup = BeautifulSoup(page, 'html.parser') # all_news = soup.find_all('a', 'nuEeue hzdq5d ME7ew') all_news = soup.find_all('a', 'ipQwMb Q7tWef') key_str = "" titles_link = [] word_t_list = [] documents = [] for news in all_news: # print(news.string) # print(news['href']) if re.match('\./', news['href']) is None: link = news['href'] else: link = 'https://news.google.com/' + re.sub('\./', "", news['href']) titles_link.append({'title': news.string, 'link': link}) key_str = key_str + news.string + "\n" remove_words = [ 'mlb', 'nba', '新聞網', '中央社', '報紙', '聯合', '時報', '全網', '自己', '中時', '年月日', '直播', '三立', '聞網', '使用者', '中國時報', '自由時報', '關鍵字', '網站', '發表', '留言', '發言', '網小時', '自由' ] jieba.load_userdict("my_dict.txt") jieba.load_userdict("news_dict.txt") jieba.analyse.set_stop_words("stop_words.txt") jieba.analyse.set_stop_words("stop_words_sport.txt") for t_link in titles_link: print('get_web_page: ', t_link['title'], " ", t_link['link']) try: page = get_web_page_html(t_link['link']) # page = get_web_page(t_link['link']) except requests.exceptions.SSLError: continue except lxml.etree.ParserError: continue if page is None: continue cleaner.kill_tags = ['a', 'img'] cleaner.remove_tags = ['div', 'p'] cleaner.remove_unknown_tags = False cleaner.allow_tags = ['p'] result = html.tostring(cleaner.clean_html(page), encoding="utf-8", pretty_print=True, method="html") article_content = re.sub(' ', "", result.decode('utf-8')) # article_content = re.sub(u'[^\u4E00-\u9FA5]', " ", article_content) article_content = re.sub(r'[\n\xa0\W你妳我他她它們]', "", article_content) article_content = re.sub('自己', "", article_content) # print(article_content) words_t = jieba.cut(article_content, cut_all=False) word_t_list = [word for word in words_t if word 
not in remove_words] print(word_t_list) documents.append(word_t_list) return documents
# Scrapy item-pipeline module (see the original scrapy docs link below):
# configures a module-level lxml Cleaner (strip javascript/styles/comments,
# keep links and page structure) and a shared peewee MySQL connection used
# by the model classes that follow.
# NOTE(review): the DB credentials are placeholders and the password is
# empty; ``from peewee import *`` is a wildcard import; and the
# ``class Texts(BaseModel):`` body is truncated in this view.
# # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html import lxml from lxml.html.clean import Cleaner #import json #import codecs from peewee import * import datetime cleaner = Cleaner() cleaner.javascript = True cleaner.style = True cleaner.comments = True cleaner.allow_tags = False cleaner.links = False cleaner.page_structure = False db = MySQLDatabase('scrap', user='******', passwd='') class BaseModel(Model): """ Base peewee DB model """ class Meta: database = db class Texts(BaseModel):
# Fragment: tail of a title-casing helper (truncated above), followed by
# module-level compiled regexes and two lxml Cleaner instances used for
# sanitizing titles — html_cleaner keeps sub/sup/b/span, html_killer keeps
# only div — and the start of remove_latex_math_dollars, whose docstring is
# truncated below.  Left byte-identical; only this note added.
if nb_upper > nb_lower: return titlecase(title) else: return title # HTML sanitizing for the title overescaped_re = re.compile(r'&#(\d+);') unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])') whitespace_re = re.compile(r'\s+') ltgt_re = re.compile(r'.*[<>&]') html_cleaner = Cleaner() html_cleaner.allow_tags = ['sub', 'sup', 'b', 'span'] html_cleaner.remove_unknown_tags = False html_killer = Cleaner() html_killer.allow_tags = ['div'] html_killer.remove_unknown_tags = False latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$') def remove_latex_math_dollars(string): """ Removes LaTeX dollar tags. >>> remove_latex_math_dollars(u'This is $\\\\beta$-reduction explained') u'This is \\\\beta-reduction explained'
# Fragment (variant of the block above): tail of a title-casing helper
# (truncated above), module-level regexes and the same pair of lxml
# Cleaners for title sanitizing, a complete remove_latex_math_dollars
# (strips $...$ delimiters via latexmath_re), and the start of
# unescape_latex, whose body is truncated below.  Left byte-identical;
# only this note added.
nb_upper += 1 elif title[i].islower(): nb_lower += 1 if nb_upper > nb_lower: title = titlecase(title) return title ## HTML sanitizing for the title overescaped_re = re.compile(r'&#(\d+);') unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])') whitespace_re = re.compile(r'\s+') html_cleaner = Cleaner() html_cleaner.allow_tags = ['sub','sup','b','span'] html_cleaner.remove_unknown_tags = False html_killer = Cleaner() html_killer.allow_tags = ['div'] html_killer.remove_unknown_tags = False latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$') def remove_latex_math_dollars(string): return latexmath_re.sub(r'\1', string) latex_command_re = re.compile(r'(\\([a-zA-Z]+|[.=\'"])({[^}]*})*)') def unescape_latex(s): def conditional_replace(fragment): rep = unicode_tex.tex_to_unicode_map.get(fragment.group(0)) return rep if rep is not None else fragment.group(0)
# Python 2 script fragment (note the ``print`` statement): validates three
# CLI args, walks READ_PATH, and strips all HTML from each file using an
# lxml Cleaner (allow_tags=['']) followed by a regex pass that removes any
# leftover tags and runs of whitespace, writing results to WRITE_PATH as
# one combined file or one file per input.  The tail of the loop body is
# truncated below.  Left byte-identical; only this note added.
if len(sys.argv) != 4: print "EXEC: python " + WRITE_FILE_NAME + ".py 'flag_separate_files' 'read_path' 'write_path'" sys.exit() TRUTH = ["true", "TRUE", "1"] FLAG_SEPARATE_FILES = sys.argv[1] in TRUTH READ_PATH = sys.argv[2] WRITE_PATH = sys.argv[3] if not os.path.exists(WRITE_PATH): os.makedirs(WRITE_PATH) cleaner = Cleaner() cleaner.javascript = True cleaner.style = True cleaner.allow_tags = [''] cleaner.remove_unknown_tags = False i = 0 if not FLAG_SEPARATE_FILES: writer = open(WRITE_PATH + "/file.txt", 'w') for path, dirs, files in os.walk(READ_PATH): for filename in files: fullpath = os.path.join(path, filename) with open(fullpath, 'r') as f: i += 1 if FLAG_SEPARATE_FILES: writer = open(WRITE_PATH + "/file" + str(i) + ".txt", 'w') data = f.read() massive_clean = cleaner.clean_html(data) remaining_tags = re.sub('<[^>]*>', '', massive_clean) spaces = re.sub('\s{2,}', '', remaining_tags)
# Python 2 web-handler fragment (note ``except urllib2.URLError, e:``),
# truncated at both ends: after checking a captcha-verification response,
# it sanitizes the submitted name/phone/email/message fields with an lxml
# Cleaner and builds an outgoing email (mail.EmailMessage) from the
# escaped, tag-stripped values.
# NOTE(review): cleaner.allow_tags = None combined with the default
# remove_unknown_tags presumably leaves lxml's default tag whitelist in
# effect — confirm against the lxml.html.clean docs.  Left byte-identical;
# only this note added.
#print("cap response is " + str(the_page)); #print(the_page['success']); #print("what"); if ( the_page['success'] != True): return jsonify(success=False);#return empty object except urllib2.URLError, e: return jsonify(success=False); #return empty object #so the captcha is valid. Now clean the user data cleaner = Cleaner() cleaner.javascript = True # This is True because we want to activate the javascript filter cleaner.style = True # This is True because we want to activate the styles & stylesheet filter cleaner.scripts = True cleaner.links = True cleaner.allow_tags = None name = cleaner.clean_html(name) phone = cleaner.clean_html(phone) email = cleaner.clean_html(email) message = cleaner.clean_html(message) #build the email newMess = mail.EmailMessage(); newMess.sender ="pizzaoptimization <*****@*****.**>" newMess.subject = escape(strip_tags("Website Contact for tutoring: "+ name)) newMess.to = "pizzaoptimization <*****@*****.**>" newMess.body = escape(strip_tags("Name: " + name + "\nemail: " + email + "\nphone: " + phone + "\nmessage: " + message)) #send the email