def visit(url):
    if not url.startswith(base_url):
        return
    try:
        resp = urlopen(url)
    except URLError:
        return
    page = resp.read()
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE
    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    #     if link.has_attr('href'):
    #         if link.has_attr('class') and 'history' in link['class']:
    #             continue
    #         next_link = urljoin(url, link['href'])
    #         next_link = urldefrag(next_link)[0]
    #         if next_link not in visited_pages:
    #             visited_pages.append(next_link)
    #             pages_to_visit.append(next_link)
    f = open("testing.txt", 'w')
    f.write(page)
    clean_page = cleaner.clean_html(page)
    f.write("\n\n\nVS\n\n\n")
    f.write(clean_page)
    f.close()
    soup = BeautifulSoup(clean_page, "lxml")
    return extract(soup, url)
def visit(url):
    if not url.startswith(base_url):
        return
    try:
        resp = urlopen(url)
    except URLError:
        return
    page = resp.read()
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE
    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    #     if link.has_attr('href'):
    #         if link.has_attr('class') and 'history' in link['class']:
    #             continue
    #         next_link = urljoin(url, link['href'])
    #         next_link = urldefrag(next_link)[0]
    #         if next_link not in visited_pages:
    #             visited_pages.append(next_link)
    #             pages_to_visit.append(next_link)
    clean_page = cleaner.clean_html(page)
    soup = BeautifulSoup(clean_page, "lxml")
    extract(soup, url)
def extract_text(self, url):
    try:
        if url.value.startswith('http') and '://' in url.value:
            prog = FloatProgress(min=0, max=100, description='Progress')
            display(widgets.HTML('<br/>'), prog)
            tr0 = time()
            site = self.browser.get(url.value, timeout=10)
            if site.ok:
                prog.value += 50
                tr1 = time() - tr0
                t0 = time()
                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True
                cleaner.kill_tags = ['header', 'footer']
                source_tree = etree.HTML(cleaner.clean_html(site.content))
                text = source_tree.itertext()
                t1 = time() - t0
                self.text = '\n'.join([n.strip() for n in text if n.strip()])
                prog.value += 50
                self.keywords_and_display(prog)
            else:
                display(widgets.HTML(
                    '<div style="font-size: 1.5em; margin-top:1em; margin-bottom:1em">404 - bad URL</div>'
                ))
        else:
            self.text = url.value
            self.keywords_and_display(False)
    except Exception as e:
        print('Error extracting text: %s' % e)
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # "ValueError: Unicode strings with encoding declaration are not
        # supported. Please use bytes input or XML fragments"
        content = u""
    return content
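# A hedged sketch (not from the original snippet) of the ValueError that
# cleanpage() guards against: lxml rejects *str* input carrying an encoding
# declaration, while the same document passed as bytes parses fine.
sample = '<?xml version="1.0" encoding="utf-8"?><html><body><p>hi</p></body></html>'
assert cleanpage(sample) == u""                     # str + declaration -> except branch
assert b'hi' in cleanpage(sample.encode('utf-8'))   # bytes input is cleaned normally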
def remove_script_and_style(html_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['script']
    clean_html = cleaner.clean_html(html_content)
    return clean_html
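# A minimal usage sketch for remove_script_and_style() above; the sample
# markup is invented for illustration. Note the default Cleaner already
# drops <script> (scripts=True), so kill_tags=['script'] is belt-and-braces.
sample = '<html><head><style>p {color: red}</style></head>' \
         '<body><script>alert(1)</script><p>kept</p></body></html>'
cleaned = remove_script_and_style(sample)
assert '<script>' not in cleaned and '<style>' not in cleaned
assert 'kept' in cleaned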
def create_plaintext_message(message):
    """ Create a clean plain-text version of an email message

        Parse the html, remove style and javascript tags, and then
        create a plain-text message by parsing the html and attaching
        links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    cleaner.remove_unknown_tags = False  # allow_tags conflicts with the default remove_unknown_tags
    cleaner.allow_tags = POSITIVE_K
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        html = lxml.html.fromstring(html, base_url=url)
        tree = cleaner.clean_html(html)
        # tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ print title
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces, "", self.doc)
    return doc
def init_cleaner():
    from lxml.html.clean import Cleaner
    cleaner = Cleaner()
    cleaner.javascript = False
    cleaner.style = False
    cleaner.kill_tags = ["pre", "code"]
    return cleaner
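# Hedged demo of init_cleaner() above (invented input): kill_tags drops the
# <pre>/<code> subtrees together with their text, leaving only the prose.
cleaner = init_cleaner()
cleaned = cleaner.clean_html('<div><pre>x = 1</pre><p>prose</p></div>')
assert 'x = 1' not in cleaned and 'prose' in cleaned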
def clearTag_old(self, text: str) -> str:
    import lxml
    from lxml.html.clean import Cleaner
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]
    cleaner.remove_tags = [
        "strong", "div", "body", "br", "a", "p", "blockquote",
        "h3", "ol", "li", "font",
    ]
    # clean_html() returns an element here, so serialize before decoding
    return lxml.html.tostring(
        cleaner.clean_html(lxml.html.document_fromstring(text))).decode("utf-8")
def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a',
                           'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript',
                         'style', 'meta', 'label', 'li', 'ul', 'ol', 'sup',
                         'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
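# Hedged illustration (sample HTML invented) of the split used by
# get_cleaner(): remove_tags unwraps an element but keeps its text, while
# kill_tags deletes the element together with everything inside it.
cleaner = get_cleaner()
sample = '<html><body><div><p>text stays</p>' \
         '<table><tr><td>gone</td></tr></table></div></body></html>'
cleaned = cleaner.clean_html(sample)
assert 'text stays' in cleaned  # <div>/<p> unwrapped by remove_tags
assert 'gone' not in cleaned    # <table> subtree killed by kill_tags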
def remove_tags(html_str, tags):
    content_hash = md5(html_str.encode('utf-8')).hexdigest()
    wrapper_class = f'remove-tags-wrapper-{content_hash}'
    html_str = f'<div class="{wrapper_class}">{html_str}</div>'
    tree = html.document_fromstring(html_str)
    cleaner = Cleaner()
    cleaner.kill_tags = tags.split()
    tree = cleaner.clean_html(tree)
    tree = tree.find_class(wrapper_class)[0]
    return mark_safe(html.tostring(tree).decode('utf-8'))
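# A usage sketch for remove_tags() above, assuming the Django context the
# helper implies (mark_safe). The tags to kill arrive as one
# whitespace-separated string, mirroring cleaner.kill_tags = tags.split().
snippet = '<p>keep me</p><script>bad()</script><iframe src="x"></iframe>'
cleaned = remove_tags(snippet, 'script iframe')
assert 'keep me' in cleaned and 'bad()' not in cleaned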
def clean_file(file):
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.kill_tags = ['head', 'img', 'iframe', 'nav', 'svg', 'figure', 'map']
    file = cleaner.clean_html(file)
    file = file.split()
    file = " ".join(file)
    # print(file)
    return file
def get_text(session, url, title, dir):
    r = session.get(url, stream=True)
    doc = lxml.html.fromstring(r.text)
    sidebar = doc.find_class('course-sidebar')[0]
    sidebar.getparent().remove(sidebar)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.meta = True
    cleaner.kill_tags = ['header']
    cleantext = lxml.html.tostring(cleaner.clean_html(doc))
    filename = os.path.join(dir, title + '.html')
    with open(filename, 'w') as fout:
        print('Downloading [T] ' + title + ' ...')
        fout.write(cleantext)
def clean(self: T) -> str:
    cleaner = Cleaner()
    cleaner.style = self.__style
    cleaner.links = self.__links
    cleaner.page_structure = self.__page_structure
    cleaner.safe_attrs_only = self.__safe_attrs_only

    # allow_tags and remove_unknown_tags can't work together
    if self.__allow_tags is not None:
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = self.__allow_tags
    if self.__kill_tags is not None:
        cleaner.kill_tags = self.__kill_tags
    if self.__remove_tags is not None:
        cleaner.remove_tags = self.__remove_tags
    if self.__safe_attrs is not None:
        cleaner.safe_attrs = self.__safe_attrs

    self.__input = cleaner.clean_html(self.__input)
    return self.__input
def get_url(self):
    """Get the relevant part of a web page."""
    get_url = requests.get(self.data_path)
    page_data = get_url.content
    cleaner = Cleaner()
    cleaner.javascript = True         # Remove JavaScript code from HTML.
    cleaner.scripts = True            # Remove other code from HTML.
    cleaner.style = True              # Remove CSS and styles from HTML.
    cleaner.links = True              # Remove Links from HTML.
    cleaner.kill_tags = ['a', 'img']  # Remove these tags.
    # Store the cleaned up HTML.
    page_html = cleaner.clean_html(page_data)
    # Strip tags from final results.
    strip_tags = TagStripper()        # Instantiate the HTML Tag Stripper.
    strip_tags.feed(page_html)        # Strip all HTML tags.
    return strip_tags.get_html_data()
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    # cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
def get_url(self):
    """Get the HTML body of a web page."""
    # Create file-like object.
    outfile = StringIO.StringIO()
    cleaner = Cleaner()
    cleaner.javascript = True               # Remove JavaScript code from HTML.
    cleaner.scripts = True                  # Remove other code from HTML.
    cleaner.style = True                    # Remove CSS and styles from HTML.
    cleaner.links = True                    # Remove Links from HTML.
    cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags.
    # Store the cleaned up HTML.
    page_html = lxml.html.tostring(
        cleaner.clean_html(
            lxml.html.parse(self.data_path)
        )
    )
    outfile.write(page_html)  # Write the results to this file in memory.
    return outfile
import os
import nltk
import codecs
import sys
from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
import re
from cStringIO import StringIO
import unicodedata

reload(sys)
sys.setdefaultencoding('utf8')

cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True
cleaner.kill_tags = ['a', 'img', 'href']
cleaner.remove_tags = ['div', 'span', 'li']

directory1 = "C:\\Users\\Satanu\\html_test\\"
directory2 = "C:\\Users\\Satanu\\text\\"
for filename in os.listdir(directory1):
    to_write = []
    html = codecs.open(directory1 + filename, 'r', 'utf-8')
    raw = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(directory1 + filename)))
    name = filename.strip('html')
    text = codecs.open(directory2 + filename, 'w', 'utf-8')
    text.write(raw)
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins',
    'meta', 'small', 'sub', 'sup', 'wbr'
]  # 'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form',
    'head', 'iframe', 'img', 'label', 'link', 'map', 'math', 'nav',
    'noscript', 'object', 'picture', 'style', 'svg', 'time', 'video'
]  # 'area', 'table' # 'header'

# validation
TEI_VALID_TAGS = set(
    ['code', 'del', 'div', 'head', 'hi', 'item', 'lb', 'list', 'p', 'quote'])
TEI_VALID_ATTRS = set(['rendition'])

# counters
tokens_posts = 0
tokens_comments = 0
lrutest = LRU(LRU_SIZE)

# justext
JUSTEXT_STOPLIST = justext.get_stoplist('German')
cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture',
    'table', 'svg', 'video'
]  # 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try:
        dateobject = datetime.datetime.strptime(datestring, outputformat)
    except ValueError:
        return False
    # basic year validation
    year = int(datetime.date.strftime(dateobject, '%Y'))
    if MIN_YEAR <= year <= MAX_YEAR:
        # not newer than today
CLEANER = Cleaner()
CLEANER.comments = False
CLEANER.embedded = True
CLEANER.forms = False
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.links = False
CLEANER.meta = False
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.style = True
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture',
    'rdf', 'svg', 'video'
]  # 'embed', 'figure', 'img', 'table'

## REGEX cache
JSON_PATTERN = re.compile(
    r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})')
# use of regex module for speed
GERMAN_PATTERN = regex.compile(
    r'(?:Datum|Stand): ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})')
TIMESTAMP_PATTERN = regex.compile(
    r'([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}'
)


#@profile
def examine_date_elements(tree, expression, outputformat, extensive_search,
        )
    )
    html_out.getroottree().write(file="summarized-roanoke.html", method="html")


if __name__ == "__main__":
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.meta = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.style = True
    cleaner.kill_tags = ["cite", "sup", "img", "noscript", "label", "video"]

    url = "https://en.wikipedia.org/wiki/Roanoke_Colony"
    doc = urllib2.urlopen(url)
    tree = lxml.html.parse(doc)
    title = tree.find(".//title").text
    tree = cleaner.clean_html(tree)
    netloc = urlparse(url).netloc
    if netloc == "en.wikipedia.org":
        parse_wiki(tree, title)
    elif netloc == "cnn.com":
        parse_cnn(tree, title)
    else:
receivers = [(name.strip(), mail.strip())
             for name, mail in rows if name and mail]

# Load template
with open(config.template_path) as f:
    template = f.read()

# Inline styles
import premailer
template = premailer.transform(template)

# Clean HTML
import lxml.html
from lxml.html.clean import Cleaner
cleaner = Cleaner()
cleaner.kill_tags = ['style', 'script']
page = cleaner.clean_html(lxml.html.fromstring(template))
assert not page.xpath('//style'), 'style'
assert not page.xpath('//script'), 'script'
template = lxml.html.tostring(page).decode('utf-8')

# Send mails
sender = Mailer('smtp.yandex.com.tr', port='465', use_ssl=True)
sender.login(config.user_mail, getpass('Password: '))
for receiver_name, receiver_mail in receivers:
    try:
        message = Message(From=config.user_mail,
                          To=receiver_mail,
                          charset="utf-8")
        attachment_path = glob(
import lxml
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True       # activate the styles & stylesheet filter
cleaner.kill_tags = ['head', 'script', 'header', 'href', 'footer']

print(lxml.html.tostring(cleaner.clean_html(
    lxml.html.parse('/home/caiocesare/PycharmProjects/script/1.html'))))
import re

import lxml.html
from bs4 import BeautifulSoup
from lxml.html.clean import Cleaner
from lxml.etree import XMLSyntaxError

from store_helper import StoreHelper
from text_helper import TextHelper

cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True       # activate the styles & stylesheet filter
cleaner.inline_style = True
cleaner.whitelist_tags = set([])
cleaner.remove_tags = [
    'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'div',
    'h1', 'h2', 'h3', 'h4', 'h5', 'span'
]
cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label']


class HTMLHelper(object):
    @staticmethod
    def remove_tag(web_source):
        text = re.sub(r'<[^>]+>', '', web_source)
        return text

    @staticmethod
    def get_text(web_source):
        try:
            _html = lxml.html.document_fromstring(web_source)
        except XMLSyntaxError:
            print("Exception when convert web source to html document")
            return web_source
parser.add_argument("-f","--file", help="metalink article name",required=True) args = parser.parse_args() filename = args.file # file check if not os.path.isfile(filename) and not os.access(sys.argv[1], os.R_OK): print "WARNING - Couldn't find specified file!" sys.exit(1) elif not os.path.exists('original'): print 'Creating original directory for backups...' os.makedirs('original') # cleaner cleaner = Cleaner(page_structure=False) cleaner.remove_tags = ["span"] cleaner.kill_tags = ["script","img","style"] # original file conversion original = codecs.open(filename,"r","cp866") for line in original: line = re.sub(r"[^\x00-\x7F]+","",line) #if " " in line: #line = re.sub(r" ", "", line) if "®" in line: line = line.replace("®","") number = re.search(r"<span style=\"display:none\">\d+</span>", line) if number: line = re.sub(r"<span style=\"display:none\">\d+</span>", "", line) footer = re.search(r"Didn't find what you are looking for\?", line) if footer:
def get_context_data(self, **kwargs):
    headers = cache.get(self.object.id, version="email-header")

    if headers is None:
        headers = models.Header.objects.filter(part__email=self.object, part__parent=None)
        headers = headers.get_many("Subject", "From")

    email_dict = {}
    email_dict["subject"] = headers.get("Subject", "(No subject)")
    email_dict["from"] = headers["From"]
    email_dict["date"] = self.object.received_date
    email_dict["inbox"] = self.object.inbox
    email_dict["eid"] = self.object.eid

    # iterate over MIME parts
    html = None
    plain = None
    attachments = []
    for part in self.object.parts.all():
        part_head = part.header_set.get_many("Content-Type", "Content-Disposition")
        part_head["content_type"] = part_head.pop("Content-Type", "").split(";", 1)
        dispos = part_head.pop("Content-Disposition", "")

        if part_head["content_type"][0].startswith("multipart") or \
                part_head["content_type"][0].startswith("message"):
            continue

        try:
            params = dict(HEADER_PARAMS.findall(part_head["content_type"][1]))
        except IndexError:
            params = {}
        params.update(dict(HEADER_PARAMS.findall(dispos)))

        # find filename, could be anywhere
        if "filename" in params:
            part_head["filename"] = params["filename"]
        elif "name" in params:
            part_head["filename"] = params["name"]
        else:
            part_head["filename"] = ""

        # grab charset
        part.charset = params.get("charset", "utf-8")

        if html is None and part_head["content_type"][0] == "text/html":
            html = part
        elif plain is None and part_head["content_type"][0] == "text/plain":
            plain = part

        attachments.append((part, part_head))

    # set raw body
    plain_message = self.find_body(html, plain)
    if plain_message is None:
        if len(attachments) == 1:
            email_dict["body"] = str(attachments[0][0].body.data)
            email_dict["charset"] = attachments[0][0].charset
        else:
            email_dict["body"] = ""
            email_dict["charset"] = "utf-8"
        plain_message = True
    elif plain_message:
        email_dict["body"] = str(plain.body.data)
        email_dict["charset"] = plain.charset
    else:
        email_dict["body"] = str(html.body.data)
        email_dict["charset"] = html.charset

    if not plain_message:
        # Mail Pile uses this, give back if you come up with something better
        cleaner = Cleaner(
            page_structure=True,
            meta=True,
            links=True,
            javascript=True,
            scripts=True,
            frames=True,
            embedded=True,
            safe_attrs_only=True,
        )
        cleaner.kill_tags = ["style", "base"]  # remove style tags, not attrs

        try:
            email_dict["body"] = Premailer(email_dict["body"]).transform()
        except Exception:
            # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything
            messages.warning(
                self.request,
                _("Part of this message could not be parsed - it may not display correctly")
            )

        try:
            email_dict["body"] = cleaner.clean_html(email_dict["body"])
        except (etree.LxmlError, ValueError):
            if plain is not None and len(plain.body.data) > 0:
                email_dict["body"] = str(plain.body.data)
                email_dict["charset"] = plain.charset
            else:
                email_dict["body"] = ""
                email_dict["charset"] = "utf-8"
            plain_message = True
            messages.error(self.request, _("This email contained invalid HTML and could not be displayed"))

    self.headline = email_dict["subject"]

    # GET params for users with `ask_image` set in their profile
    if plain_message:
        # bypass image scrubber
        img_display = True
        ask_images = False
    elif "imgDisplay" in self.request.GET and int(self.request.GET["imgDisplay"]) == 1:
        img_display = True
        ask_images = False
    elif self.request.user.userprofile.flags.ask_images:
        img_display = False
        ask_images = True
    else:
        img_display = self.request.user.userprofile.flags.display_images
        ask_images = False

    # filter images if we need to
    if not img_display:
        try:
            tree = lxml_html.fromstring(email_dict["body"])
            for img in tree.findall(".//img"):
                try:
                    del img.attrib["src"]
                except KeyError:
                    pass
            email_dict["body"] = etree.tostring(tree)
        except (etree.LxmlError, ValueError):
            if plain is not None and len(plain.body.data) > 0:
                email_dict["body"] = str(plain.body.data)
                email_dict["charset"] = plain.charset
            else:
                email_dict["body"] = ""
                email_dict["charset"] = "utf-8"

    # convert to unicode as late as possible
    email_dict["body"] = unicode(email_dict["body"], email_dict["charset"], errors="replace")

    context = super(EmailView, self).get_context_data(**kwargs)
    context.update({
        "email": email_dict,
        "plain_message": plain_message,
        "attachments": attachments,
        "ask_images": ask_images,
    })
    return context
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    if include_tables is False:
        MANUALLY_CLEANED.append('table')
    if include_images is False:
        # Many websites have <img> inside <figure> or <picture> or <source> tag
        MANUALLY_CLEANED.extend(['figure', 'picture', 'source'])
        MANUALLY_STRIPPED.append('img')
    for expression in MANUALLY_CLEANED:
        for element in tree.getiterator(expression):
            try:
                element.drop_tree()
            except AttributeError:
telefono = "".join(links[1].text_content().split()) fax = "".join(links[2].text_content().split()) if len(links[3].cssselect("a")[0].attrib['href'])> len('http://'): web = links[3].cssselect("a")[0].attrib['href'] else: web = "" return direccion, telefono, fax, web cleaner = Cleaner() cleaner.kill_tags = ['strong'] for i in range(1,45): base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag='+str(i) html = scraperwiki.scrape(base_url) root = lxml.html.fromstring(html) links = root.cssselect("ul#listado-productos li") for link in links: record = {} name = link.cssselect("a")[0].text_content() card_link = link.cssselect("a")[0].attrib['href'] address = link.cssselect("p")[0].text_content()
# earliest possible year to take into account (inclusive)
MIN_DATE = datetime.date(1995, 1, 1)
MIN_YEAR = MIN_DATE.year
# latest possible date
LATEST_POSSIBLE = datetime.date.today()
# latest possible year
MAX_YEAR = LATEST_POSSIBLE.year
# set an upper limit to the number of candidates
MAX_POSSIBLE_CANDIDATES = 150

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.comments = False
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = False
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = True
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = True
HTML_CLEANER.kill_tags = [
    'applet', 'audio', 'canvas', 'datalist', 'embed', 'figure', 'label',
    'map', 'math', 'object', 'picture', 'rdf', 'svg', 'video'
]
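# Hedged sketch (input invented): a configured Cleaner instance can also be
# called on a parsed tree to clean it in place, which is how a module-level
# HTML_CLEANER like the one above is typically applied.
from lxml import html as lxml_html
tree = lxml_html.fromstring('<html><body><p>text</p><svg>icon</svg></body></html>')
HTML_CLEANER(tree)  # in place: kill_tags entries such as 'svg' are removed
assert tree.find('.//svg') is None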
from lxml.html import fragments_fromstring
from PIL import Image, ImageDraw, ImageFont
from rs_mailer import EmailSender
import base64
import tempfile
import requests
from readability import Document
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.remove_tags = ['div', 'span']
cleaner.kill_tags = ['svg']

# This script will create an opf version of The Guardian (or The
# Observer on Sunday) suitable for turning into a .mobi file for
# copying to your Kindle.

blacklisted_section_names = ['pictures']
get_paper_articles = False
email_send = False
sleep_seconds_after_api_call = 2

# Check the path of the directory where this script is located
# to read keys and config files
# (Ignore symbolic links)
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False  # True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False  # True
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = ['a', 'abbr', 'acronym', 'address', 'big', 'cite',
#     'dd', 'font', 'ins', 'meta', 'span', 'small', 'sub', 'sup', 'wbr']
#     # 'center', 'table', 'tbody', 'td', 'th', 'tr',
HTML_CLEANER.remove_tags = ['img']
HTML_CLEANER.kill_tags = ['aside', 'del']  # 'area', 'table' # 'header'

CUT_EMPTY_ELEMS = {
    'article', 'b', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
    'li', 'main', 'p', 'section', 'span', 'strong', 'td'
}  # 'meta',

MANUALLY_CLEANED = [
    'audio', 'blink', 'button', 'canvas', 'embed', 'figure', 'footer',
    'form', 'head', 'iframe', 'input', 'link', 'map', 'marquee', 'math',
    'nav', 'noscript', 'object', 'picture', 'script', 'style', 'svg',
    'time', 'video'
]  # 'frame' 'frameset' 'source', 'img',
telefono = "".join(links[1].text_content().split()) fax = "".join(links[2].text_content().split()) if len(links[3].cssselect("a")[0].attrib['href']) > len('http://'): web = links[3].cssselect("a")[0].attrib['href'] else: web = "" return direccion, telefono, fax, web cleaner = Cleaner() cleaner.kill_tags = ['strong'] for i in range(1, 45): base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag=' + str( i) html = scraperwiki.scrape(base_url) root = lxml.html.fromstring(html) links = root.cssselect("ul#listado-productos li") for link in links: record = {} name = link.cssselect("a")[0].text_content() card_link = link.cssselect("a")[0].attrib['href']
MIN_FILE_SIZE = 10

# Plausible dates
# earliest possible year to take into account (inclusive)
MIN_YEAR = 1995
# latest possible date
LATEST_POSSIBLE = datetime.date.today()
# latest possible year
MAX_YEAR = datetime.date.today().year

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.comments = False
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = False
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = True
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = True
HTML_CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture',
    'rdf', 'svg', 'video'
]  # 'embed', 'figure', 'img', 'table'