def create_plaintext_message(message):
    """Create a clean plain-text version of an email message.

    Parse the HTML and remove style and javascript tags, then build a
    plain-text message by re-parsing the cleaned HTML and attaching the
    links as endnotes.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # Append the anchor list at the bottom of the message
    # to keep the body readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
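# A minimal usage sketch for create_plaintext_message (Python 2), assuming
# the imports the function relies on are in scope: lxml.html, Cleaner from
# lxml.html.clean, cStringIO, formatter, and HTMLParser from htmllib. The
# sample HTML is made up for illustration.
def demo_create_plaintext_message():
    html = ('<html><body><p>See <a href="https://example.com/docs">'
            'the docs</a>.</p><script>alert(1);</script></body></html>')
    # The <script> block is stripped by Cleaner; the link is appended as
    # endnote "[1] http://example.com/docs" below a 72-column rule.
    print create_plaintext_message(html)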
def test(args=None):
    import sys
    import formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        fn = args[0]
    else:
        fn = 'test.html'
    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'rt') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ":", msg)
            sys.exit(1)
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
def OpenURL(self, url):
    from htmllib import HTMLParser
    import formatter
    self.url = url
    m = re.match(r'http://([^/]+)(/\S*)\s*', url)
    if m:
        host = m.groups()[0]
        path = m.groups()[1]
    else:
        m = re.match(r'http://(\S+)\s*', url)
        if not m:
            # Invalid URL
            self.logprint("Invalid or unsupported URL: %s" % (url))
            return
        host = m.groups()[0]
        path = ''
    f = self.RetrieveAsFile(host, path)
    if not f:
        self.logprint("Could not open %s" % (url))
        return
    self.logprint("Receiving data...")
    data = f.read()
    tmp = open('hangman_dict.txt', 'w')
    fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
    p = HTMLParser(fmt)
    self.logprint("Parsing data...")
    p.feed(data)
    p.close()
    tmp.close()
def test(args=None):
    import sys, formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        file = args[0]
    else:
        file = 'test.html'
    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError as msg:
            print file, ':', msg
            sys.exit(1)
    data = f.read()
    if f is not sys.stdin:
        f.close()
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
def extractFromHTML(html, blur=5):
    """Extract text from HTML content."""
    html = unicode(html)
    assert isinstance(html, unicode)

    # Create an in-memory file.
    file = StringIO()

    # Convert HTML to text.
    f = formatter.AbstractFormatter(formatter.DumbWriter(file))
    p = TextExtractor()
    p.pathBlur = blur
    p.feed(html)
    p.close()
    text = p.get_plaintext()

    # Remove stand-alone punctuation.
    text = re.sub(r"\s[\(\),;\.\?\!](?=\s)", " ", text).strip()

    # Compress whitespace.
    text = re.sub(r"[\n\s]+", " ", text).strip()

    # Remove consecutive dashes.
    text = re.sub(r"\-{2,}", "", text).strip()

    # Remove consecutive periods.
    text = re.sub(r"\.{2,}", "", text).strip()

    return text
def create_html_mail(subject, html, text=None, from_addr=None, to_addr=None,
                     headers=None, encoding='UTF-8'):
    """Create a MIME message that will render as HTML in popular MUAs,
    and as text in better ones.
    """
    # Use DumbWriter's word wrapping to ensure that no text line
    # is longer than plain_text_maxcols characters.
    plain_text_maxcols = 72

    html = html.encode(encoding)
    if text is None:
        # Produce an approximate textual rendering of the HTML string,
        # unless a better version has been given as an argument.
        textout = StringIO.StringIO()
        formtext = formatter.AbstractFormatter(
            formatter.DumbWriter(textout, plain_text_maxcols))
        parser = htmllib.HTMLParser(formtext)
        parser.feed(html)
        parser.close()

        # Append the anchor list at the bottom of the message
        # to keep the body readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            anchorlist += "[%d] %s\n" % (counter, item)

        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
    else:
        text = text.encode(encoding)

    # If we would like to include images in the future, this should
    # probably be 'related' instead of 'mixed'.
    msg = MIMEMultipart('mixed')
    # maybe later :)  msg['From'] = Header("%s <%s>" %
    #                 (send_from_name, send_from), encoding)
    msg['Subject'] = Header(subject, encoding)
    msg['From'] = from_addr
    msg['To'] = to_addr
    msg['Date'] = formatdate(localtime=True)
    msg["Message-ID"] = email.Utils.make_msgid()
    if headers:
        for key, value in headers.items():
            msg[key] = value
    msg.preamble = 'This is a multi-part message in MIME format.'

    alternatives = MIMEMultipart('alternative')
    msg.attach(alternatives)
    alternatives.attach(MIMEText(text, 'plain', _charset=encoding))
    alternatives.attach(MIMEText(html, 'html', _charset=encoding))

    return msg
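# A hedged usage sketch for create_html_mail, assuming the module-level
# imports it depends on (StringIO, htmllib, formatter, Header, formatdate,
# email.Utils, MIMEMultipart, MIMEText) plus smtplib; the addresses and
# SMTP server are illustrative only.
import smtplib

msg = create_html_mail(
    'Monthly report',
    u'<html><body><h1>Report</h1>'
    u'<a href="http://example.com/full">full version</a></body></html>',
    from_addr='sender@example.com',
    to_addr='recipient@example.com')
server = smtplib.SMTP('localhost')
server.sendmail('sender@example.com', ['recipient@example.com'],
                msg.as_string())
server.quit()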
def parse(self):
    # We're using the parser just to get the HREFs.
    # We should also use it to e.g. respect <META NOFOLLOW>.
    w = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(w)
    self.parser = htmllib.HTMLParser(f)
    self.parser.feed(self.body)
    self.parser.close()
def html2txt(htmlblock):
    import htmllib, formatter, StringIO
    s = StringIO.StringIO('')
    w = formatter.DumbWriter(s)
    f = formatter.AbstractFormatter(w)
    p = htmllib.HTMLParser(f)
    p.feed(htmlblock)
    return s.getvalue().strip()
def find_links(html):
    """Return a list of the links in the HTML."""
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
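# A short usage sketch for find_links, under the same Python 2 assumptions
# as the function itself (htmllib, formatter, StringIO imported); the hrefs
# are made up.
links = find_links('<a href="http://example.com/a">a</a>'
                   '<a href="http://example.com/b">b</a>')
# htmllib.HTMLParser records every <a href=...> in anchorlist, so this
# prints ['http://example.com/a', 'http://example.com/b'].
print links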
def end_h4(self):
    if self.h4_title:
        f = StringIO.StringIO("")
        self.help_strings[self.h4_title] = f
        self.formatter.writer = formatter.DumbWriter(f, 800)
    self.h4 = False
def get_data(self):
    '''
    Download data from the Weather Underground website for a given
    station id, a start year, and an end year. The html file is parsed
    and written as csv to a separate txt file for each day.

    [singleprocessing code, deprecated]
    '''
    logger.info('Download data for stationid: ' + self.stationid +
                ' [start]')
    for td in utils.progressbar(range(0, (self.enddate -
                                          self.startdate).days + 1),
                                "Downloading: ", 60):
        # increase the date by 1 day for the next download
        current_date = self.startdate + timedelta(days=td)
        # set download url
        url = 'http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=' + \
            self.stationid + '&day=' + str(current_date.day) + '&year=' + \
            str(current_date.year) + '&month=' + \
            str(current_date.month) + '&format=1'
        # define outputfile
        outputfile = self.stationid + '_' + str(current_date.year) \
            + str(current_date.month).zfill(2) + \
            str(current_date.day).zfill(2) + '.txt'
        # check if we want to keep previously downloaded files
        if self.keep:
            if os.path.exists(os.path.join(self.outputdir, outputfile)):
                # check that the filesize is not null
                if os.path.getsize(os.path.join(self.outputdir,
                                                outputfile)) > 0:
                    # file exists and is not null, continue next iteration
                    continue
                else:
                    # file exists but is null, so remove and redownload
                    os.remove(os.path.join(self.outputdir, outputfile))
        elif os.path.exists(os.path.join(self.outputdir, outputfile)):
            os.remove(os.path.join(self.outputdir, outputfile))
        # open outputfile
        with open(os.path.join(self.outputdir, outputfile), 'wb') as outfile:
            # open and read the url
            handler = urllib2.urlopen(url)
            content = handler.read()
            # convert spaces to non-breaking spaces so the HTML parser
            # preserves them
            content = content.replace(' ', '&nbsp;')
            # Remove all the HTML tags from the file
            outstream = cStringIO.StringIO()
            parser = htmllib.HTMLParser(
                formatter.AbstractFormatter(
                    formatter.DumbWriter(outstream)))
            parser.feed(content)
            # convert non-breaking spaces back to regular whitespace (' ')
            content = outstream.getvalue().replace('\xa0', ' ')
            # write output
            outfile.write(content)
            # close handler and outstream
            outstream.close()
            handler.close()
    logger.info('Download data for stationid: ' + self.stationid +
                ' [completed]')
def find_links(html):
    """Return a list of links in html."""
    # We're using the parser just to get the HREFs.
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
def get_urls(url):
    data = urllib.urlopen(url).read()
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    url_list = parser.anchorlist
    return url_list
def textFromHtml(self, html):
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(formatter.DumbWriter(textout))
    parser = htmllib.HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    text = textout.getvalue()
    del textout, formtext, parser
    return text
def __init__(self, prn=0):
    if prn:
        format = formatter.AbstractFormatter(formatter.DumbWriter())
    else:
        format = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, format)
    self.depth = 0
    self.stack = []
def parse_links(self):
    'Parse out the links found in the downloaded HTML file'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def start_a(self, attrs):
    dattrs = dict(attrs)
    if "name" in dattrs:
        h4_title = dattrs["name"]
        f = StringIO.StringIO("")
        self.help_strings[h4_title] = f
        self.formatter.writer = formatter.DumbWriter(f, 800)
def parse_links(self):
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def decodeHTMLCharacterEntities(self, code):
    outputstring = StringIO.StringIO()
    w = formatter.DumbWriter(outputstring, maxcol=9999999)  # plain text
    f = formatter.AbstractFormatter(w)
    p = htmllib.HTMLParser(f)
    p.feed(code)
    p.close()
    return outputstring.getvalue()
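# A rough usage sketch (hypothetical instance `obj` of the class that
# defines the method above): htmllib resolves both named and numeric
# character references, and maxcol=9999999 effectively disables
# DumbWriter's word wrapping, so the decoded text comes back unwrapped.
#
#     decoded = obj.decodeHTMLCharacterEntities("Tom &amp; Jerry &#62; Spike")
#     # decoded is roughly "Tom & Jerry > Spike"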
def parse_link(self):
    'Parse out the links'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def parse_links(self):
    # Parse the links out of the freshly downloaded page.
    f = open(self.file, 'r')  # read the downloaded page
    data = f.read()
    f.close()
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(   # AbstractFormatter processes the data
            formatter.DumbWriter(cStringIO.StringIO()))
    )  # DumbWriter emits the output; cStringIO keeps it off stdout
       # (writing to a file would be even better)
    parser.feed(data)
    parser.close()
    return parser.anchorlist  # return the parsed list of anchors
def urlparser(url):
    """Fetch `url` and print its text content to stdout (Python 3).

    html.parser.HTMLParser does not accept a formatter object the way
    htmllib.HTMLParser did, so a small subclass extracts the text instead.
    """
    import sys
    import urllib.request
    from html.parser import HTMLParser

    class TextPrinter(HTMLParser):
        def handle_data(self, data):
            sys.stdout.write(data)

    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8', 'replace')
    ptext = TextPrinter()
    ptext.feed(data)
    ptext.close()
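# A one-line usage sketch for the Python 3 urlparser above; the URL is
# illustrative only.
if __name__ == '__main__':
    urlparser('http://example.com/')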
def parse_links(self):
    # Parse the links out of the freshly downloaded page.
    f = codecs.open(self.file, 'rb', 'utf-8')  # read the downloaded page
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(   # AbstractFormatter processes the data
            formatter.DumbWriter(io.StringIO()))
    )  # DumbWriter emits the output; StringIO keeps it off stdout
       # (writing to a file would be even better)
    parser.feed(data)
    parser.close()
    return parser.anchorlist  # return the parsed list of anchors
def parse_links(self):
    """Fetch all links from the page."""
    f = open(self.save_file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def get_content(self, document):
    """Get the content or the text from the document.

    The document can be an HTML document or a text file.
    """
    outstream = cStringIO.StringIO()
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(outstream)))
    parser.feed(document)
    content = outstream.getvalue()
    outstream.close()
    return content
def parseDOM(gb):
    w = formatter.DumbWriter()  # plain text
    f = formatter.AbstractFormatter(w)
    for ff in os.listdir(os.path.join(hPATH)):
        command = 'python parser.py {} > {}'.format(
            shellquote(os.path.join(hPATH, ff)),
            shellquote(os.path.join(
                HOME, 'html', gb + '_parsed',
                'parsed_' + os.path.splitext(
                    os.path.basename(os.path.join(hPATH, ff)))[0] + '.txt')))
        print command
        result = os.system(command)
def pretty_print(self):
    """Print a pretty (formatted) version of the HTML content.

    If the content is not text/html then it is just printed.
    """
    if not self.headers['content-type'].lower().startswith('text/html'):
        print self.contents
    else:
        parser = htmllib.HTMLParser(
            formatter.AbstractFormatter(formatter.DumbWriter()))
        parser.feed(self.contents)
        parser.close()
def get_extra_information(self):
    text = ""
    if self.diff:
        self.diff = self.escape(self.diff)
        outstream = cStringIO.StringIO()
        p = htmllib.HTMLParser(
            formatter.AbstractFormatter(formatter.DumbWriter(outstream)))
        p.feed(self.diff)
        self.diff = outstream.getvalue()
        outstream.close()
        text = self.diff.replace("&amp;", "&")
    return text
def apply_htmlparser(html, maxcol=MAXCOL, codec='utf8'):
    """Extract text from the HTML string by passing it through a
    htmllib.HTMLParser instance (slightly modified for Unicode support).

    Adapted from http://www.bazza.com/~eaganj/weblog/2006/04/04/printing-html-as-text-in-python-with-unicode/

    @type  html: unicode
    @param html: The HTML to extract text from
        (eg. u"<html><body><h1>Hello</h1>...")
    @type  maxcol: int
    @param maxcol: The maxcol value passed to formatter.DumbWriter()
    @type  codec: str (passed to codecs.lookup())
    @param codec: The codec to use to parse the HTML.
    @rtype:  str
    @return: The text parsed from the HTML."""

    class UnicodeHTMLParser(htmllib.HTMLParser):
        """HTMLParser that can handle unicode charrefs"""

        entitydefs = dict([(k, unichr(v))
                           for k, v in htmlentitydefs.name2codepoint.items()])

        def handle_charref(self, name):
            """Override the builtin version to return unicode instead of
            binary strings for 8-bit chars."""
            try:
                n = int(name)
            except ValueError:
                self.unknown_charref(name)
                return
            if not 0 <= n <= 255:
                self.unknown_charref(name)
                return
            if 0 <= n <= 127:
                self.handle_data(chr(n))
            else:
                self.handle_data(unichr(n))

    sio = StringIO()
    encoder, decoder, reader, writer = codecs.lookup(codec)
    codecio = codecs.StreamReaderWriter(sio, reader, writer, 'replace')
    writer = formatter.DumbWriter(codecio, maxcol)
    prettifier = formatter.AbstractFormatter(writer)
    parser = UnicodeHTMLParser(prettifier)
    parser.feed(html)
    parser.close()
    codecio.seek(0)
    result = codecio.read()
    sio.close()
    codecio.close()
    return result
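# A small usage sketch for apply_htmlparser, assuming MAXCOL and the
# module's imports (htmllib, htmlentitydefs, formatter, codecs, StringIO)
# are available as in the original source; the markup is made up.
text = apply_htmlparser(u'<html><body><h1>Caf&#233; menu</h1>'
                        u'<p>Soupe du jour</p></body></html>')
# handle_charref maps &#233; to u'\xe9', so `text` contains "Café menu"
# followed by the paragraph text, word-wrapped at MAXCOL columns.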
def chapter1_25():
    # Create writer, formatter, and parser objects, wiring them together
    # as needed.
    myWriter = formatter.DumbWriter()
    if sys.stdout.isatty():
        myFormatter = TtyFormatter(myWriter)
    else:
        myFormatter = formatter.AbstractFormatter(myWriter)
    myParser = htmllib.HTMLParser(myFormatter)
    # Feed standard input to the parser, then close it.
    myParser.feed(sys.stdin.read())
    myParser.close()