def set_and_parse_xhtml(self, xhtml=None):
    """
    Takes an xhtml text string and parses it using the
    tkHTMLWriter/tkHTMLParser system.
    """
    # Ensure the input is ok:
    if xhtml is None:
        xhtml = self.Experiment.getWikiXhtml()
    if xhtml is None:
        xhtml = ""
    else:
        # Normalise non-breaking-space entities to plain spaces:
        xhtml = xhtml.replace('&nbsp;', ' ')
    # Prepare the text widget:
    self.text.config(state="normal")
    self.text.delete("1.0", "end")
    self.text.update_idletasks()
    if not xhtml:
        logger.debug("No xhtml, aborting...")
        self.text.config(state="disabled")
        return xhtml
    # Write the xhtml to the text widget:
    writer = tkHTMLWriter(self.text)
    fmt = formatter.AbstractFormatter(writer)
    parser = tkHTMLParser(fmt)
    parser.feed(xhtml)
    parser.close()
    # Finally, disable the text widget again:
    self.text.config(state="disabled")
    logger.debug("(%s) text area updated with parsed/formatted html from string of length %s",
                 self.__class__.__name__, len(xhtml) if xhtml else xhtml)
def OpenURL(self, url):
    from htmllib import HTMLParser
    import formatter
    self.url = url
    m = re.match(r'http://([^/]+)(/\S*)\s*', url)
    if m:
        host = m.groups()[0]
        path = m.groups()[1]
    else:
        m = re.match(r'http://(\S+)\s*', url)
        if not m:
            # Invalid URL
            self.logprint("Invalid or unsupported URL: %s" % (url))
            return
        host = m.groups()[0]
        path = ''
    f = self.RetrieveAsFile(host, path)
    if not f:
        self.logprint("Could not open %s" % (url))
        return
    self.logprint("Receiving data...")
    data = f.read()
    tmp = open('hangman_dict.txt', 'w')
    fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
    p = HTMLParser(fmt)
    self.logprint("Parsing data...")
    p.feed(data)
    p.close()
    tmp.close()
def __init__(self, viewer, reload=0):
    global _inited
    self.viewer = viewer
    self.reload = reload
    self.context = self.viewer.context
    self.app = self.context.app
    self.load_dingbat = self.app.load_dingbat
    self.loaded = []
    self.current_map = None
    self.target = None
    self.formatter_stack = []
    fmt = formatter.AbstractFormatter(self.viewer)
    HTMLParser.__init__(self, fmt)
    self.push_formatter(fmt)
    if not _inited:
        _inited = 1
        init_module(self.app.prefs)
    self._ids = {}
    # Hackery so reload status can be reset when all applets are loaded:
    import AppletLoader
    self.reload1 = self.reload and AppletLoader.set_reload(self.context)
    if self.reload1:
        self.reload1.attach(self)
    if self.app.prefs.GetBoolean('parsing-html', 'strict'):
        self.sgml_parser.restrict(0)
    # Information from <META ... CONTENT="..."> is collected here.
    # Entries are KEY --> [(NAME, HTTP-EQUIV, CONTENT), ...], where
    # KEY is (NAME or HTTP-EQUIV).
    self._metadata = {}
def __init__(self, parent, startUrlCallback=None, endUrlCallback=None,
             enterLinkCallback=None, leaveLinkCallback=None, *args, **kw):
    self.startUrlCallback = startUrlCallback
    self.endUrlCallback = endUrlCallback
    self.enterLinkCallback = enterLinkCallback
    self.leaveLinkCallback = leaveLinkCallback
    self.initTags()
    self.protocol = ''
    self.location = ''
    self.path = ''
    self.dir = ''
    formatter.NullWriter.__init__(self)
    self.formatter = formatter.AbstractFormatter(self)
    htmllib.HTMLParser.__init__(self, self.formatter)
    ScrolledText.__init__(self, parent, *args, **kw)
    self.text_area.config(font=(font_family, default_size))
    self.bind('<Enter>', lambda event: self.focus())
    self.bind('<KeyPress-Prior>', lambda event: self.pageChange(-1))
    self.bind('<KeyPress-Next>', lambda event: self.pageChange(1))
def test(args=None):
    import sys, formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        file = args[0]
    else:
        file = 'test.html'
    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError as msg:
            print file, ':', msg
            sys.exit(1)
    data = f.read()
    if f is not sys.stdin:
        f.close()
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
def test(args=None):
    import sys
    import formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        fn = args[0]
    else:
        fn = 'test.html'
    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'rt') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ":", msg)
            sys.exit(1)
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
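# Invocation sketch for the two test() drivers above (module name is
# hypothetical): render an HTML file, or pass -s to parse silently.
#
#   python htmltest.py test.html       # render with DumbWriter to stdout
#   python htmltest.py -s test.html    # NullFormatter: parse only, no output
#   python htmltest.py - < page.html   # read the document from stdin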
def extractFromHTML(html, blur=5):
    """
    Extracts plain text from HTML content.
    """
    html = unicode(html)
    assert isinstance(html, unicode)

    # Create an in-memory file.
    file = StringIO()

    # Convert the HTML to text.
    f = formatter.AbstractFormatter(formatter.DumbWriter(file))
    p = TextExtractor()
    p.pathBlur = blur
    p.feed(html)
    p.close()
    text = p.get_plaintext()

    # Remove stand-alone punctuation.
    text = re.sub(r"\s[\(\),;\.\?\!](?=\s)", " ", text).strip()

    # Compress whitespace.
    text = re.sub(r"[\n\s]+", " ", text).strip()

    # Remove consecutive dashes.
    text = re.sub(r"\-{2,}", "", text).strip()

    # Remove consecutive periods.
    text = re.sub(r"\.{2,}", "", text).strip()

    return text
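# A minimal usage sketch for extractFromHTML, assuming TextExtractor and the
# Python 2 re/StringIO/formatter imports above are in scope (the sample
# markup is hypothetical):
#
#   sample = u"<html><body><p>Hello -- world ..</p></body></html>"
#   print extractFromHTML(sample, blur=5)  # prints the de-noised plain text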
def create_plaintext_message(message):
    """
    Create a clean plain-text version of an email message.

    Parse the html, remove style and javascript tags, and then create a
    plain-text message by parsing the html and attaching links as endnotes.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # Append the anchorlist at the bottom of the message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
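# A minimal usage sketch for create_plaintext_message, assuming the lxml
# Cleaner and Python 2 cStringIO/HTMLParser imports used above; the input
# markup is a hypothetical example:
#
#   raw = '<p>See <a href="https://example.com/a">the docs</a>.</p>'
#   plain = create_plaintext_message(raw)
#   # 'plain' ends with an endnote block like "[1] http://example.com/a"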
def format(self, text):
    writer = SimpleWriter()
    fmt = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(fmt)
    parser.feed(text)
    parser.close()
    return writer
def create_html_mail(subject, html, text=None, from_addr=None, to_addr=None,
                     headers=None, encoding='UTF-8'):
    """Create a mime-message that will render HTML in popular
    MUAs, text in better ones.
    """
    # Use DumbWriter's word wrapping to ensure that no text line
    # is longer than plain_text_maxcols characters.
    plain_text_maxcols = 72

    html = html.encode(encoding)
    if text is None:
        # Produce an approximate textual rendering of the HTML string,
        # unless you have been given a better version as an argument.
        textout = StringIO.StringIO()
        formtext = formatter.AbstractFormatter(
            formatter.DumbWriter(textout, plain_text_maxcols))
        parser = htmllib.HTMLParser(formtext)
        parser.feed(html)
        parser.close()

        # Append the anchorlist at the bottom of the message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            anchorlist += "[%d] %s\n" % (counter, item)

        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
    else:
        text = text.encode(encoding)

    # If we would like to include images in the future, this should
    # probably be 'related' instead of 'mixed'.
    msg = MIMEMultipart('mixed')
    # maybe later :)
    # msg['From'] = Header("%s <%s>" % (send_from_name, send_from), encoding)
    msg['Subject'] = Header(subject, encoding)
    msg['From'] = from_addr
    msg['To'] = to_addr
    msg['Date'] = formatdate(localtime=True)
    msg["Message-ID"] = email.Utils.make_msgid()
    if headers:
        for key, value in headers.items():
            msg[key] = value
    msg.preamble = 'This is a multi-part message in MIME format.'

    alternatives = MIMEMultipart('alternative')
    msg.attach(alternatives)
    alternatives.attach(MIMEText(text, 'plain', _charset=encoding))
    alternatives.attach(MIMEText(html, 'html', _charset=encoding))

    return msg
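# A minimal sending sketch for create_html_mail (addresses and host are
# hypothetical; assumes Python 2 smtplib alongside the email imports above):
#
#   msg = create_html_mail(u'Report', u'<h1>Hi</h1><p>Body</p>',
#                          from_addr='a@example.com', to_addr='b@example.com')
#   import smtplib
#   server = smtplib.SMTP('localhost')
#   server.sendmail('a@example.com', ['b@example.com'], msg.as_string())
#   server.quit()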
def parse(self):
    # We're using the parser just to get the HREFs.
    # We should also use it to e.g. respect <META NOFOLLOW>.
    w = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(w)
    self.parser = htmllib.HTMLParser(f)
    self.parser.feed(self.body)
    self.parser.close()
def renderOn(self, aPiddleCanvas):
    '''draw the text with aPiddleCanvas jjk 02/01/00'''
    writer = _HtmlPiddleWriter(self, aPiddleCanvas)
    fmt = formatter.AbstractFormatter(writer)
    parser = _HtmlParser(fmt)
    parser.feed(self.html)
    parser.close()
def print_text(self, file=sys.stdout):
    w = Writer(file)  # plain text
    f = formatter.AbstractFormatter(w)
    errs = []
    self.format(f, errs)
    if errs:
        for x in errs:
            ctx.ui.warning(x)
def html2txt(htmlblock):
    import htmllib, formatter, StringIO
    s = StringIO.StringIO('')
    w = formatter.DumbWriter(s)
    f = formatter.AbstractFormatter(w)
    p = htmllib.HTMLParser(f)
    p.feed(htmlblock)
    p.close()  # flush any buffered output before reading the buffer
    return s.getvalue().strip()
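# Usage sketch for html2txt (hypothetical input; Python 2):
#
#   print html2txt('<b>Hello</b> world')
#   # -> 'Hello world'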
def find_links(html):
    """Return a list of links in HTML."""
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
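# Usage sketch for find_links, assuming htmllib/formatter/StringIO are
# imported at module level as the function expects (made-up markup):
#
#   find_links('<a href="http://example.com/">x</a>')
#   # -> ['http://example.com/']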
def content(path, archivo, output):
    f = formatter.AbstractFormatter(AlmostNullWriter())
    parser = TocHlpHtmlParser(f)
    parser.path = path
    parser.ft = output
    fil = path + '/' + archivo
    parser.feed(open(fil).read())
    parser.close()
def getLinkByHTML2(html):
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    ptext = htmllib.HTMLParser(fmt)
    ptext.feed(html)
    # Print and return the first anchor found; fall back to "".
    for link in ptext.anchorlist:
        print(link)
        return link
    return ""
def get_data(self):
    '''
    Download data from the Weather Underground website for a given
    stationid, a start year, and an end year. The html file is parsed and
    written as csv to a separate txt file for each day.
    [singleprocessing code, deprecated]
    '''
    logger.info('Download data for stationid: ' + self.stationid + ' [start]')
    for td in utils.progressbar(range(0, (self.enddate - self.startdate)
                                .days + 1), "Downloading: ", 60):
        # increase the date by 1 day for the next download
        current_date = self.startdate + timedelta(days=td)
        # set download url
        url = 'http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=' + \
            self.stationid + '&day=' + str(current_date.day) + '&year=' + \
            str(current_date.year) + '&month=' + \
            str(current_date.month) + '&format=1'
        # define outputfile
        outputfile = self.stationid + '_' + str(current_date.year) \
            + str(current_date.month).zfill(2) + \
            str(current_date.day).zfill(2) + '.txt'
        # check if we want to keep previously downloaded files
        if self.keep:
            if os.path.exists(os.path.join(self.outputdir, outputfile)):
                # check if filesize is not null
                if os.path.getsize(os.path.join(self.outputdir,
                                                outputfile)) > 0:
                    # file exists and is not null, continue to next iteration
                    continue
                else:
                    # file exists but is null, so remove and redownload
                    os.remove(os.path.join(self.outputdir, outputfile))
        elif os.path.exists(os.path.join(self.outputdir, outputfile)):
            os.remove(os.path.join(self.outputdir, outputfile))
        # open outputfile
        with open(os.path.join(self.outputdir, outputfile), 'wb') as outfile:
            # open and read the url
            handler = urllib2.urlopen(url)
            content = handler.read()
            # convert spaces to non-breaking spaces
            content = content.replace(' ', '&nbsp;')
            # remove all the HTML tags from the file
            outstream = cStringIO.StringIO()
            parser = htmllib.HTMLParser(
                formatter.AbstractFormatter(
                    formatter.DumbWriter(outstream)))
            parser.feed(content)
            # convert non-breaking spaces back to regular whitespace (' ')
            content = outstream.getvalue().replace('\xa0', ' ')
            # write output
            outfile.write(content)
            # close handler and outstream
            outstream.close()
            handler.close()
    logger.info('Download data for stationid: ' + self.stationid +
                ' [completed]')
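# Why the &nbsp; round-trip above: AbstractFormatter collapses runs of
# plain whitespace, which would corrupt the downloaded records, while
# non-breaking spaces pass through the parser as '\xa0' and can be restored
# afterwards. A small sketch of the effect (hypothetical input, Python 2):
#
#   out = cStringIO.StringIO()
#   p = htmllib.HTMLParser(
#       formatter.AbstractFormatter(formatter.DumbWriter(out)))
#   p.feed('a  b vs. a&nbsp;&nbsp;b')
#   # 'a  b' collapses to 'a b'; the entity pair survives as 'a\xa0\xa0b'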
def find_links(html):
    """Return a list of links in html."""
    # We're using the parser just to get the HREFs.
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
def parse_links(self):
    'Parse out the links found in the downloaded HTML file'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(formatter.AbstractFormatter(
        formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def getLinks():
    website = urllib2.urlopen("http://www.profmcmmillan.com")
    data = website.read()
    website.close()
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    ptext = htmllib.HTMLParser(fmt)
    ptext.feed(data)
    for link in ptext.anchorlist:
        print link
def __init__(self, prn=0):
    if prn:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())
    else:
        fmt = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, fmt)
    self.depth = 0
    self.stack = []
def textFromHtml(self, html):
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(formatter.DumbWriter(textout))
    parser = htmllib.HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    text = textout.getvalue()
    del textout, formtext, parser
    return text
def get_urls(url):
    data = urllib.urlopen(url).read()
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    url_list = parser.anchorlist
    return url_list
def parse_links(self):
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def decodeHTMLCharacterEntities(self, code):
    outputstring = StringIO.StringIO()
    w = formatter.DumbWriter(outputstring, maxcol=9999999)  # plain text
    f = formatter.AbstractFormatter(w)
    p = htmllib.HTMLParser(f)
    p.feed(code)
    p.close()
    return outputstring.getvalue()
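# Usage sketch: the parser expands character entities and the huge maxcol
# keeps DumbWriter from re-wrapping, so the text comes back decoded
# (hypothetical input):
#
#   self.decodeHTMLCharacterEntities('Fish &amp; chips &lt;today&gt;')
#   # -> 'Fish & chips <today>'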
def urlparser2():
    import urllib.request
    from html.parser import HTMLParser

    # html.parser.HTMLParser takes no formatter and has no anchorlist, so
    # collect the anchors with a small subclass instead.
    class AnchorParser(HTMLParser):
        def __init__(self):
            super().__init__()
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value is not None:
                        self.anchorlist.append(value)

    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8', 'ignore')
    response.close()
    ptext = AnchorParser()
    ptext.feed(data)
    for link in ptext.anchorlist:
        print(link)
def parse_link(self):
    'Parse out the link'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def parse_links(self):
    # Parse the links out of the page we just downloaded.
    f = open(self.file, 'r')  # read the downloaded page
    data = f.read()
    f.close()
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(       # AbstractFormatter parses the data
            formatter.DumbWriter(cStringIO.StringIO())))
    # DumbWriter emits the content; cStringIO keeps it off standard output
    # (writing to a file would be even better).
    parser.feed(data)
    parser.close()
    return parser.anchorlist  # return the parsed list of anchors
def urlparser():
    import sys
    import urllib.request
    from html.parser import HTMLParser

    # html.parser.HTMLParser takes no formatter, so dump the text content
    # to stdout from a small subclass instead.
    class TextDumper(HTMLParser):
        def handle_data(self, data):
            sys.stdout.write(data)

    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8', 'ignore')
    ptext = TextDumper()
    ptext.feed(data)
    ptext.close()