def unescapeHTML(s):
    """Decode HTML character entities in *s*; strings without '&' pass through untouched."""
    if '&' not in s:
        return s
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def format(self, text):
    """Feed *text* through an HTML parser into a SimpleWriter and return the writer."""
    writer = SimpleWriter()
    fmt = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(fmt)
    parser.feed(text)
    parser.close()
    return writer
def unescape(self, s):
    """Decode HTML escape sequences (entities) in *s*."""
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def unescape(s):
    """Decode HTML entities in *s*; falsy input (None, '') is returned unchanged."""
    if not s:
        return s
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def prep_text(intext):
    """Blank out literal '\\r' sequences and decode HTML entities in *intext*."""
    # NOTE(review): this replaces the two-character sequence backslash + 'r',
    # not an actual carriage return -- presumably the input arrives with
    # escaped line returns; confirm against the callers.
    intext = intext.replace("\\r", ' ')
    # convert the entities back into characters
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(intext)
    return parser.save_end()
def __get_unescape_u(text_u):
    """Return *text_u* with HTML entities decoded (unicode in, unicode out)."""
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(text_u)
    return parser.save_end()
def unescape_entities(html_string):
    """Remove HTML/XML character references and entities from *html_string*.

    See http://wiki.python.org/moin/EscapingHtml.  The input is encoded to
    Latin-1 before feeding, as the parser works on byte strings.
    """
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(html_string.encode("iso8859-1"))
    return parser.save_end()
def create_html_mail(subject, html, text=None, from_addr=None, to_addr=None, headers=None, encoding='UTF-8'): """Create a mime-message that will render HTML in popular MUAs, text in better ones. """ # Use DumbWriters word wrapping to ensure that no text line # is longer than plain_text_maxcols characters. plain_text_maxcols = 72 html = html.encode(encoding) if text is None: # Produce an approximate textual rendering of the HTML string, # unless you have been given a better version as an argument textout = StringIO.StringIO() formtext = formatter.AbstractFormatter( formatter.DumbWriter(textout, plain_text_maxcols)) parser = htmllib.HTMLParser(formtext) parser.feed(html) parser.close() # append the anchorlist at the bottom of a message # to keep the message readable. counter = 0 anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n" for item in parser.anchorlist: counter += 1 anchorlist += "[%d] %s\n" % (counter, item) text = textout.getvalue() + anchorlist del textout, formtext, parser, anchorlist else: text = text.encode(encoding) # if we would like to include images in future, there should # probably be 'related' instead of 'mixed' msg = MIMEMultipart('mixed') # maybe later :) msg['From'] = Header("%s <%s>" % # (send_from_name, send_from), encoding) msg['Subject'] = Header(subject, encoding) msg['From'] = from_addr msg['To'] = to_addr msg['Date'] = formatdate(localtime=True) msg["Message-ID"] = email.Utils.make_msgid() if headers: for key, value in headers.items(): msg[key] = value msg.preamble = 'This is a multi-part message in MIME format.' alternatives = MIMEMultipart('alternative') msg.attach(alternatives) alternatives.attach(MIMEText(text, 'plain', _charset=encoding)) alternatives.attach(MIMEText(html, 'html', _charset=encoding)) return msg
def parse(self):
    """Parse self.body, keeping the parser (and its anchorlist) on self.parser."""
    # We're using the parser just to get the HREFs
    # We should also use it to e.g. respect <META NOFOLLOW>
    sink = formatter.DumbWriter(StringIO())
    fmt = formatter.AbstractFormatter(sink)
    self.parser = htmllib.HTMLParser(fmt)
    self.parser.feed(self.body)
    self.parser.close()
def unescapeHTML(s):
    """Decode HTML entities in *s*; return it unchanged when reHTMLEscape finds none."""
    if reHTMLEscape.search(s) is None:
        return s
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def unescape(s):
    """Replace HTML escape sequences with their equivalent characters."""
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def find_links(html):
    """Return the list of anchor HREFs found in *html*."""
    sink = formatter.DumbWriter(StringIO())
    fmt = formatter.AbstractFormatter(sink)
    parser = htmllib.HTMLParser(fmt)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
def extract(self, htmldata, docno, url):
    """Return the list of anchor HREFs found in *htmldata*.

    *docno* and *url* are part of the extractor interface but are not used
    by this implementation.  Parsing failures are deliberately swallowed:
    a page that cannot be parsed simply yields no links ([]).
    """
    try:
        parser = htmllib.HTMLParser(formatter.NullFormatter())
        parser.feed(htmldata)
        return parser.anchorlist
    # was `except Exception,ex` (Python-2-only syntax) with an unused
    # binding; keep the deliberate best-effort behaviour but use the
    # portable form and drop the dead variable.
    except Exception:
        return []
def getLinkByHTML2(html):
    """Print and return the first anchor HREF in *html*, or "" when there is none."""
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    parser = htmllib.HTMLParser(fmt)
    parser.feed(html)
    # NOTE(review): the return inside the loop means only the FIRST link is
    # ever printed and returned -- preserved as-is; confirm this is intended.
    for link in parser.anchorlist:
        print(link)
        return link
    return ""
def unescape(s):
    """Decode HTML entities in *s*; if parsing fails, return *s* unchanged."""
    p = htmllib.HTMLParser(None)
    p.save_bgn()
    try:
        p.feed(s)
    # was a bare `except:` -- that would also swallow KeyboardInterrupt
    # and SystemExit; narrow to Exception while keeping the fallback.
    except Exception:
        return s
    return p.save_end()
def html2txt(htmlblock):
    """Render an HTML fragment to plain text, stripped of surrounding whitespace."""
    import htmllib, formatter, StringIO
    buf = StringIO.StringIO('')
    writer = formatter.DumbWriter(buf)
    fmt = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(fmt)
    parser.feed(htmlblock)
    return buf.getvalue().strip()
def unescape(s):
    """Decode HTML entities in *s*, preserving line breaks, and strip the result."""
    parser = htmllib.HTMLParser(formatter.NullFormatter())
    # we need to preserve line breaks; nofill makes sure we don't lose them
    parser.nofill = True
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end().strip()
def unescape(s):
    """Decode HTML entities in *s*.

    Raises:
        ValueError: (carrying the offending input) when parsing fails.
    """
    p = htmllib.HTMLParser(None)
    p.save_bgn()
    try:
        p.feed(s)
    # was a bare `except:` -- that would have converted KeyboardInterrupt
    # and SystemExit into ValueError; narrow the catch to Exception.
    except Exception:
        raise ValueError(s)
    return p.save_end()
def get_data(self):
    '''
    Download data from Weather Underground website for a given stationid
    , a startyar, and an endyear. The html file is parsed and written as
    csv to a separate txt file for each day.
    [singleprocessing code, deprecated]
    '''
    logger.info('Download data for stationid: ' + self.stationid + ' [start]')
    # one iteration per day in the [startdate, enddate] range (inclusive)
    for td in utils.progressbar(range(0, (self.enddate - self.startdate)
                                      .days + 1), "Downloading: ", 60):
        # increase the date by 1 day for the next download
        current_date = self.startdate + timedelta(days=td)
        # set download url (format=1 requests the CSV-style daily history)
        url = 'http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=' + \
            self.stationid + '&day=' + str(current_date.day) + '&year=' + \
            str(current_date.year) + '&month=' + \
            str(current_date.month) + '&format=1'
        # define outputfile: <stationid>_YYYYMMDD.txt
        outputfile = self.stationid + '_' + str(current_date.year) \
            + str(current_date.month).zfill(2) + \
            str(current_date.day).zfill(2) + '.txt'
        # check if we want to keep previous downloaded files
        if self.keep:
            if os.path.exists(os.path.join(self.outputdir, outputfile)):
                # check if filesize is not null
                if os.path.getsize(os.path.join(self.outputdir,
                                                outputfile)) > 0:
                    # file exists and is not null, continue next iteration
                    continue
                else:
                    # file exists but is null, so remove and redownload
                    os.remove(os.path.join(self.outputdir, outputfile))
        elif os.path.exists(os.path.join(self.outputdir, outputfile)):
            # not keeping old files: always remove and redownload
            os.remove(os.path.join(self.outputdir, outputfile))
        # open outputfile
        with open(os.path.join(self.outputdir, outputfile), 'wb') as outfile:
            # open and read the url
            handler = urllib2.urlopen(url)
            content = handler.read()
            # convert spaces to non-breaking spaces
            # NOTE(review): replacing ' ' with ' ' is a no-op as written --
            # given the comment and the '\xa0' reversal below, this line
            # presumably read .replace(' ', '&nbsp;') before the source was
            # HTML-unescaped; verify against the project history.
            content = content.replace(' ', ' ')
            # Removing all the HTML tags from the file
            outstream = cStringIO.StringIO()
            parser = htmllib.HTMLParser(
                formatter.AbstractFormatter(
                    formatter.DumbWriter(outstream)))
            parser.feed(content)
            # convert spaces back to regular whitespace (' ')
            content = outstream.getvalue().replace('\xa0', ' ')
            # write output
            outfile.write(content)
            # close handler and outstream
            outstream.close()
            handler.close()
    logger.info('Download data for stationid: ' + self.stationid +
                ' [completed]')
def getLinks():
    """Fetch www.profmcmmillan.com and print every anchor HREF on the page."""
    page = urllib2.urlopen("http://www.profmcmmillan.com")
    data = page.read()
    page.close()
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    parser = htmllib.HTMLParser(fmt)
    parser.feed(data)
    for link in parser.anchorlist:
        print(link)
def textFromHtml(self, html):
    """Return a plain-text rendering of *html*."""
    buf = cStringIO.StringIO()
    fmt = formatter.AbstractFormatter(formatter.DumbWriter(buf))
    parser = htmllib.HTMLParser(fmt)
    parser.feed(html)
    parser.close()
    result = buf.getvalue()
    # drop the intermediate objects before returning
    del buf, fmt, parser
    return result
def find_links(html):
    """Return a list of links in html."""
    # The parser is only used here to collect HREFs; output is discarded.
    sink = formatter.DumbWriter(StringIO())
    fmt = formatter.AbstractFormatter(sink)
    parser = htmllib.HTMLParser(fmt)
    parser.feed(html)
    parser.close()
    return parser.anchorlist
def unescape(self, s):
    """Quick and dirty unescaping."""
    # NOTE(review): this strips EVERY space character before parsing --
    # possibly a mangled '&nbsp;' replacement from the original source;
    # behaviour preserved as-is, confirm against the callers.
    s = s.replace(" ", "")
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def decodeHTMLCharacterEntities(self, code):
    """Return *code* rendered as plain text with HTML character entities decoded."""
    buf = StringIO.StringIO()
    # huge maxcol so the DumbWriter never inserts its own line wrapping
    writer = formatter.DumbWriter(buf, maxcol=9999999)  # plain text
    fmt = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(fmt)
    parser.feed(code)
    parser.close()
    return buf.getvalue()
def unescape(s):
    """Replace HTML entities in *s* with their literal characters.

    See http://wiki.python.org/moin/EscapingHtml.
    (The original docstring example was garbled by entity-decoding of the
    source itself and is omitted.)
    """
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def parse_links(self):
    """Parse out the links found in the downloaded HTML file."""
    f = open(self.file, 'r')
    page = f.read()
    f.close()
    # DumbWriter output goes to an in-memory buffer; only anchorlist matters
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
    parser.feed(page)
    parser.close()
    return parser.anchorlist
def parse_links(self):
    """Parse the links out of the just-downloaded page and return them."""
    # read the downloaded page from disk
    f = open(self.file, 'r')
    page = f.read()
    f.close()
    # AbstractFormatter drives the parse; DumbWriter's output is routed to
    # an in-memory cStringIO so nothing reaches stdout
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(page)
    parser.close()
    # the collected anchor HREFs
    return parser.anchorlist
def get_extra_information(self):
    """Return self.diff rendered as plain text, or "" when there is no diff."""
    if not self.diff:
        return ""
    self.diff = self.escape(self.diff)
    buf = cStringIO.StringIO()
    parser = htmllib.HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(buf)))
    parser.feed(self.diff)
    self.diff = buf.getvalue()
    buf.close()
    # NOTE(review): replacing "&" with "&" is a no-op as written -- this
    # probably read .replace("&amp;", "&") before the source was
    # entity-decoded; behaviour preserved, verify against project history.
    return self.diff.replace("&", "&")
def unescape(self, s):
    """Decode HTML entities in *s*, then apply site-specific text cleanups."""
    parser = htmllib.HTMLParser(None)
    parser.save_bgn()
    parser.feed(str(s))
    result = parser.save_end()
    # strip rel="nofollow" attributes left in anchor text
    result = result.replace(' rel="nofollow"', '')
    # percent-encode remaining ampersands -- presumably so the result can
    # be embedded in a URL query string
    result = result.replace('&', '%26')
    # drop footnote-style markers
    result = result.replace('[1]', '')
    return result
def pretty_print(self):
    """Print a pretty (formatted) version of the HTML content.

    If the content is not text/html then it is just printed.
    """
    content_type = self.headers['content-type'].lower()
    if content_type.startswith('text/html'):
        parser = htmllib.HTMLParser(
            formatter.AbstractFormatter(formatter.DumbWriter()))
        parser.feed(self.contents)
        parser.close()
    else:
        print(self.contents)