def parseAndGetLinks(self):
    """Parse the downloaded HTML file and return its anchor list.

    StringIO provides an in-memory buffer, DumbWriter turns the parse
    events into plain text, and AbstractFormatter drives the
    formatting; the rendered text is discarded -- only the parser's
    anchorlist (href targets) is of interest.
    """
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # with-block fixes the original's leaked file handle.
    with open(self.file) as fp:
        self.parser.feed(fp.read())
    self.parser.close()
    return self.parser.anchorlist
def OpenURL(self, url):
    """Fetch *url*, parse its HTML, and write the formatted text to
    hangman_dict.txt.

    Logs via self.logprint and returns early when the URL is invalid
    or the page cannot be retrieved.
    """
    from htmllib import HTMLParser
    import formatter
    self.url = url
    # Split the URL into host + path; fall back to a host-only match.
    m = re.match(r'http://([^/]+)(/\S*)\s*', url)
    if m:
        host = m.groups()[0]
        path = m.groups()[1]
    else:
        m = re.match(r'http://(\S+)\s*', url)
        if not m:  # Invalid URL
            self.logprint("Invalid or unsupported URL: %s" % (url))
            return
        host = m.groups()[0]
        path = ''
    f = self.RetrieveAsFile(host, path)
    if not f:
        self.logprint("Could not open %s" % (url))
        return
    self.logprint("Receiving data...")
    data = f.read()
    f.close()  # fix: the retrieved file object was never closed
    tmp = open('hangman_dict.txt', 'w')
    try:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
        p = HTMLParser(fmt)
        self.logprint("Parsing data...")
        p.feed(data)
        p.close()
    finally:
        tmp.close()  # fix: close the output file even if parsing raises
def parseAndGetLinks(self):
    """Parse the HTML file and return all href targets found."""
    # Build a basic HTML "renderer"; the formatted text goes to an
    # in-memory StringIO and is discarded -- we only want the links.
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # with-block fixes the original's leaked file handle.
    with open(self.file) as fp:
        self.parser.feed(fp.read())
    self.parser.close()
    return self.parser.anchorlist
def parseAndGetLinks(self): # pars HTML, save links self.parser = HTMLParser(AbstractFormatter( \ DumbWriter(StringIO()))) self.parser.feed(open(self.file).read()) self.parser.close() print self.parser return self.parser.anchorlist
def create_plaintext_message(message):
    """
    Create clean plain text version of email message

    Parse the html and remove style and javacript tags and then
    create a plain-text-message by parsing the html
    and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for counter, item in enumerate(parser.anchorlist, 1):
        if item.startswith('https://'):
            # fix: strip only the scheme prefix; str.replace would also
            # rewrite any later occurrence of 'https://' inside the URL
            new_item = 'http://' + item[len('https://'):]
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def html2text(htmldata):
    """Render *htmldata* to plain text via HTMLtoTextFormatter."""
    # Normalise self-closing breaks so the parser sees plain <br> tags.
    patched = htmldata.replace("<br/>", "<br>")
    text_formatter = HTMLtoTextFormatter()
    html_parser = HTMLParser(text_formatter)
    html_parser.feed(patched)
    return text_formatter.getText()
def parseAndGetLinks(self):
    """Parse self.file and return parser.anchorlist.

    An IOError while reading is deliberately swallowed: the (possibly
    empty) anchorlist gathered so far is still returned.
    """
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    try:
        # with-block fixes the handle leaked when read()/feed() raised.
        with open(self.file) as fp:
            self.parser.feed(fp.read())
        self.parser.close()
    except IOError:
        pass
    return self.parser.anchorlist
def get_urls(url):
    """Download *url* and return the list of anchor hrefs on the page."""
    conn = urllib.urlopen(url)
    try:
        data = conn.read()
    finally:
        conn.close()  # fix: the connection object was never closed
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    url_list = parser.anchorlist
    return url_list
def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    with open(self.file, 'r') as handle:
        markup = handle.read()
    link_parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    link_parser.feed(markup)
    link_parser.close()
    return link_parser.anchorlist
def _clean_text(self, text):
    """Return *text* with markup stripped via HTMLParser's save buffer.

    Falls back to returning the input unchanged if parsing fails.
    """
    try:
        # NOTE(review): the first argument appears to be a non-breaking
        # space normalised to a plain space -- confirm the exact byte.
        text = text.replace(" ", " ")
        text = text.strip()
        parser = HTMLParser(None)
        parser.save_bgn()
        parser.feed(text)
        return parser.save_end()
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        return text
def parse_links(self):
    """Read self.file and return the anchor list parsed from it."""
    with open(self.file, 'r') as source:
        contents = source.read()
    anchor_parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    anchor_parser.feed(contents)
    anchor_parser.close()
    return anchor_parser.anchorlist
def html2text(html):
    """Render *html* as plain text; return '' when the HTML is malformed."""
    sink = StringIO()
    html_parser = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    try:
        html_parser.feed(html)
    except HTMLParseError:
        return ''
    html_parser.close()
    return sink.getvalue()
def get_text_from_html( html_input ): "Strip tags and non-ascii characters from HTML input." my_stringio = StringIO.StringIO() # make an instance of this file-like string thing p = HTMLParser(AbstractFormatter(DumbWriter(my_stringio))) try: p.feed(html_input); p.close() #calling close is not usually needed, but let's play it safe except HTMLParseError: print '***HTML malformed***' #the html is badly malformed (or you found a bug) #return my_stringio.getvalue().replace('\xa0','') s = re.sub( r'[^\x00-\x7f]', r' ', my_stringio.getvalue() ) s = s.replace('\r\n',' ').replace('\n',' ') s = re.sub( ' +', ' ', s ) return s
def parseAndGetLinks(self): '''解析html页面,获取页面中的链接,并保存链接''' self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO()))) #使用HTMLParser的方法进行处理 , StringIO是从内存中读取数据,DumbWriter将事件流转换为存文本文档。 self.parser.feed(open(self.file).read()) #将self.file文件打开,并一次性读入上面定的文件中 self.parser.close() print 'self.parser.anchorlist --> ', self.parser.anchorlist return self.parser.anchorlist #anchorlist 记录href 地址
def parse_link(seld):
    'Parse out the link'
    # fix: the original opened the literal filename 'seld.file' instead
    # of the instance attribute seld.file
    with open(seld.file, 'r') as fobj:
        data = fobj.read()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def parse_links(self):
    """fetch all links from page
    """
    with open(self.save_file, 'r') as page:
        page_data = page.read()
    page_parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    page_parser.feed(page_data)
    page_parser.close()
    return page_parser.anchorlist
def parseAndGetLinks(self, html_string):
    """Parse *html_string* and return its links as absolute URLs."""
    try:
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(html_string)
        self.parser.close()
        links = []
        for link in self.parser.anchorlist:
            # Resolve scheme-less relative links against the base URL.
            if link[:4] != "http" and find(link, "://") == -1:
                links.append(urljoin(self.base_url, link))
            else:
                links.append(link)
        return links
    except IOError:
        return []
def parse_html(self, html):
    """Render *html* to whitespace-normalised plain text."""
    from StringIO import StringIO
    from formatter import (AbstractFormatter, DumbWriter)
    from htmllib import HTMLParser
    # Pad the no-translate tags with spaces before rendering.
    spaced = re.sub(self.notrans_tag, r" \1 ", html)
    sink = StringIO()
    html_parser = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    html_parser.feed(spaced)
    out = re.sub(self.whitespaces, " ", sink.getvalue())
    # FIXME: how can zerowidth be removed more simply?
    out = re.sub(self.zerowidth, "", out)
    out = re.sub(self.colon, r"\1", out)
    return out
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'
    """
    from htmllib import HTMLParser  # import here to avoid high startup cost
    sink = StringIO()
    html_parser = HTMLParser(AbstractFormatter(DumbWriter(sink)))
    html_parser.feed(html)
    html_parser.close()
    return sink.getvalue()
def compactor(dev_filename, rel_filename):
    """Validate the HTML in dev_filename, then minify it into rel_filename.

    Reports via error() and exits with status 1 on a parse error.
    """
    # Use compactor to generate release version.
    echo('Compacting: %s -> %s' % (dev_filename, rel_filename))
    # with-block fixes the original's leaked source file handle.
    with open(dev_filename, 'r') as source:
        source_data = source.read()
    try:
        # Verify that the html file is correct
        htmlparser = HTMLParser(NullFormatter())
        htmlparser.feed(source_data)
        htmlparser.close()
        # Now try to minify; with-block closes the output even on error.
        with open(rel_filename, 'wb') as output_file:
            compactor = HTMLMinifier(output_file.write, True)
            compactor.feed(source_data)
            compactor.close()
    except HTMLParseError as e:
        error(str(e))
        exit(1)
def collectURLSFromPage(page):
    """
    This returns a list of URLS that come from a certain page.
    Useful for spiders. It takes just a string as an argument.
    """
    # Nothing to parse, so nothing to return.
    if page == "":
        return []
    urls = []
    # Harvest form actions first, then the raw-text matches.
    if page.count("<form") > 0:
        urls.extend(daveFormParse(page))
    urls.extend(rawParse(page))
    # Finally run the real HTML parser and collect its anchor list.
    parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    try:
        parser.feed(page)
        parser.close()
    except:
        # Unparseable HTML: fall back to whatever was gathered so far.
        # (You'll have to crawl this page manually.)
        return urls
    urls.extend(parser.anchorlist)
    return urls
def _create_plaintext_message(self, text):
    """ Create a plain-text-message by parsing the html
        and attaching links as endnotes """
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(text)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    # fix: number endnotes from [1], not [0], matching the 1-based
    # numbering used by the sibling create_plaintext_message helper
    for counter, item in enumerate(parser.anchorlist, 1):
        anchorlist += "[{0:d}] {1:s}\n".format(counter, item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def parseAndGetLinks(self):  # parse HTML, save links
    # Download the page via self.download(); on success read up to
    # 50000 bytes, feed them to the parser, and return the anchorlist.
    # Socket and HTML-parse failures return [] instead of raising.
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    r = self.download()
    if r:
        print '________'
        try:
            try:
                # Cap the read; presumably guards against huge pages --
                # TODO confirm the 50000-byte limit is intentional.
                s = r.read(50000)
            except socket.error as e:
                print "***************************socket error***************************", e
                return []
            self.parser.feed(s)
            print '------------------'
            r.close()
            print '***************************'
        except HTMLParseError:
            # Malformed HTML: give up on this page.
            print 'get links error\n'
            return []
    self.parser.close()
    return self.parser.anchorlist
def collectURLSFromPage(page):
    """Gather candidate URLs from *page* using all three extractors."""
    urls = []
    # Form actions first.
    if page.count("<form") > 0:
        urls.extend(daveFormParse(page))
    # Then the raw-text matches.
    urls.extend(rawParse(page))
    # Finally the real HTML parser; its rendered output is discarded,
    # only the anchorlist is kept.
    parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    try:
        parser.feed(page)
        parser.close()
    except:
        # Unparseable HTML: return what the cruder passes found.
        # (You'll have to crawl this page manually.)
        return urls
    urls.extend(parser.anchorlist)
    return urls
#coding:utf-8 import urllib2 from htmllib import HTMLParser from formatter import NullFormatter import os import re url_name = "http://b.hatena.ne.jp/hotentry" html_data = urllib2.urlopen(url_name) parser = HTMLParser(NullFormatter()) try: parser.feed(html_data.read()) except TypeError: print "type error" pat = re.compile("^http.*") for link in parser.anchorlist: x = pat.search(link) if x is not None: print x.group(0)
def parseAndGetLinks(self):  # parse the page and collect its URLs
    """Parse self.file and return the list of anchor (href) targets."""
    # fix: StringIO must be instantiated -- the original passed the
    # StringIO class itself to DumbWriter, which fails on first write.
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    # with-block also fixes the leaked file handle.
    with open(self.file) as fp:
        self.parser.feed(fp.read())
    self.parser.close()
    return self.parser.anchorlist  # list of anchor links
def parseAndGetLinks(self):  # parse HTML, save links
    """Set up the parser used to collect anchor links from HTML."""
    text_writer = DumbWriter(StringIO())
    self.parser = HTMLParser(AbstractFormatter(text_writer))
def unescape(data):
    """Decode HTML entities in *data* via htmllib's save buffer."""
    entity_parser = HTMLParser(None)
    entity_parser.save_bgn()  # start capturing translated text
    entity_parser.feed(data)
    return entity_parser.save_end()
def parseAndGetLinks(self):
    """Fetch self.url and return the anchor list parsed from its HTML."""
    self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    conn = urlopen(self.url)
    try:
        self.parser.feed(conn.read())
    finally:
        conn.close()  # fix: the HTTP response was never closed
    self.parser.close()
    return self.parser.anchorlist
def html2text(html):
    """Convert *html* to plain text using the dumb formatter."""
    output = StringIO()
    writer = DumbWriter(output)
    p = HTMLParser(AbstractFormatter(writer))
    p.feed(toText(html))
    # fix: close the parser so any buffered formatter output is
    # flushed before the buffer is read back.
    p.close()
    return toText(output.getvalue())