def get_plain_from_html(html):
    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
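# The snippets in this collection rely on Python 2's htmllib/formatter stack, which was
# removed in Python 3. Below is a minimal sketch of the same plain-text extraction built
# on the standard-library html.parser module; PlainTextExtractor is an illustrative name,
# not part of any original snippet.
from html.parser import HTMLParser
from io import StringIO

class PlainTextExtractor(HTMLParser):
    """Collect only the character data of an HTML document."""
    def __init__(self):
        HTMLParser.__init__(self, convert_charrefs=True)
        self._buf = StringIO()

    def handle_data(self, data):
        self._buf.write(data)

    def get_text(self):
        return self._buf.getvalue()

def get_plain_from_html(html):
    parser = PlainTextExtractor()
    parser.feed(html)
    parser.close()
    return parser.get_text()

# Usage: get_plain_from_html("<p>Hello <b>world</b></p>") -> "Hello world"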
class Retrive(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.php'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = 'error'
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]   # e.g. weibo.com + /gothack
        ext = splitext(path)                 # split on '.': ('weibo.com/gothack', '')
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                 # e.g. weibo.com (everything before the last /)
        if sep != '/':                       # default separator is '/'
            ldir = replace(ldir, '/', sep)   # replace(string, old, new)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Restriever(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):  # Web-page downloader class

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile="index.htm"):
        parsedurl = urlparse(url, "http:", 0)   # parse the URL
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == "":                        # no file part, use the default
            if path[-1] == "/":
                path += deffile
            else:
                path += "/" + deffile
        ldir = dirname(path)                    # local directory
        if sep != "/":
            ldir = replace(ldir, "/", sep)
        if not isdir(ldir):                     # create the directory if it does not exist
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download the Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***Error: invalid URL: "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):  # parse the HTML and save the links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
def OpenURL(self, url):
    from htmllib import HTMLParser
    import formatter

    self.url = url
    m = re.match('http://([^/]+)(/\S*)\s*', url)
    if m:
        host = m.groups()[0]
        path = m.groups()[1]
    else:
        m = re.match('http://(\S+)\s*', url)
        if not m:
            # Invalid URL
            self.logprint("Invalid or unsupported URL: %s" % (url))
            return
        host = m.groups()[0]
        path = ''
    f = self.RetrieveAsFile(host, path)
    if not f:
        self.logprint("Could not open %s" % (url))
        return
    self.logprint("Receiving data...")
    data = f.read()
    tmp = open('hangman_dict.txt', 'w')
    fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
    p = HTMLParser(fmt)
    self.logprint("Parsing data...")
    p.feed(data)
    p.close()
    tmp.close()
def create_plaintext_message(message):
    """Create a clean plain-text version of an email message.

    Parse the HTML, remove style and javascript tags, then build a
    plain-text message by parsing the HTML and attaching the links
    as endnotes.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # Append the anchorlist at the bottom of the message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def create_plaintext_message(self, text):
    """Create a plain-text message by parsing the html
    and attaching links as endnotes.
    """
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(formatter.DumbWriter(
        textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(text)
    parser.close()
    # Append the anchorlist at the bottom of the message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
class Retriever(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        """Build the download URL and the local file name."""
        full_url = ""
        if url.endswith(DOM_SUFFIX):
            full_url = url + '/'
        else:
            full_url = url
        parsedurl = urlparse(full_url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)
        if sep != '/':
            ldir = replace(ldir, '/', sep)
            path = replace(path, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                #unlink(ldir)
                pass
            else:
                makedirs(ldir)
        print path
        return path

    def download(self):
        """Download the file."""
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***ERROR :invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):
        """Collect the links contained in the page."""
        #print 'Get Html Links from file:%s' % self.file
        #self.parser=HTMLParser(AbstractFormatter(DumbWriter(StringIO)))
        self.parser = HTMLParser(NullFormatter())
        #self.parser.feed(open(self.file).read())
        try:
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist
        except:
            print self.file + " error !"
            return []
class Retriever(htmllib.HTMLParser):  # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        text = splitext(path)
        if text[1] == '':                       # it's not a file, use the default
            if path[-1] == '/':
                path = path + deffile
            else:
                path = path + '/' + deffile
        print "PATH:%s" % path
        dir = dirname(path)
        if not isdir(dir):                      # create a new archive dir if necessary
            if exists(dir):
                unlink(dir)
            makedirs(dir)
        return path

    def download(self):  # download web pages
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):  # parse HTML
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):  # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                        # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        if sep != '/':                          # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):  # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                        # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):  # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                        # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        if sep != '/':                          # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
def create_plaintext_message(self, text):
    """Create a plain-text message by parsing the html
    and attaching links as endnotes.

    Modified from EasyNewsletter/content/ENLIssue.py
    """
    # This reflows text, which we don't want, but it creates
    # parser.anchorlist, which we do want.
    textout = StringIO.StringIO()
    formtext = formatter.AbstractFormatter(formatter.DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(text)
    parser.close()
    # Append the anchorlist at the bottom of the message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + '----' + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        anchorlist += "[%d] %s\n" % (counter, item)
    # This reflows text:
    # text = textout.getvalue() + anchorlist
    # This just strips tags, no reflow:
    text = html.fromstring(text).text_content()
    text += anchorlist
    del textout, formtext, parser, anchorlist
    return text
def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(formatter.AbstractFormatter(
        formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def get_urls(url):
    data = urllib.urlopen(url).read()
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    url_list = parser.anchorlist
    return url_list
def html2text(html):
    f = StringIO()
    parser = HTMLParser(AbstractFormatter(DumbWriter(f)))
    try:
        parser.feed(html)
    except HTMLParseError:
        return ''
    else:
        parser.close()
    return f.getvalue()
def parse_links(self):
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def parse_links(self):
    """Fetch all links from the page."""
    f = open(self.save_file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(formatter.AbstractFormatter(
        formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
class Retriever(object):

    def __init__(self, url):
        self.url = url

    # parse HTML, save links
    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(urlopen(self.url).read())
        self.parser.close()
        return self.parser.anchorlist
def parse_link(self):
    'Parse out the links'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def get_text_from_html(html_input):
    "Strip tags and non-ascii characters from HTML input."
    my_stringio = StringIO.StringIO()   # make an instance of this file-like string thing
    p = HTMLParser(AbstractFormatter(DumbWriter(my_stringio)))
    try:
        p.feed(html_input)
        p.close()   # calling close is not usually needed, but let's play it safe
    except HTMLParseError:
        print '***HTML malformed***'    # the html is badly malformed (or you found a bug)
    #return my_stringio.getvalue().replace('\xa0','')
    s = re.sub(r'[^\x00-\x7f]', r' ', my_stringio.getvalue())
    s = s.replace('\r\n', ' ').replace('\n', ' ')
    s = re.sub(' +', ' ', s)
    return s
def parse_links(self):
    f = open(self.file, "r")
    data = f.read()
    f.close()
    parser = HTMLParser(formatter.AbstractFormatter(
        formatter.DumbWriter(cStringIO.StringIO())))
    # parser = MyHTMLParser()
    parser.feed(data)
    parser.close()
    # anchorlist is not documented in the module: it returns every anchor (href)
    # found on the page. The attribute was deprecated in 2.6; to keep using it,
    # subclass HTMLParser in a custom parser yourself (see evernote, or the sketch
    # below), or use a third-party library such as BeautifulSoup.
    return parser.anchorlist
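# A minimal sketch of the subclassing approach mentioned in the comment above: collect
# hrefs with html.parser instead of relying on the deprecated anchorlist attribute.
# AnchorListParser and parse_links_from_file are illustrative names, not part of the
# original snippet. (On Python 2 the same class body works with
# `from HTMLParser import HTMLParser`.)
from html.parser import HTMLParser

class AnchorListParser(HTMLParser):
    """Record the href value of every <a> tag that is fed in."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

def parse_links_from_file(filename):
    # Read a previously downloaded page and return its links.
    parser = AnchorListParser()
    with open(filename) as f:
        parser.feed(f.read())
    parser.close()
    return parser.anchorlist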
def parse(self, url):
    self.base = ""
    self.href = ""
    m = re.compile(".*/").match(url)
    if m != None:
        self.base = m.string[m.start(0):m.end(0)]
    result = urlfetch.fetch(url, headers={'Cache-Control': 'max-age=30',
                                          'Pragma': 'no-cache'})
    if result.status_code == 200:
        logging.debug(str(result.status_code) + " OK " + url)
        HTMLParser.feed(self, result.content)
        HTMLParser.close(self)
    else:
        logging.error(str(result.status_code) + " NG " + url)
class Retriever(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        # Parse the URL (default scheme http) into its components
        # (scheme, netloc, path, params, query, fragment).
        parsedurl = urlparse(url, 'http:', 0)
        # Join the host (netloc) and the path to form the storage path.
        path = parsedurl[1] + parsedurl[2]
        # splitext splits the path into (root, extension), e.g. ('/path/to/file', '.txt').
        ext = splitext(path)
        # If there is no extension, append the default name index.htm.
        if ext[1] == '':
            # If the path already ends with '/', append index.htm directly;
            # otherwise add the '/' first.
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        # Take the directory part of the path (e.g. www.shellbye.com\blog) and combine it
        # with the local directory (e.g. 'D:\\www.shellbye.com\\blog').
        ldir = dirname(abspath(path))
        # On systems whose directory separator is not '/' (e.g. Windows), replace '/' with
        # the local separator; Unix-like systems already match the URI separator.
        if sep != '/':  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        # Create ldir if it does not exist.
        if not isdir(ldir):  # create archive dir if nec.
            # If ldir exists but is not a directory, delete it (unlink == remove).
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            # Download self.url into self.file.
            retval = urllib.urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):
        # Build a basic HTML parser (this one line could fill a separate article).
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Parse the HTML file and collect all links (anything with an href).
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):  # download web page

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)   # local filename / directory

    def filename(self, url, deffile="index.htm"):
        parseurl = urlparse(url, 'http:', 0)
        path = parseurl[1] + parseurl[2]
        # splitext splits the path into a 2-tuple: the second element is empty for a
        # directory and holds the extension for a file, e.g.
        # splitext("D:/pycharmProjects/PythonWebApp/weblearning/Crawl.py")
        # -> ('D:/pycharmProjects/PythonWebApp/weblearning/Crawl', '.py')
        ext = splitext(path)
        if ext[1] == '':   # no file, use default
            # tuple[-index] is the index-th element from the end
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        # dirname returns the directory part of path, e.g.
        # path = D:/pycharmProjects/PythonWebApp/webLearning
        # dirname(path) = D:/pycharmProjects/PythonWebApp
        ldir = dirname(path)   # local directory
        if sep != '/':         # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):    # create archive dir if nec.
            # if a file with that name already exists, delete it
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        # urlretrieve() returns a 2-tuple (filename, mime_hdrs)
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
            print 'error, invalid url'
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'
    """
    from htmllib import HTMLParser  # import here to avoid high startup cost

    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
class Retriever:
    '''responsibilities: download, parse and queue'''

    def __init__(self, url):
        '''Constructor. Instantiates the Retriever object and stores
        the url and filename as local attributes.
        '''
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        '''input: url
        Removes the http prefix. index.html is the default file name for
        storing the url; this can be overridden by passing arguments to filename().
        '''
        parsedurl = urlparse(url, "http:", 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        text = splitext(path)
        if text[1] == '':                       # no file, use default
            if path[-1] == '/':
                path = path + deffile
            else:
                path = path + '/' + deffile
        dir = dirname(path)
        if not isdir(dir):                      # create a new directory if necessary
            if exists(dir):
                unlink(dir)
            makedirs(dir)
        return path

    def download(self):  # download web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***ERROR invalid url "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):  # parse HTML and get links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # try:
        self.parser.feed(open(self.file).read())
        # except HTMLParseError:
        #     self.parser.close()
        self.parser.close()
        return self.parser.anchorlist
class Retriever():  # download web pages

    def __init__(self, url):
        self.url = url

    def download(self):  # download web page
        print 'try to open url:', self.url, '\nthe true url process', string.split(
            self.url, '?')[0]
        try:
            retval = urlopen(string.split(self.url, '?')[0], None, 200)
        except urllib2.HTTPError as e:
            print "HTTPError", e
            return
        except socket.timeout as e:
            print "socket.timeout", e
            return
        except socket.error as e:
            print "socket.error", e
            return
        except urllib2.URLError as e:
            print "URLError: ", e
            return
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        r = self.download()
        if r:
            print '________'
            try:
                try:
                    s = r.read(50000)
                except socket.error as e:
                    print "***************************socket error***************************", e
                    return []
                self.parser.feed(s)
                print '------------------'
                r.close()
                print '***************************'
            except HTMLParseError:
                print 'get links error\n'
                return []
        self.parser.close()
        return self.parser.anchorlist
class LinkFinder(object):

    def __init__(self, base_url, page_url):
        self.base_url = base_url
        self.page_url = page_url

    def parseAndGetLinks(self, html_string):
        try:
            self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(html_string)
            self.parser.close()
            links = []
            for eachLink in self.parser.anchorlist:
                if eachLink[:4] != "http" and find(eachLink, "://") == -1:
                    eachLink = urljoin(self.base_url, eachLink)
                links.append(eachLink)
            return links
        except IOError:
            return []
def compactor(dev_filename, rel_filename):
    # Use compactor to generate the release version.
    echo('Compacting: %s -> %s' % (dev_filename, rel_filename))
    source_data = open(dev_filename, 'r').read()
    try:
        # Verify that the html file is correct
        htmlparser = HTMLParser(NullFormatter())
        htmlparser.feed(source_data)
        htmlparser.close()
        # Now try to minify
        output_file = open(rel_filename, 'wb')
        compactor = HTMLMinifier(output_file.write, True)
        compactor.feed(source_data)
        compactor.close()
        output_file.close()
    except HTMLParseError as e:
        error(str(e))
        exit(1)
def collectURLSFromPage(page):
    """This returns a list of URLS that come from a certain page.
    Useful for spiders. It takes just a string as an argument.
    """
    resultList = []
    if page == "":
        # nothing to parse, so nothing to return
        return resultList

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)

    # This needs to be documented somehow, but I have no idea what it does
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()
    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
    return resultList
class Retriever(object):  # download Web pages
    """docstring for Retriever"""

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        print '====PARSEDURL====', parsedurl
        if parsedurl[2] == '':
            path = parsedurl[1] + '/'
        else:
            path = parsedurl[1] + parsedurl[2]
        print '------PATH-----', path
        ext = splitext(path)
        print '-----EXT----', ext
        if ext[1] == '':                        # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        print '+++++++++++++++++', ldir
        if sep != '/':                          # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        # self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO)))
        self.parser = HTMLParser(AbstractFormatter(DumbWriter()))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):  # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        print path
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        if sep != '/':                          # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            print ldir, "aaaaaaaaa"
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        print self.parser
        return self.parser.anchorlist
class Retriever(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        # only static html pages can be crawled
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        #print path
        # build the local directory
        ldir = dirname(path)
        #print ldir
        # convert the '/' in the url to the Windows separator
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***error in url "%s"' % self.url)
        return retval

    def parseAndGetLink(self):
        # build a parser
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):
    'download web pages'

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        print path
        ext = splitext(path)                # returns (filename, extension)
        if ext[1] == '':                    # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                # local directory
        if sep != '/':                      # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                 # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        'download web page'
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: Invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        'parse HTML, save links'
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Retriever(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)
        print self.file

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)            # split off the file extension
        print path, ext
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        print path
        ldir = dirname(path)
        # the separator is OS-dependent; on Windows sep is '\'
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        print ldir
        if not isdir(ldir):
            if exists(ldir):
                return
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** Error URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        """StringIO reads the data from memory,
        DumbWriter turns the event stream into plain text,
        and AbstractFormatter does the formatting.
        """
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
def create_plaintext_message(self, text):
    """Create a plain-text-message by parsing the html
    and attaching links as endnotes.
    """
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(text)
    parser.close()
    # Append the anchorlist at the bottom of the message
    # to keep the message readable.
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for counter, item in enumerate(parser.anchorlist):
        anchorlist += "[%d] %s\n" % (counter, item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def _create_plaintext_message(self, text):
    """Create a plain-text-message by parsing the html
    and attaching links as endnotes.
    """
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(text)
    parser.close()
    # Append the anchorlist at the bottom of the message
    # to keep the message readable.
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for counter, item in enumerate(parser.anchorlist):
        anchorlist += "[{0:d}] {1:s}\n".format(counter, item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
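# Several create_plaintext_message variants above produce a text rendering of the HTML
# with the anchors appended as numbered endnotes. A hedged sketch of the same idea on
# Python 3, where formatter/DumbWriter no longer exist; TextAndLinks and
# plaintext_with_endnotes are illustrative names, not from the original code.
from html.parser import HTMLParser

class TextAndLinks(HTMLParser):
    """Accumulate character data plus the href of every anchor."""
    def __init__(self):
        HTMLParser.__init__(self, convert_charrefs=True)
        self.parts = []
        self.anchorlist = []

    def handle_data(self, data):
        self.parts.append(data)

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = dict(attrs).get('href')
            if href:
                self.anchorlist.append(href)

def plaintext_with_endnotes(html_text, maxcols=72):
    parser = TextAndLinks()
    parser.feed(html_text)
    parser.close()
    endnotes = "\n\n" + ("-" * maxcols) + "\n\n"
    for counter, item in enumerate(parser.anchorlist, start=1):
        endnotes += "[%d] %s\n" % (counter, item)
    return ''.join(parser.parts) + endnotes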
def collectURLSFromPage(page):
    resultList = []

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)

    # The whole "AbstractFormatter()" line is a bunch of crap I copied.
    # That needs to be documented somehow, but I have no idea what it does.
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()
    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
    return resultList
class Retriever(object):  # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url):
        path = url
        path = re.sub("\W", "_", path)
        path += ".html"
        return path

    def isForbidden(self):
        return 0

    def download(self):
        try:
            if True:
                retval = urlretrieve(self.url, self.file)
                javaGroupContent = JavaGroupContent.JavaGroupContent()
                javaGroupContent.meet_page(self.url, self.file)
            else:
                retval = '*** INFO: no need to download '
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        try:
            self.parser.feed(open(self.file).read())
            self.parser.close()
        except IOError:
            pass
        return self.parser.anchorlist
class Retriever(object):
    """docstring for Retriever"""

    def __init__(self, url):
        # Constructor; self refers to the current instance of the class. Store the URL
        # string and the corresponding file name returned by filename() as local attributes.
        #super(Retriever, self).__init_url
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        # urlparse(url, default scheme, allow_fragments) splits the URL into six parts:
        # (prot_sch, net_loc, path, params, query, frag).
        parsedurl = urlparse(url, 'http', 0)
        # e.g. http://csdn.net/name/articials/details/44444.html -> net_loc + path
        path = parsedurl[1] + parsedurl[2]
        # split the path into the file name and the extension
        ext = splitext(path)
        if ext[1] == '':            # no file, use the default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile   # page path
        ldir = dirname(path)        # the directory part of the path string
        print ldir
        if sep != '/':              # on Windows sep is '\'
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)          # create the directory
        return path

    def download(self):  # download the page
        try:
            cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
            self.opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
            urllib2.install_opener(self.opener)
            user_agent = [
                'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
                'Opera/9.25 (Windows NT 5.1; U; en)',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
                'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
                'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
                "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
                "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
            ]
            agent = random.choice(user_agent)
            self.opener.addheaders = [("User-agent", agent), ("Accept", "*/*"),
                                      ('Referer', 'http:www.google.com')]
            urll = self.opener.open(self.url)
            html = urll.read()
            output = open(self.file, 'w')
            output.write(html)
            output.close()
            retval = self.url
            #print retval
            #retval = urlretrieve(self.url, self.file)
            return retval
        except IOError:
            retval = ('*** error: invalid URL "%s"' % self.url,)
            return retval

    def parseAndGetLinks(self):  # parse the page and collect its URLs
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist   # list of anchor links
class Retriever(object):
    '''Retrieve and parse each downloaded web page.'''

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        # urlparse returns a 6-item tuple: scheme, netloc, path, params, query, fragment
        parsedurl = urlparse(url)
        path = parsedurl.netloc + parsedurl.path
        # splitext splits the path into (root, extension), e.g. 'a.png' -> ('a', '.png')
        ext = splitext(path)
        if ext[1] == '':            # no file, use default
            # e.g. www.baidu.com --> www.baidu.com/index.html
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)        # dirname strips the file name and returns the directory
        if sep != '/':              # os.sep is the path separator of the host system
            ldir = replace(ldir, '/', sep)
        if isdir(ldir):             # isdir tells whether the path is a directory
            if exists(ldir):
                #unlink(ldir)  (unlink removes a file and errors on a directory)
                shutil.rmtree(ldir)
                print 'directory [%s] already exists, removing it....' % (ldir)
        print 'creating directory --> [%s]' % ldir
        makedirs(ldir)              # create the directory
        return path

    def download(self):  # download the page
        try:
            retval = urlretrieve(self.url, self.file, callbackInfo)
            '''
            url: the download link
            filename: the local save path (if omitted, urllib saves to a temporary file)
            reporthook: a callback fired when the connection is made and after each data
                        block is transferred; useful for showing download progress
            data: data to POST to the server
            The call returns a 2-tuple (filename, headers): the local path and the
            server's response headers.
            '''
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % (self.url))
        return retval

    def parseAndGetLinks(self):
        '''Parse the HTML page, collect its links, and save them.'''
        # StringIO reads data from memory; DumbWriter turns the event stream into plain
        # text; AbstractFormatter does the formatting.
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())   # open self.file and feed it in one go
        self.parser.close()
        print 'self.parser.anchorlist --> ', self.parser.anchorlist
        return self.parser.anchorlist   # anchorlist records the href values
class Retriever(object):  # download web page

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        parsedurl = urlparse(url, 'http', 0)    # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                        # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        if sep != '/':                          # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):  # manage entire crawling process

    count = 0   # static download page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':    # error situation, do not parse
            print retval, '...skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachlink in links:
            if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                eachlink = urljoin(url, eachlink)
            print '* ', eachlink
            if find(lower(eachlink), 'mailto') != -1:
                print '...discarded, mailto link'
                continue
            if eachlink not in self.seen:
                if find(eachlink, self.dom) == -1:
                    print '...discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '...new, added to Q'
                    else:
                        print '...discarded, already in Q'
            else:
                print '...discarded, already processed'

    def go(self):   # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()