def crawling(self):
    """Download ``self.url`` with pycurl and stream the body into ``self.filname``.

    Side effects: creates/overwrites the file named by ``self.filname``
    (attribute name as spelled in this class — presumably a typo for
    ``filename``; TODO confirm against the rest of the class).
    """
    curl = Curl()
    curl.set_url(self.url)
    try:
        # Stream directly to disk instead of buffering the body in memory.
        with open(self.filname, "wb") as output:
            curl.set_option(pycurl.WRITEFUNCTION, output.write)
            curl.get()
    finally:
        # Fix: the original skipped curl.close() when open()/get() raised,
        # leaking the libcurl handle.
        curl.close()
def crawling(self):
    """Fetch the resource at ``self.url`` and write its bytes to ``self.filname``."""
    downloader = Curl()
    downloader.set_url(self.url)
    # Hand pycurl the file's write method so the response is spooled to disk.
    with open(self.filname, "wb") as sink:
        downloader.set_option(pycurl.WRITEFUNCTION, sink.write)
        downloader.get()
    downloader.close()
def saveFile2Local(self, url):
    """Derive a local filename from *url* and download the resource into it.

    ``self.getFileNameByUrl(url)`` is called for its side effect of setting
    ``self.filename``; when no filename could be derived, nothing is fetched.
    """
    self.getFileNameByUrl(url)
    if not self.filename:
        # Guard clause: no usable local name, skip the download entirely.
        return
    curl = Curl()
    try:
        with open(self.filename, "wb") as output:
            curl.set_url(url)
            curl.set_option(pycurl.WRITEFUNCTION, output.write)
            curl.get()
    finally:
        # Fix: release the libcurl handle even when open()/get() raises.
        curl.close()
    Log4Spider.downLog(self, "downloaded a file:[[[", self.filename, "]]]")
if self.infos == 5 and 'beliked' not in self.info.keys(): self.info['beliked'] = int(data) def handle_endtag(self, tag): if tag == "h3": self.h3 = 0 if self.clearfix and tag == "ul": self.clearfix = 0 if hasattr(self, "infoHook"): self.infoHook(self.info) def handle_startendtag(self, tag, attrs): pass @property def urlList(self): return self.current_urlList() if __name__ == "__main__": parser = JianShuUserInfo_HtmlParser() from curl import Curl import pycurl c = Curl() c.set_url("http://www.jianshu.com/users/d9edcb44e2f2/latest_articles") data = c.get() #parser.setParseFile("parse.txt") parser.setInfoHook(lambda info: print(str(info))) parser.feed(data.decode("utf-8")) parser.close() c.close()
__author__ = 'zhangxa'

from curl import Curl
import pycurl
from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

start_url = "http://www.pcgames.com.cn/"

# Fetch the start page once at import time (script-style module).
c = Curl()
c.set_url(start_url)
data = c.get()
info = c.info()
#print(info)


def get_charset(c_type):
    """Extract the charset token from an HTTP Content-Type header value.

    Returns the charset string when one is present, 'UTF-8' when parsing
    blows up, and None when the header names no charset.
    """
    charset = None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except Exception:
        # Fix: narrowed the bare except; fall back to a sane default on
        # malformed input. (This chunk was truncated mid-handler; the
        # fallback value is taken from the later copy of this file.)
        return 'UTF-8'
    # Fix: the visible original never returned the computed value on the
    # success path, so callers always got None.
    return charset
__author__ = 'zhangxa'

from curl import Curl
import pycurl
from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

start_url = "http://www.pcgames.com.cn/"

# Fetch the start page once at import time (script-style module).
c = Curl()
c.set_url(start_url)
data = c.get()
info = c.info()
#print(info)


def get_charset(c_type):
    """Extract the charset token from an HTTP Content-Type header value.

    Returns the charset string when one is present, 'UTF-8' when parsing
    blows up, and None when the header names no charset.
    """
    charset = None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except Exception:
        # Fix: narrowed the bare except (it also swallowed KeyboardInterrupt
        # and SystemExit); keep the 'UTF-8' fallback for malformed input.
        return 'UTF-8'
    # Fix: the visible original ended inside the function with no success-path
    # return, so the computed charset was silently discarded.
    return charset