def analyse_chapter(self,index):
    ret = ""
    page = self.page
    #Get title
    title_exp = re.compile("<h1>.+?</h1>",re.I|re.S)
    m = title_exp.search(page)
    if m == None:
        return None
    #Strip the "<h1>" / "</h1>" tags around the match
    title = page[m.start() + 4 : m.end() - 5]
    ret = title.decode('gbk','ignore').encode('utf-8')
    out.printstr("Chapter title : " + ret + "\n")
    ret = "第%i章 "%(index) + ret
    ret = ret + "<br/>"
    #Normalize full-width (U+3000) spaces in the title to ASCII spaces
    ret = ret.replace("　"," ")
    page = page[m.end() :]
    #Get chapter
    chapter_exp = re.compile("<div id=\"htmlContent\" class=\"contentbox\">",re.I|re.S)
    m = chapter_exp.search(page)
    if m == None:
        return None
    page = page[m.end() :]
    chapter_exp = re.compile("<div class=\"ad00\"><script>show_style\\(\\)",re.I|re.S)
    m = chapter_exp.search(page)
    if m == None:
        return None
    page = page[0 : m.start()]
    ret = ret + page.decode('gbk','ignore').encode('utf-8')
    ret = ret + "<br/>"
    out.printstr("Decoding...")
    #Drop the full-width (U+3000) spaces used for paragraph indentation
    ret = ret.replace("　","")
    ret = html_translate.translate(ret)
    return ret
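#A minimal sketch of the <h1> slicing used above, on made-up ASCII markup
#(the real pages are GBK-encoded; this sample is illustrative only):
def _demo_title_slice():
    import re
    sample = "<html><h1>Example Title</h1><p>body</p></html>"
    m = re.compile("<h1>.+?</h1>", re.I | re.S).search(sample)
    #len("<h1>") == 4 and len("</h1>") == 5, hence the offsets
    return sample[m.start() + 4 : m.end() - 5]    #-> "Example Title"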
def analyse_index(self):
    ret = ""
    out.printstr("Analysing index...\n")
    #Get links
    url_exp = re.compile("<li><a href=\"\\S+?\">",re.I|re.S)
    page = self.page
    while True:
        m = url_exp.search(page)
        if m == None:
            break
        link = page[m.start() : m.end()]
        page = page[m.end() :]
        #Strip the leading '<li><a href="' (13 chars) and the trailing '">'
        link = link[13 : -2]
        self.chapters.append(link)
    #Get title
    title_exp = re.compile("articlename='\\S+?'",re.I|re.S)
    page = self.page
    m = title_exp.search(page)
    if m != None:
        #Strip the leading "articlename='" (13 chars) and the trailing quote
        ret = page[m.start() + 13 : m.end() - 1]
        ret = ret.decode('gbk','ignore').encode('utf-8')
        out.printstr("Novel title : " + ret + "\n")
        ret = ret + "<br/>"
    return html_translate.translate(ret)
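#A minimal sketch of the link slicing in analyse_index(), on a made-up
#index entry (illustrative only): len('<li><a href="') == 13 and each
#match ends with '">', hence the [13 : -2] slice.
def _demo_link_slice():
    import re
    entry = '<li><a href="/book/1234.html">'
    m = re.compile("<li><a href=\"\\S+?\">", re.I | re.S).search(entry)
    return entry[m.start() : m.end()][13 : -2]    #-> "/book/1234.html"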
def download(self):
    '''
        Private function
        Download thread.
    '''
    out.printstr("Download thread started.\n")
    while True:
        url = self.get_url()
        if url == None:
            break
        #Download url, retrying until a non-empty response arrives
        while True:
            request = urllib2.Request(url[0])
            request.add_header('User-Agent', self.agent)
            try:
                try:
                    out.printstr("\nGetting %s"%(url[0]))
                    response = urllib2.urlopen(request,timeout = self.timeout)
                except urllib2.URLError,e:
                    out.printerr(e)
                    out.printstr("Download failed, retrying...\n")
                    continue
                data = response.read()
                if len(data) == 0:
                    out.printstr("Download failed, retrying...\n")
                    continue
            except Exception:
                out.printstr("Download failed, retrying...\n")
                continue
            break
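#The retry pattern above, in isolation.  A hedged sketch with a capped
#retry count (the thread above retries forever) and hypothetical arguments:
def _demo_fetch_with_retry(url, agent, timeout, retries = 3):
    import urllib2
    for attempt in range(retries):
        request = urllib2.Request(url)
        request.add_header('User-Agent', agent)
        try:
            data = urllib2.urlopen(request, timeout = timeout).read()
        except urllib2.URLError:
            continue
        if len(data) > 0:
            return data
    return None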
def get_data(self,index):
    ret = ""
    next = self.page
    cc = re.compile("<cc>",re.I|re.S)
    cc_end = re.compile("</cc>",re.I|re.S)
    div = re.compile("<div.*?>",re.I|re.S)
    div_end = re.compile("</div>",re.I|re.S)
    a = re.compile("<a.*?>",re.I|re.S)
    a_end = re.compile("</a>",re.I|re.S)
    #Get data: collect the text between each <cc>...</cc> pair
    while True:
        #cc
        start = cc.search(next)
        if start == None:
            break
        next = next[start.end() + 1 :]
        #div
        start = div.search(next)
        if start != None:
            next = next[start.end() + 1 :]
        #/cc
        end = cc_end.search(next)
        if end == None:
            ret = ret + next
            break
        ret = ret + "<br>"
        ret = ret + next[0 : end.start() - 1]
        next = next[end.end() + 1 :]
    #/div: cut everything from a stray closing div onwards
    end = div_end.search(ret)
    if end != None:
        ret = ret[0 : end.start()]
    #a: drop stray anchor tags
    place = a.search(ret)
    if place != None:
        ret = ret.replace(ret[place.start() : place.end()],"")
    #/a
    place = a_end.search(ret)
    if place != None:
        ret = ret.replace(ret[place.start() : place.end()],"")
    out.printstr("Decoding...")
    ret = html_translate.translate(ret)
    return ret
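#A minimal sketch of the <cc> slicing used above, on made-up markup
#(illustrative only).  The +1 / -1 offsets skip the newline that the pages
#put after <cc> and before </cc>:
def _demo_cc_slice():
    import re
    sample = "<cc>\nchapter text\n</cc>"
    start = re.compile("<cc>", re.I | re.S).search(sample)
    end = re.compile("</cc>", re.I | re.S).search(sample)
    return sample[start.end() + 1 : end.start() - 1]    #-> "chapter text"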
def analyse(self):
    '''
        Private function
        Thread function. It will be called in __init__(). Don't call it manually.
    '''
    #Pull the requested analyser module in at runtime
    exec "from analyser.%s import *"%(self.analyser)
    #Get first page
    request = urllib2.Request(self.url)
    request.add_header('User-Agent', self.agent)
    try:
        out.printstr("\nGetting %s"%(self.url))
        response = urllib2.urlopen(request,timeout=5)
    except Exception,e:
        out.printerr(e)
        self.end = True
        #Wake up any consumer blocked on the buffer before the thread dies
        self.buff_lock.acquire()
        self.buff_lock.notifyAll()
        self.buff_lock.release()
        exit(-1)
            downloader.add_url(url)
        #Analyse pages
        i = 0
        while True:
            gc.collect()
            page = downloader.get_data()
            if page == None:
                break
            i = i + 1
            analy.analyse_page(page)
            data = analy.get_data(i)
            if data == None:
                #The page could not be parsed; fetch it again
                i = i - 1
                downloader.redownload()
                out.printstr("Analysis failed, redownloading the page.\n")
            else:
                downloader.pop()
                self.write_buf(data)
    else:
        #The analyser does not support multi-thread download
        #Get args
        try:
            timeout = int(self.args["t"])
        except KeyError:
            timeout = 5
        #Initialize downloader
        downloader = spider_downloader(1,self.agent,timeout,1)
        #Analyse pages
def usage():
    out.printstr("Usage:")
    out.printstr(
        "\tncap.py -u url -a analyser -o output [-t timeout] [-h thread-num] [-b max-buffered-pages] [-g agent] [analyser_parameters]"
    )
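#Example invocation (url, analyser name, and values are hypothetical):
#   python ncap.py -u http://example.com/book/index.html -a some_analyser \
#       -o novel.html -t 10 -h 4 -b 16 -g "Mozilla/5.0"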
args = arg_scanner(sys.argv)
try:
    analyser = args["a"]
    url = args["u"]
    output = args["o"]
except KeyError:
    usage()
    exit(-1)
try:
    agent = args["g"]
except KeyError:
    #No -g given; fall back to a default agent string (arbitrary choice)
    #so spider() still receives a usable value
    agent = "Mozilla/5.0"
out.printstr("url = %s\nanalyser = %s\noutput = %s" % (url, analyser, output))
#Renamed from type/file/len to avoid shadowing the builtins
fs_encoding = sys.getfilesystemencoding()
#Call spider
try:
    #Re-encode the output path to the filesystem encoding before opening
    fd = os.open(output.decode("utf-8").encode(fs_encoding), os.O_CREAT | os.O_RDWR)
except Exception:
    out.printstr("Cannot open output file!")
    exit(-1)
s = spider(url, analyser, args, agent)
length = 0
while True:
    data = s.get_data()
out.printstr("%i bytes downloaded.\n"%(len(data))) #Add downloaded data url[1] = data url[2] = self.free self.list_cond.acquire() self.list_cond.notifyAll() self.list_cond.release() self.thread_num_lock.acquire() self.active_thread = self.active_thread - 1 if self.thread_num > 1: self.thread_num = self.thread_num - 1 self.thread_num_lock.release() out.printstr("Download thread exited.\n") return def get_url(self): ''' Private function Get next url to download. ''' self.list_cond.acquire() i = self.last_url while i < len(self.url_list): if self.url_list[i][1] == None and self.url_list[i][2] == self.free: self.url_list[i][2] = self.downloading self.last_url = i + 1 ret = self.url_list[i] self.list_cond.release()