コード例 #1
0
ファイル: ybdu_com.py プロジェクト: darknightghost/ncap
	def analyse_chapter(self,index):
		'''
		Analyse a downloaded chapter page (self.page).
		  index	- 1-based chapter number, used to build the heading.
		Returns the chapter as an UTF-8 HTML fragment, or None when the
		expected markers are missing so the caller can redownload the page.
		'''
		page = self.page

		#Get title
		title_exp = re.compile("<h1>.+?</h1>",re.I|re.S)
		m = title_exp.search(page)
		if m == None:
			return None
		#Strip the surrounding "<h1>" / "</h1>" tags.
		title = page[m.start() + 4 : m.end() - 5]
		ret = title.decode('gbk','ignore').encode('utf-8')
		out.printstr("Chapter title : " + ret + "\n")
		ret = "第%i章 "%(index) + ret
		ret = ret + "<br/>"
		ret = ret.replace(" ","&nbsp;")
		page = page[m.end() :]

		#Get chapter body: the text between the content div and the ad div.
		chapter_exp = re.compile("<div id=\"htmlContent\" class=\"contentbox\">",re.I|re.S)
		m = chapter_exp.search(page)
		if m == None:
			#Content marker missing - treat as a bad download instead of
			#crashing on m.end() below.
			return None
		page = page[m.end() :]
		chapter_exp = re.compile("<div class=\"ad00\"><script>show_style()",re.I|re.S)
		m = chapter_exp.search(page)
		if m == None:
			#End marker missing - bad download as well.
			return None
		page = page[0 : m.start()]
		ret = ret + page.decode('gbk','ignore').encode('utf-8')
		ret = ret + "<br/>"

		out.printstr("Decoding...")
		ret = ret.replace("&nbsp;&nbsp;&nbsp;&nbsp;","")
		ret = html_translate.translate(ret)

		return ret
コード例 #2
0
ファイル: ybdu_com.py プロジェクト: darknightghost/ncap
	def analyse_index(self):
		'''
		Analyse the downloaded index page (self.page).
		Appends every chapter link found to self.chapters and returns the
		novel title (when present) as an UTF-8 HTML fragment, translated
		by html_translate.
		'''
		ret = ""

		out.printstr("Analysing index...\n")

		#Collect every chapter link on the page.
		url_exp = re.compile("<li><a href=\"\\S+?\">",re.I|re.S)
		for link_match in url_exp.finditer(self.page):
			#Strip the '<li><a href="' prefix and the '">' suffix.
			self.chapters.append(link_match.group(0)[13 : -2])

		#Look for the novel title.
		title_exp = re.compile("articlename='\\S+?'",re.I|re.S)
		title_match = title_exp.search(self.page)
		if title_match != None:
			#Drop "articlename='" and the closing quote.
			ret = title_match.group(0)[13 : -1]
			ret = ret.decode('gbk','ignore').encode('utf-8')
			out.printstr("Novel title : " + ret + "\n")
			ret = ret + "<br/>"

		return html_translate.translate(ret)
コード例 #3
0
	def download(self):
		'''
		Private function
		  Download thread.
		'''
		out.printstr("Download thread started.\n")
		while True:
			url = self.get_url()
			if url == None:
				break

			#Download url
			while True:
				request = urllib2.Request(url[0])
				request.add_header('User-Agent', self.agent)
				try:
					try:
						out.printstr("\nGetting %s"%(url[0]))
						response = urllib2.urlopen(request,timeout = self.timeout)
					except urllib2.URLError,e:
						out.printerr(e)
						out.printstr("Download failed retrying...\n")
						continue
					data = response.read()
					if len(data) == 0:
						out.printstr("Download failed retrying...\n")
						continue
				except Exception:
					out.printstr("Download failed retrying...\n")
					continue
				break
コード例 #4
0
ファイル: tieba.py プロジェクト: darknightghost/ncap
	def	get_data(self,index):
		'''
		Extract post text from the downloaded tieba page (self.page).
		Collects the text inside each <cc>...</cc> pair, skipping one
		leading <div ...> inside the pair and pruning </div>, <a ...> and
		</a> tags from the accumulated result, then runs it through
		html_translate.
		  index	- unused here; presumably kept for a uniform analyser
			  interface across sites - TODO confirm against callers.
		Returns the decoded text.
		'''
		ret = ""
		next = self.page
		cc = re.compile("<cc>",re.I|re.S)
		cc_end = re.compile("</cc>",re.I|re.S)
		div = re.compile("<div.*?>",re.I|re.S)
		div_end = re.compile("</div>",re.I|re.S)
		a = re.compile("<a.*?>",re.I|re.S)
		a_end = re.compile("</a>",re.I|re.S)
		
		#Walk the page one <cc> block at a time.
		while True:
			#<cc> - start of a post body; stop when none remain.
			start = cc.search(next)
			if start == None:
				break
			#NOTE(review): "+ 1" also drops the character right after
			#<cc> - presumably a newline; confirm against real pages.
			next = next[start.end() + 1 :]
			#Skip one leading <div ...> inside the block, if present.
			start = div.search(next)
			if start != None:
				next = next[start.end() + 1 :]

			#</cc> - end of the post body.
			end = cc_end.search(next)
			if end == None:
				#Unterminated block: keep the remainder and stop.
				ret = ret + next
				break
			ret = ret + "<br>"
			#NOTE(review): "- 1" drops the character just before </cc>
			#as well - looks intentional (trailing newline?) but verify.
			ret = ret + next[0 : end.start() - 1]
			next = next[end.end() + 1:]

			#Truncate the accumulated text at the first </div>, if any.
			end = div_end.search(ret)
			if end != None:
				ret = ret[0 : end.start()]
				
			#Strip an <a ...> tag from the accumulated text.  Note that
			#str.replace removes EVERY occurrence of the matched tag text,
			#not just the one found here.
			place = a.search(ret)
			if place != None:
				ret = ret.replace(ret[place.start() : place.end()],"")
			
			#Same for </a> closing tags.
			place = a_end.search(ret)
			if place != None:
				ret = ret.replace(ret[place.start() : place.end()],"")
		out.printstr("Decoding...")
		ret = html_translate.translate(ret)
		return ret
コード例 #5
0
ファイル: spider.py プロジェクト: darknightghost/ncap
	def analyse(self):
		'''
		Private function
		  Thread function.It will be called in __init__().Don't call it manually.
		'''
		exec "from analyser.%s import *"%(self.analyser)

		#Get first page
		request = urllib2.Request(self.url)
		request.add_header('User-Agent', self.agent)
		try:
			out.printstr("\nGetting %s"%(self.url))
			response = urllib2.urlopen(request,timeout=5)
		except Exception,e:
			out.printerr(e)
			self.end = True
			self.buff_lock.acquire()
			self.buff_lock.notifyAll()
			self.buff_lock.release()
			exit(-1)
コード例 #6
0
ファイル: spider.py プロジェクト: darknightghost/ncap
					downloader.add_url(url)

			#Analyse pages
			i = 0
			while True:
				gc.collect()
				page = downloader.get_data()
				if page == None:
					break
				i = i + 1
				analy.analyse_page(page)
				data = analy.get_data(i)
				if data == None:
					i = i - 1
					downloader.redownload()
					out.printstr("Analyzation failed redownload the page.\n")
				else:
					downloader.pop()
					self.write_buf(data)
		else:
			#The analyser does not support multi-thread download
			#Get args
			try:
				timeout = int(self.args["t"])
			except KeyError:
				timeout = 5

			#Initialize downloader
			downloader = spider_downloader(1,self.agent,timeout,1)

			#Analyse pages
コード例 #7
0
ファイル: ncap.py プロジェクト: darknightghost/ncap
def usage():
    '''
    Print the command-line usage message.
    '''
    messages = (
        "Usage:",
        "\tncap.py -u url -a analyser -o output [-t timeout] [-h thread-num] [-b max-buffered-pages] [-g agent] [analyser_parameters]",
    )
    for message in messages:
        out.printstr(message)
コード例 #8
0
ファイル: ncap.py プロジェクト: darknightghost/ncap
# Parse the command line into an option dictionary.
args = arg_scanner(sys.argv)

# -a, -u and -o are mandatory; print usage and bail out on any missing one.
try:
    analyser = args["a"]
    url = args["u"]
    output = args["o"]
except KeyError:
    usage()
    exit(-1)

# -g (user agent) is optional.
# NOTE(review): when "g" is absent, `agent` is never assigned in this span -
# a default must exist earlier in the file, otherwise the spider(...) call
# below raises NameError; confirm.
try:
    agent = args["g"]
except KeyError:
    pass

out.printstr("url = %s\nanalyser = %s\noutput = %s" % (url, analyser, output))

# Re-encode the output path from UTF-8 to the filesystem encoding
# (Python 2 byte-string dance).
# NOTE(review): `type`, `file` and `len` below shadow builtins; left as-is
# because later (unseen) code may use these names as variables.
type = sys.getfilesystemencoding()
# Call spider
try:
    file = os.open(output.decode("utf-8").encode(type), os.O_CREAT | os.O_RDWR)
except Exception:
    out.printstr("Cannot open output file!")
    exit(-1)

s = spider(url, analyser, args, agent)

# Counter initialised before the download loop (shadows the len() builtin);
# its use continues past this chunk.
len = 0
while True:
    data = s.get_data()
コード例 #9
0
			out.printstr("%i bytes downloaded.\n"%(len(data)))

			#Add downloaded data
			
			url[1] = data
			url[2] = self.free

			self.list_cond.acquire()
			self.list_cond.notifyAll()
			self.list_cond.release()
		self.thread_num_lock.acquire()
		self.active_thread = self.active_thread - 1
		if self.thread_num > 1:
			self.thread_num = self.thread_num - 1
		self.thread_num_lock.release()
		out.printstr("Download thread exited.\n")
		return

	def get_url(self):
		'''
		Private function
		  Get next url to download.
		'''
		self.list_cond.acquire()
		i = self.last_url
		while i < len(self.url_list):
			if self.url_list[i][1] == None and self.url_list[i][2] == self.free:
				self.url_list[i][2] = self.downloading
				self.last_url = i + 1
				ret = self.url_list[i]
				self.list_cond.release()