Esempio n. 1
0
	def get_book_html(self, book_abs):
		"""Return the stored HTML for one book, or False when no row comes back.

		Reads ``book_abs["book_id"]``, selects the ``html`` column from the
		``book_abs`` table for that id via ``self.mysql.query`` and returns the
		``html`` value of the first row after passing it through
		``filter_r_and_n``. Any further rows are ignored.
		"""
		# NOTE(review): the id is interpolated directly into the SQL text;
		# confirm callers only pass trusted ids, or switch to a parameterized
		# query if the mysql wrapper supports one.
		book_id = book_abs["book_id"]
		statement = "select html from book_abs where book_id='%s';" % book_id

		result = self.mysql.query(statement, ["html"])

		# Only the first row matters; the sentinel separates "no rows" from any
		# row value the wrapper might yield.
		nothing = object()
		first_row = next(iter(result), nothing)
		if first_row is nothing:
			return False
		return filter_r_and_n(first_row["html"])
Esempio n. 2
0
	def prase_book(self, book, html):
		"""Fill ``book`` with fields scraped from a product page and return it.

		Each field is located with a regular expression. A field whose pattern
		does not match is simply left out, so ``book`` keeps whatever value it
		already had under that key.

		Keys that may be written:
		  * ``name`` - text of the first ``<h1>``.
		  * ``author``, ``press``, ``publictime``, ``price``, ``sell_price``,
		    ``print``, ``language`` - items inside the ``PI_info`` div.
		  * ``isbn``, ``pagecnt`` - digits found in the "詳細資料" box.
		  * ``menu``, ``authordesc``, ``desc``, ``meidum``, ``award`` - the
		    long-text boxes, with their heading text removed.

		Args:
			book: dict-like object that receives the parsed fields; it is
				mutated in place.
			html: page markup as text.

		Returns:
			The same ``book`` object that was passed in.

		Note:
			The patterns are compiled without ``re.DOTALL``, so ``.`` does not
			cross a newline, and every ``(.*?)</div>`` stops at the first
			closing ``</div>``. Presumably the caller has already removed line
			breaks from ``html`` - verify against the caller.
		"""
		def _first_group(pattern, text):
			# Return group 1 of the first match of pattern in text, else None.
			found = re.search(pattern, text)
			if found is None:
				return None
			return found.group(1)

		# Title
		title = _first_group(u'<h1>(.*?)</h1>', html)
		if title is not None:
			book["name"] = filter_r_and_n(filter_tags(title))

		# Summary items: author, publisher, publication date, list price,
		# sale price, binding, language.
		info_fields = (
			("author", u'<h3 class="PI_item">作者(.*?)</h3>'),
			("press", u'<h3 class="PI_item">出版社(.*?)</h3>'),
			("publictime", u'<h3 class="PI_item">出版日期(.*?)</h3>'),
			("price", u'<h3 class="PI_item">定價(.*?)</h3>'),
			("sell_price", u'<h3 class="PI_item">售價(.*?)</h3>'),
			("print", u'class="PI_item">裝訂(.*?)<'),
			("language", u'class="PI_item">商品語言(.*?)<'),
		)
		info_html = _first_group(u'<div class="PI_info">(.*?)</div>', html)
		if info_html is not None:
			for key, pattern in info_fields:
				raw = _first_group(pattern, info_html)
				if raw is not None:
					cleaned = filter_r_and_n(filter_tags(raw))
					# Drop the " / " separator that follows the item label.
					book[key] = cleaned.replace(" / ", "")

		# Detail box: ISBN and page count, searched after the markup has been
		# filtered and tabs removed.
		detail_fields = (
			("isbn", u'ISBN 13 /(\\d+)'),
			("pagecnt", u'頁數/(\\d+)'),
		)
		detail_html = _first_group(u'<div class="C_box"><h2>詳細資料</h2>(.*?)</div>', html)
		if detail_html is not None:
			detail_text = filter_tags(detail_html).replace("\t", "")
			for key, pattern in detail_fields:
				digits = _first_group(pattern, detail_text)
				if digits is not None:
					book[key] = digits

		# Long-text boxes: table of contents, author introduction, description,
		# media recommendations and awards. A stray early ``return`` used to sit
		# after the table of contents and made the last four unreachable.
		# NOTE(review): the "meidum" key spelling is kept as originally written
		# in case other code reads it under that name.
		sections = (
			("menu", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_catelog" class="C_box" style="display:none;">(.*?)</div>', "本書目錄"),
			("authordesc", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_all_character" class="C_box" style="display:none;">(.*?)</div>', "作者介紹"),
			("desc", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_introduction" class="C_box" style="display:block;">(.*?)</div>', "內容簡介"),
			("meidum", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_medium" class="C_box" style="display:none;">(.*?)</div>', "媒體推薦"),
			("award", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_award" class="C_box" style="display:none;">(.*?)</div>', "得獎紀錄"),
		)
		for key, pattern, heading in sections:
			section_html = _first_group(pattern, html)
			if section_html is not None:
				# Remove the box's own heading from the extracted text.
				book[key] = filter_tags(section_html).replace(heading, "")

		return book