def __init__(self): self.snoopy = Snoopy() self.mysql = MySqlUtil(chengpin_db)
class Chengpin(object): def __init__(self): self.snoopy = Snoopy() self.mysql = MySqlUtil(chengpin_db) #设置抓起客户端的编号,为了方便多个进程同时运行 def set_client(self, client_count, client_id): try: self.client_count = int(client_count) self.client_id = int(client_id) except: self.client_count = 1 self.client_id = 1 def init_category_format(self): category = {} category["cate"] = "" category["sub"] = "" category["list"] = "" category["page"] = "1" category["text"] = "" category["status"] = "0" return category #============================================================================== # fetch_categorys : 抓取诚品书店的类别 #============================================================================== def fetch_categorys(self): category_urls = [ "http://www.eslite.com/category.aspx?cate=80", #中文 "http://www.eslite.com/category.aspx?cate=156", #外文 "http://www.eslite.com/category.aspx?cate=44" #儿童 ] categorys = [] for category_url in category_urls: self.snoopy.fetch(category_url) html = self.snoopy.results reg_pattern = re.compile("\r") html = str_repalce(html, reg_pattern, "") reg_pattern = re.compile("\n") html = str_repalce(html, reg_pattern, "") reg_pattern = re.compile(r'<a href="(newbook_list.aspx?.*?)">(.*?)</a>') category_strs = reg_pattern.findall(html) for category_str in category_strs: try: category = self.init_category_format() params = url_decode(category_str[0]) category["cate"] = params["cate"] category["sub"] = params["sub"] category["list"] = params["list"] category["text"] = category_str[1].strip().decode("utf8") categorys.append(category) except: pass return categorys def save_categorys(self, categorys): categorys_str = "" for category in categorys: if categorys_str: categorys_str = "%s , ('%s', '%s', '%s', '%s', '%s', '%s')" % (categorys_str, category["cate"], category["sub"], category["list"], category["page"], category["text"], category["status"]) else: categorys_str = "('%s', '%s', '%s', '%s', '%s', '%s')" % (category["cate"], category["sub"], category["list"], category["page"], category["text"], category["status"]) if categorys_str: sql_str = "INSERT IGNORE category(`cate`, `sub`, `list`, `page`, `text`, `status`) VALUES %s;" % categorys_str print sql_str self.mysql.update(sql_str) def get_category(self): client_count = self.client_count client_id = self.client_id - 1 sql_str = "SELECT `cate`, `sub`, `list`, `page`, `text` FROM `category` WHERE `status`=0 AND id%s%s=%s ORDER BY `id` ASC LIMIT 1;" % ("%", str(client_count), str(client_id)) rows = self.mysql.query(sql_str, ["cate","sub","list","page", "text"]) for row in rows: row["page"] = int(row["page"]) return row return False def update_category_status(self, category, status): sql_str = "update category set `page`='%s', `status`='%s' where `cate`='%s' and `sub`='%s' and `list`='%s';" \ % (category["page"], str(status), category["cate"], category["sub"], category["list"]) print sql_str self.mysql.update(sql_str) #============================================================================== # fetch_book_id : 抓取某个小分类的图书的id #============================================================================== def fetch_books_id(self, category): for page in range(category["page"], 26, 1): category_url = "http://www.eslite.com/newbook_list.aspx?cate=%s&sub=%s&list=%s&page=%s" \ % (category["cate"], category["sub"], category["list"], str(category["page"])) self.snoopy.fetch(category_url) html = self.snoopy.results reg_pattern = re.compile(r'pgid=(\d+)') books_id = reg_pattern.findall(html) books_id = {}.fromkeys(books_id).keys() self.save_books_id(books_id, category) print u"%s %s页:%s" % (category["text"], str(category["page"]), str(len(books_id))) category["page"] = page self.update_category_status(category, 0) if len(books_id) < 10: break self.update_category_status(category, 1) def save_books_id(self, books_id, category): books_str = "" for book_id in books_id: if books_str: books_str = "%s, ('%s', '%s', '%s', '%s', '')" % (books_str, book_id, category["cate"], category["sub"], category["list"]) else: books_str = "('%s', '%s', '%s', '%s', '')" % (book_id, category["cate"], category["sub"], category["list"]) if books_str: sql_str = "INSERT IGNORE book_abs(`book_id`, `cate`, `sub`, `list`, `html`) VALUES %s;" % books_str print sql_str self.mysql.update(sql_str) def get_book_url(self, book_abs): return "" def update_fetch_status_fail(self, name, year): status = 2 sql_str = "UPDATE `publisher` SET `status`=%s WHERE `name`='%s' AND `year`=%s;" % \ (status, self.db.escape_string(name), str(year)) print sql_str self.db.update(sql_str) def get_book_html(self, book_abs): sql_str = "select html from book_abs where book_id='%s';" % (book_abs["book_id"]) rows = self.mysql.query(sql_str, ["html"]) for row in rows: return filter_r_and_n(row["html"]) return False def fetch_book(self, book_abs): book = self.init_book() book["url"] = "http://www.eslite.com/product.aspx?pgid=%s" % book_abs["book_id"] book["book_id"] = book_abs["book_id"] html = self.get_book_html(book_abs) if html: book = self.prase_book(book, html) self.save_book(book) self.update_book_abs_status(book_abs) #============================================================================== # prase_books_abstract : 从html中匹配书简介 #============================================================================== def prase_books_abstract(self, html): books = [] reg_pattern = re.compile(r"makeDetailUrl\(this, '/search/showDocDetails\?', '(.*?)', '(.*?)', '(.*?)'\);") book_matchs = reg_pattern.findall(html) for book_match in book_matchs: book = self.init_book_abstract() book["book_id"] = book_match[0] book["src"] = book_match[1] book["publisher"] = book_match[2] books.append(book) print book["book_id"], book["src"], book["publisher"] return books def prase_book(self, book, html): reg_pattern = re.compile(u'<h1>(.*?)</h1>') match = reg_pattern.search(html) if match: book["name"] = match.group(1) book["name"] = filter_tags(book["name"]) book["name"] = filter_r_and_n(book["name"]) reg_pattern = re.compile(u'<div class="PI_info">(.*?)</div>') match = reg_pattern.search(html) if match: book_info_str = match.group(1) #作者 reg_pattern = re.compile(u'<h3 class="PI_item">作者(.*?)</h3>') match = reg_pattern.search(book_info_str) if match: book["author"] = match.group(1) book["author"] = filter_tags(book["author"]) book["author"] = filter_r_and_n(book["author"]) book["author"] = book["author"].replace(" / ", "") #出版社 reg_pattern = re.compile(u'<h3 class="PI_item">出版社(.*?)</h3>') match = reg_pattern.search(book_info_str) if match: book["press"] = match.group(1) book["press"] = filter_tags(book["press"]) book["press"] = filter_r_and_n(book["press"]) book["press"] = book["press"].replace(" / ", "") #出版日期 reg_pattern = re.compile(u'<h3 class="PI_item">出版日期(.*?)</h3>') match = reg_pattern.search(book_info_str) if match: book["publictime"] = match.group(1) book["publictime"] = filter_tags(book["publictime"]) book["publictime"] = filter_r_and_n(book["publictime"]) book["publictime"] = book["publictime"].replace(" / ", "") #定价 reg_pattern = re.compile(u'<h3 class="PI_item">定價(.*?)</h3>') match = reg_pattern.search(book_info_str) if match: book["price"] = match.group(1) book["price"] = filter_tags(book["price"]) book["price"] = filter_r_and_n(book["price"]) book["price"] = book["price"].replace(" / ", "") #售价 reg_pattern = re.compile(u'<h3 class="PI_item">售價(.*?)</h3>') match = reg_pattern.search(book_info_str) if match: book["sell_price"] = match.group(1) book["sell_price"] = filter_tags(book["sell_price"]) book["sell_price"] = filter_r_and_n(book["sell_price"]) book["sell_price"] = book["sell_price"].replace(" / ", "") #裝訂 reg_pattern = re.compile(u'class="PI_item">裝訂(.*?)<') match = reg_pattern.search(book_info_str) if match: book["print"] = match.group(1) book["print"] = filter_tags(book["print"]) book["print"] = filter_r_and_n(book["print"]) book["print"] = book["print"].replace(" / ", "") #商品語言 reg_pattern = re.compile(u'class="PI_item">商品語言(.*?)<') match = reg_pattern.search(book_info_str) if match: book["language"] = match.group(1) book["language"] = filter_tags(book["language"]) book["language"] = filter_r_and_n(book["language"]) book["language"] = book["language"].replace(" / ", "") #詳細資料 reg_pattern = re.compile(u'<div class="C_box"><h2>詳細資料</h2>(.*?)</div>') match = reg_pattern.search(html) if match: book_info_str = match.group(1) book_info_str = filter_tags(book_info_str) book_info_str = book_info_str.replace("\t", "") reg_pattern = re.compile(u'ISBN 13 /(\d+)') match = reg_pattern.search(book_info_str) if match: book["isbn"] = match.group(1) reg_pattern = re.compile(u'頁數/(\d+)') match = reg_pattern.search(book_info_str) if match: book["pagecnt"] = match.group(1) #目录 reg_pattern = re.compile(u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_catelog" class="C_box" style="display:none;">(.*?)</div>') match = reg_pattern.search(html) if match: book["menu"] = filter_tags(match.group(1)) book["menu"] = book["menu"].replace("本書目錄", "") return book #作者介绍 reg_pattern = re.compile(u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_all_character" class="C_box" style="display:none;">(.*?)</div>') match = reg_pattern.search(html) if match: book["authordesc"] = filter_tags(match.group(1)) book["authordesc"] = book["authordesc"].replace("作者介紹", "") #内容接受 reg_pattern = re.compile(u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_introduction" class="C_box" style="display:block;">(.*?)</div>') match = reg_pattern.search(html) if match: book["desc"] = filter_tags(match.group(1)) book["desc"] = book["desc"].replace("內容簡介", "") #媒体推荐 reg_pattern = re.compile(u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_medium" class="C_box" style="display:none;">(.*?)</div>') match = reg_pattern.search(html) if match: book["meidum"] = filter_tags(match.group(1)) book["meidum"] = book["meidum"].replace("媒體推薦", "") #得獎紀錄 reg_pattern = re.compile(u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_award" class="C_box" style="display:none;">(.*?)</div>') match = reg_pattern.search(html) if match: book["award"] = filter_tags(match.group(1)) book["award"] = book["award"].replace("得獎紀錄", "") return book #============================================================================== # update_fetch_status : 保存抓取进度 #============================================================================== def update_fetch_status(self, name, year, page=1, success=False): if success: status = 1 else: status = 0 sql_str = "UPDATE `publisher` SET `page`=%s, `status`=%s WHERE `name`='%s' AND `year`=%s;" % \ (str(page), status, self.db.escape_string(name), str(year)) print sql_str self.db.update(sql_str) #============================================================================== # get_publister : 获得一个出版社 #============================================================================== def get_publisher(self): client_count = self.client_count client_id = self.client_id - 1 sql_str = "SELECT `id`, `name`, `year`, `page` from `publisher` WHERE `status`=0 AND id%s%s=%s ORDER BY `year` ASC LIMIT 1;" % ("%", str(client_count), str(client_id)) rows = self.db.query(sql_str, ["id","name","year","page"]) for row in rows: print row["id"] publisher = {} publisher["name"] = row["name"] publisher["year"] = int(row["year"]) publisher["page"] = int(row["page"]) return publisher return False def get_publisher_by_id(self, id): sql_str = "SELECT `name`, `year`, `page` from `publisher` WHERE `status`=0 AND `id`=%s;" % (str(id)) rows = self.db.query(sql_str, ["name","year","page"]) for row in rows: publisher = {} publisher["name"] = row["name"] publisher["year"] = int(row["year"]) publisher["page"] = int(row["page"]) return publisher return False def init_publisher(self): return for id in range(27, 2351, 1): publisher = self.get_publisher_by_id(id) year_str = "" for year in range(1971, 2013, 1): if year_str: year_str = "%s, ('%s', %s)" % (year_str, self.db.escape_string(publisher["name"]), str(year)) else: year_str = "('%s', %s)" % (self.db.escape_string(publisher["name"]), str(year)) print id if year_str: sql_str = "INSERT IGNORE publisher(`name`, `year`) VALUES %s;" % (year_str) self.db.update(sql_str) def get_book_abs(self, status): client_count = self.client_count client_id = self.client_id - 1 sql_str = "SELECT book_id FROM `book_abs` WHERE `status`=%s AND id%s%s = %s LIMIT 1;" % (str(status), "%",str(client_count), str(client_id)) rows = self.mysql.query(sql_str, ["book_id"]) for row in rows: return row return False def fetch_book_html(self, book_abs): book_url = "http://www.eslite.com/product.aspx?pgid=%s" % book_abs["book_id"] print book_url self.snoopy.fetch(book_url) html = self.snoopy.results self.save_book_html(book_abs, html) def save_book_html(self, book_abs, html): html = self.mysql.escape_string(html) sql_str = "update book_abs set html='%s', status=1 where book_id='%s';" % (html, book_abs["book_id"]) self.mysql.update(sql_str) def init_book(self): book = {} book["book_id"] = "" #book_id book["isbn"] = "" #isbn book["category"] = "" book["shortCategory"] = "" #分类 book["ztCategory"] = "" #中图分类号 book["name"] = "" #书名 book["author"] = "" #作者 book["authordesc"] = "" #介绍 book["price"] = "" #定价 book["sell_price"] = "" #售价 book["language"] = "" #语言 book["press"] = "" #出版社 book["print"] = "" #装订 book["publictime"] = "" #出版时间 book["pagecnt"] = 0 #页数 book["version"] = 0 #版本 book["printversion"] = 0#版本 book["desc"] = "" #摘要 book["url"] = "" #url book["img"] = "" #图片 book["meidum"] = "" #媒体推荐 book["award"] = "" #得奖记录 book["menu"] = "" #目录 return book def save_book(self, book): for key in book.keys(): print key, book[key] values_str = "" keys_str = "" for key in book.keys(): if book[key]: if values_str: values_str = "%s, '%s'" % (values_str, self.mysql.escape_string(str(book[key]))) keys_str = "%s, `%s`" % (keys_str, key) else: values_str = "'%s'" % (self.mysql.escape_string(str(book[key]))) keys_str = "`%s`" % (key) sql_str = "INSERT IGNORE book(%s) VALUES (%s) on duplicate key update `isbn`='%s';" % (keys_str, values_str, book["isbn"]) print sql_str self.mysql.update(sql_str) def update_book_abs_status(self, book_abs): sql_str = "UPDATE book_abs SET `status`=2 WHERE book_id='%s';" % \ (book_abs["book_id"]) self.mysql.update(sql_str)