def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search the site for ``keyword`` and queue the chapters of every hit.

    The search form is POSTed GBK-encoded. For each comic in the result
    list the detail page is fetched, its chapter links are collected and
    one job per chapter is put on ``priority_queue``.

    Parameters
    ----------
    keyword : str
        Text to search for.
    mode : str
        ``"download"`` skips comics whose name is already present in
        ``ComicName``, downloads the cover and inserts a new row.
        ``"update"`` only refreshes the ``Time`` column of the row whose
        ID is ``self.id`` (which must already be set by the caller).
    download_path : str, optional
        Root directory for the cover image. No cover is downloaded when
        it is ``None``.

    Returns
    -------
    bool
        ``False`` when the search page or a detail page could not be
        fetched, ``True`` otherwise.
    """
    # Request the search page.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/e/search/'
    params = {
        'key': keyword.encode('gbk', 'strict'),
        'button': "搜索漫画".encode('gbk', 'strict'),
    }
    params = parse.urlencode(params).encode("gbk")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'gbk')
    if content_keyword is None:
        return False

    # Each search hit is a <p class="fl cover"> wrapping a link and a cover.
    for data in content_keyword.find_all('p', {'class': 'fl cover'}):
        comic_name = data.a.img['alt']

        if mode == "download":
            # Skip comics that are already recorded in the database.
            # NOTE(review): the scraped name is interpolated into SQL
            # unescaped; a double quote in a title breaks the statement.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
            if self._EntertainmentSelect(sql):
                print("%s 已经下载过,请查看数据库" % comic_name)
                continue

        # Wait until the jobs of the previous comic have been consumed.
        while not priority_queue.empty():
            print("threads conunt :%d" % threading.active_count())
            print("queue size : %d" % (priority_queue.qsize()))
            if threading.active_count() < 10:
                StartComicThread(10)
            time.sleep(60)

        self.keyword = comic_name
        print(self.keyword)
        url_keyword_content = self._url + "/" + data.a['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'gbk')
        if soup_keyword_content is None:
            return False

        # Collect the chapter links. The page lists them newest first,
        # so they are walked in reverse further down.
        book = soup_keyword_content.find(
            'div', {'class': 'plist pnormal', 'id': 'play_0'})
        a_book = []
        for data_content in book.ul:
            a = data_content.find('a')
            # Text nodes are str subclasses, so .find('a') is str.find and
            # yields an int index (possibly >= 0), never a tag; drop those.
            if a is None or isinstance(a, int):
                continue
            a_book.append(a)

        if mode == "download":
            a_author = soup_keyword_content.find('meta', {'property': 'og:novel:author'})
            a_category = soup_keyword_content.find('meta', {'property': 'og:novel:category'})
            a_img = soup_keyword_content.find('meta', {'property': 'og:image'})
            a_introduce = soup_keyword_content.find('p', {'id': 'intro'})
            IsFinish = soup_keyword_content.find('meta', {'property': 'og:novel:status'})
            a_isfinish = 0 if IsFinish['content'] == '连载中' else 1

            # Download the cover, retrying up to five times.
            if download_path is not None:
                path = '%s/Comics/%s/' % (download_path, self.keyword)
                for i in range(5):
                    if BaseRequest.DownloadData(a_img['content'], path, "封面.jpg"):
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
                    print("download %s failed %d time" % ("封面.jpg", i))

            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            # Store the comic's metadata; string values are pre-quoted.
            sql_dict = collections.OrderedDict()
            sql_dict['Name'] = "\"" + self.keyword + "\""
            sql_dict['WatchNum'] = 0
            sql_dict['Website'] = "\"" + self._url + "\""
            sql_dict['ChapterNum'] = len(a_book)
            sql_dict['IsFinish'] = a_isfinish
            sql_dict['Introduce'] = "\"" + a_introduce.a.contents[0] + "\""
            sql_dict['Author'] = "\"" + a_author['content'] + "\""
            sql_dict['Img'] = "\"" + src + "\""
            sql_dict['Type'] = "\"" + a_category['content'] + "\""
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("inster ComicName table failed!")
                continue

            # Read back the ID of the row that was just inserted.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
            max_id = self._EntertainmentSelect(sql)
            if not max_id:
                print("get max_id failed!")
                continue
            self.id = max_id[0][0]
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" % (now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        # Queue one job per chapter, oldest chapter first, numbered from 1.
        for count, chapter in enumerate(reversed(a_book), 1):
            href = chapter['href']
            job_data = {
                "url": self._url + href,
                "title": chapter['title'],
                "href": href,
                "count": count,
            }
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}
            priority_queue.put(base.Job(2, dic_queue, self._url))

    return True
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search the site's JSON API for ``keyword`` and queue every chapter.

    Each topic returned by the search is resolved to its detail page,
    whose ``a.article-img`` links are the chapters. One job per chapter
    is put on ``priority_queue``.

    Parameters
    ----------
    keyword : str
        Text to search for.
    mode : str
        ``"download"`` skips topics whose title is already present in
        ``ComicName``, downloads the cover and inserts a new row.
        ``"update"`` only refreshes the ``Time`` column of the row whose
        ID is ``self.id`` (which must already be set by the caller).
    download_path : str, optional
        Root directory for the cover image. No cover is downloaded when
        it is ``None``.

    Returns
    -------
    bool
        ``False`` when the search response or a detail page could not be
        fetched or the response decodes to a falsy value, ``True``
        otherwise.
    """
    # Request the search results.
    self.keyword = keyword
    self.download_path = download_path
    # urlencode({"": keyword}) yields "=<quoted keyword>", which completes
    # the "?keyword" prefix into a full query string.
    url_keyword = self._url + '/web/topic/search?keyword' + parse.urlencode({"": keyword})
    content_keyword = BaseRequest.GetUrlContent(url_keyword)
    if content_keyword is None:
        return False

    # Decode the JSON payload; bail out on false/null/empty responses
    # instead of failing on the subscript below.
    content_keyword_json = json.loads(content_keyword.decode("utf8"))
    if not content_keyword_json:
        return False

    for data in content_keyword_json['data']['topic']:
        comic_name = data['title']

        if mode == "download":
            # Skip comics that are already recorded in the database.
            # NOTE(review): the scraped title is interpolated into SQL
            # unescaped; a double quote in a title breaks the statement.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
            if self._EntertainmentSelect(sql):
                print(comic_name)
                continue

        # Wait until the jobs of the previous comic have been consumed.
        while not priority_queue.empty():
            print("threads conunt :%d" % threading.active_count())
            # Report the size of the queue that is actually being polled.
            print("queue size : %d" % (priority_queue.qsize()))
            if threading.active_count() < 10:
                StartComicThread(10)
            time.sleep(60)

        self.keyword = comic_name
        url_keyword_content = self._url + '/web/topic/' + str(data['id'])
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
        if soup_keyword_content is None:
            return False

        # Chapter links; the page lists them newest first, so they are
        # walked in reverse further down.
        a_book = soup_keyword_content.find_all('a', {'class': 'article-img'})

        if mode == "download":
            a_author = soup_keyword_content.find('div', {'class': 'author-nickname'})
            a_introduce = soup_keyword_content.find('div', {'class': 'switch-content'})
            a_img = soup_keyword_content.find('img', {'class': 'kk-img'})

            # Download the cover, retrying up to five times.
            if download_path is not None:
                path = '%s/Comics/%s/' % (download_path, self.keyword)
                for i in range(5):
                    if BaseRequest.DownloadData(a_img['src'], path, "封面.jpg"):
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
                    print("download %s failed %d time" % ("封面.jpg", i))

            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            # Store the comic's metadata; string values are pre-quoted.
            sql_dict = collections.OrderedDict()
            sql_dict['Name'] = "\"" + comic_name + "\""
            sql_dict['WatchNum'] = 0
            sql_dict['Website'] = "\"" + self._url + "\""
            sql_dict['ChapterNum'] = len(a_book)
            sql_dict['IsFinish'] = 0
            # Double quotes are stripped so the value cannot end its own
            # SQL string literal early.
            sql_dict['Introduce'] = "\"" + a_introduce.p.contents[0].replace('\"', '') + "\""
            sql_dict['Author'] = "\"" + a_author.contents[0] + "\""
            sql_dict['Img'] = "\"" + src + "\""
            sql_dict['Type'] = "\"" + self.type + "\""
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("inster ComicName table failed!")
                continue

            # Read back the ID of the row that was just inserted.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
            max_id = self._EntertainmentSelect(sql)
            if not max_id:
                print("get max_id failed!")
                continue
            self.id = max_id[0][0]
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" % (now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        # Queue one job per chapter, oldest chapter first, numbered from 1.
        for count, chapter in enumerate(reversed(a_book), 1):
            href = chapter['href']
            job_data = {
                "url": self._url + href,
                "title": chapter['title'],
                "src": chapter.img['src'],
                "href": href,
                "count": count,
            }
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}
            priority_queue.put(base.Job(2, dic_queue, self._url))

    # NOTE(review): `p` and `run` are defined outside this function;
    # presumably a worker pool draining priority_queue - confirm.
    p.spawn(run)
    p.join()
    return True
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search the Big5-encoded site for ``keyword`` and queue every chapter.

    The search form is POSTed Big5-encoded. For each hit the detail page
    is fetched and its layout tables are scraped for metadata and chapter
    links. One job per chapter is put on ``priority_queue``.

    Parameters
    ----------
    keyword : str
        Text to search for.
    mode : str
        ``"download"`` skips comics whose title is already present in
        ``ComicName``, downloads the cover and inserts a new row.
        ``"update"`` only refreshes the ``Time`` column of the row whose
        ID is ``self.id`` (which must already be set by the caller).
    download_path : str, optional
        Root directory for the cover image. No cover is downloaded when
        it is ``None``.

    Returns
    -------
    bool
        ``False`` when the search page or a detail page could not be
        fetched, or when a detail page lacks the expected metadata cells
        or chapter table; ``True`` otherwise.
    """
    # Request the search page.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/search.html'
    params = {
        'keyword': keyword.encode('big5', 'strict'),
        'searchtype': 'all',
    }
    params = parse.urlencode(params).encode("big5")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'big5')
    if content_keyword is None:
        return False

    for data in content_keyword.find_all('span', {'class': 'covertxt'}):
        # The element following the span carries the title and the link.
        data_next_siblings = data.find_next_siblings()
        comic_name = data_next_siblings[0]['title']

        if mode == "download":
            # Skip comics that are already recorded in the database.
            # NOTE(review): the scraped title is interpolated into SQL
            # unescaped; a double quote in a title breaks the statement.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
            if self._EntertainmentSelect(sql):
                print(comic_name)
                continue

        # Wait until the jobs of the previous comic have been consumed.
        while not priority_queue.empty():
            print("threads conunt :%d" % threading.active_count())
            print("queue size : %d" % (priority_queue.qsize()))
            if threading.active_count() < 10:
                StartComicThread(10)
            time.sleep(60)

        self.keyword = comic_name
        print(self.keyword)
        url_keyword_content = self._url + "/" + data_next_siblings[0]['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'big5')
        if soup_keyword_content is None:
            return False

        # Collect the comic's metadata; string values are pre-quoted.
        sql_dict = collections.OrderedDict()
        sql_dict['Name'] = "\"" + self.keyword + "\""
        sql_dict['WatchNum'] = 0
        sql_dict['Website'] = "\"" + self._url + "\""

        # find_all returns an empty list (never None) when nothing matches,
        # so check that the cells indexed below actually exist.
        detail_cells = soup_keyword_content.find_all('td', {'width': 276})
        if len(detail_cells) < 6:
            return False
        sql_dict['Type'] = "\"" + detail_cells[1].a.contents[0].strip() + "\""
        sql_dict['Author'] = "\"" + detail_cells[3].contents[1].strip() + "\""
        # The status is shown as an icon: chap9.gif marks a finished comic,
        # anything else (including chap1.gif) is stored as unfinished.
        status_icon = detail_cells[5].contents[4]['src'].strip()
        sql_dict['IsFinish'] = 1 if status_icon == "/image/chap9.gif" else 0

        intro_tables = soup_keyword_content.find_all(
            'table', {'width': 688, 'cellspacing': "8"})
        if intro_tables:
            sql_dict['Introduce'] = "\"" + intro_tables[0].tr.td.contents[0].strip() + "\""
        else:
            # Quoted like every other string column.
            sql_dict['Introduce'] = "\"\""

        cover_imgs = soup_keyword_content.find_all(
            'img', {'width': '240', 'height': '320'})
        a_img = ''
        if cover_imgs:
            a_img = self._url + cover_imgs[0]['src']

        # Collect the chapter links from the chapter table, in page order.
        chapter_tables = soup_keyword_content.find_all(
            'table', {'width': '688', 'align': 'center'})
        if not chapter_tables:
            return False
        a_book = []
        for data_content in chapter_tables[0].tbody:
            for data_td in data_content:
                a = data_td.find('a')
                # Text nodes are str subclasses, so .find('a') is str.find
                # and yields an int index, never a tag; drop those.
                if a is None or isinstance(a, int):
                    continue
                a_book.append(a)

        if mode == "download":
            # Download the cover, retrying up to five times.
            if download_path is not None:
                path = '%s/Comics/%s/' % (download_path, self.keyword)
                for i in range(5):
                    if BaseRequest.DownloadData(a_img, path, "封面.jpg"):
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
                    print("download %s failed %d time" % ("封面.jpg", i))

            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            # Complete the row and store it.
            sql_dict['Img'] = "\"" + src + "\""
            sql_dict['ChapterNum'] = len(a_book)
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("inster ComicName table failed!")
                continue

            # Read back the ID of the row that was just inserted.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (self.keyword)
            max_id = self._EntertainmentSelect(sql)
            if not max_id:
                print("get max_id failed!")
                continue
            self.id = max_id[0][0]
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" % (now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        # Queue one job per chapter in page order, numbered from 1.
        for count, chapter in enumerate(a_book, 1):
            href = chapter['href']
            job_data = {
                "url": self._url + href,
                "title": chapter.contents[0],
                "href": href,
                "count": count,
            }
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}
            priority_queue.put(base.Job(2, dic_queue, self._url))

    return True
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search the fiction site for ``keyword`` and queue every chapter.

    The search either renders a result table (detected by its
    ``<caption>``) or lands directly on a single book page, in which
    case the book is identified through its Open Graph meta tags.
    For each book the detail page is fetched and one job per chapter
    (the ``<dd>`` entries of ``div#list``) is put on ``priority_queue``.

    Parameters
    ----------
    keyword : str
        Text to search for.
    mode : str
        ``"download"`` skips books whose name is already present in
        ``tbl_fiction_name``, downloads the cover and inserts a new row.
        ``"update"`` only refreshes ``add_time`` of the row identified
        by ``self.id`` (which must already be set by the caller).
    download_path : str, optional
        Root directory for the cover image. No cover is downloaded when
        it is ``None``.

    Returns
    -------
    bool
        ``False`` when the search page or a detail page could not be
        fetched, or when a direct-hit page lacks its URL or name meta
        tag; ``True`` otherwise.
    """
    # Request the search page.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/modules/article/soshu.php?searchkey=' + parse.quote(
        keyword, encoding='gbk', errors='replace')
    content_keyword = BaseRequest.GetUrlSoup(url_keyword, 'gbk')
    if content_keyword is None:
        return False

    # Normalise both page shapes into a list of {"name", "url"} dicts.
    if content_keyword.find('caption'):
        find_result = [
            {"name": row.td.a.contents[0], "url": row.td.a['href']}
            for row in content_keyword.find_all('tr', {'id': 'nr'})
        ]
    else:
        a_url = content_keyword.find('meta', {'property': 'og:url'})
        if a_url is None:
            return False
        a_name = content_keyword.find('meta', {'property': 'og:novel:book_name'})
        if a_name is None:
            return False
        find_result = [{"name": a_name["content"], "url": a_url['content']}]

    for result in find_result:
        if mode == "download":
            # Skip books that are already recorded in the database.
            # NOTE(review): the scraped name is interpolated into SQL
            # unescaped; a double quote in a title breaks the statement.
            sql = "SELECT * FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (
                result["name"])
            if self._EntertainmentSelect(sql):
                print(result["name"])
                continue

        # Wait until the jobs of the previous book have been consumed.
        while not priority_queue.empty():
            print("threads conunt :%d" % threading.active_count())
            print("queue size : %d" % (priority_queue.qsize()))
            if threading.active_count() < 10:
                StartFictionThread(10)
            time.sleep(60)

        self.keyword = result["name"]
        soup_keyword_content = BaseRequest.GetUrlSoup(result["url"], 'gbk')
        if soup_keyword_content is None:
            return False

        # The chapter list is needed by both modes, so it is parsed before
        # the mode-specific work (update mode previously never set it).
        a_list = soup_keyword_content.find('div', {'id': 'list'})
        a_book = a_list.dl.find_all('dd')

        if mode == "download":
            a_name = soup_keyword_content.find(
                'meta', {'property': 'og:novel:book_name'})
            a_introduce = soup_keyword_content.find(
                'meta', {'property': 'og:description'})
            a_image = soup_keyword_content.find('meta', {'property': 'og:image'})
            a_category = soup_keyword_content.find(
                'meta', {'property': 'og:novel:category'})
            a_author = soup_keyword_content.find(
                'meta', {'property': 'og:novel:author'})
            a_status = soup_keyword_content.find(
                'meta', {'property': 'og:novel:status'})

            # Download the cover, retrying up to five times.
            if download_path is not None:
                path = '%s/Fiction/%s/' % (download_path, self.keyword)
                for i in range(5):
                    if BaseRequest.DownloadData(a_image['content'], path, "封面.jpg"):
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
                    print("download %s failed %d time" % ("封面.jpg", i))

            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Fiction/" + self.keyword + "/" + "封面.jpg"

            # Store the book's metadata; string values are pre-quoted.
            sql_dict = collections.OrderedDict()
            sql_dict['name'] = "\"" + a_name['content'] + "\""
            sql_dict['watch_count'] = 0
            sql_dict['website'] = "\"" + self._url + "\""
            sql_dict['chapter_count'] = len(a_book)
            sql_dict['introduce'] = "\"" + a_introduce['content'] + "\""
            sql_dict['author'] = "\"" + a_author['content'] + "\""
            sql_dict['cover_img_src'] = "\"" + src + "\""
            sql_dict['type'] = "\"" + a_category['content'] + "\""
            sql_dict['add_time'] = "\"" + time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            # The status text contains "连载中" while the book is ongoing.
            sql_dict['is_finish'] = 0 if "连载中" in a_status['content'] else 1
            if not self._EntertainmentInsert('tbl_fiction_name', sql_dict):
                print("inster tbl_fiction_name table failed!")
                continue

            # Read back the ID of the row that was just inserted.
            # NOTE(review): this selects column `ID` while the update
            # statement below filters on `pk_id` - confirm against schema.
            sql = "SELECT ID FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (
                a_name['content'])
            max_id = self._EntertainmentSelect(sql)
            if not max_id:
                print("get max_id failed!")
                continue
            self.id = max_id[0][0]
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            sql = "update EntertainmentDB.tbl_fiction_name set add_time = %s where pk_id = %d;" % (
                now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        # Queue one job per chapter in page order, numbered from 1.
        for count, chapter in enumerate(a_book, 1):
            href = chapter.a['href']
            job_data = {
                "ID": self.id,
                "url": self._url + href,
                "title": chapter.a.contents[0],
                "href": href,
                "count": count,
            }
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}
            priority_queue.put(base.Job(2, dic_queue, self._url))

    return True