def GetComicByKeyword(self):
    """Search for and download every comic in the built-in download list.

    For each entry the method first blocks until the shared
    ``priority_queue`` is empty, then calls ``self.GetComicHandle`` with the
    entry's URL, configures the resulting ``self.ComicHandle`` and asks it
    to download the comic by keyword.

    Returns
    -------
    None
        Nothing is returned; a failed download is only reported on stdout.
    """
    download_lst = [
        {
            "name": '西行纪',
            "url": "http://www.kuaikanmanhua.com",
            "download": "/mnt/TecentCloud"
        },
    ]
    print(download_lst)

    for entry in download_lst:
        # Hold the next comic back until the queue has been drained.
        while not priority_queue.empty():
            print("threads conunt :%d" % threading.active_count())
            print("queue size : %d" % (priority_queue.qsize()))
            # NOTE(review): presumably tops the worker pool back up when
            # threads have died - confirm against StartComicThread.
            if threading.active_count() < 10:
                StartComicThread(10)
            time.sleep(60)

        comic_name = entry['name']
        site_url = entry['url']
        print("%s start download" % (comic_name))
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

        # Create the comic handle for this site and point it at the URL.
        self.GetComicHandle(site_url)
        self.ComicHandle._set_info(site_url, None, None)

        # Search by keyword and queue the download.
        succeeded = self.ComicHandle._GetContentByKeyword(
            comic_name, "download", entry['download'])
        if not succeeded:
            print("Download %s failed!" % (comic_name))
def UpdateComicPicture(self, download_path=None):
    """Re-process every comic stored in the database in "update" mode.

    Picture addresses sometimes expire, so each comic listed in
    ``EntertainmentDB.ComicName`` is searched again by name and handed to
    ``_GetContentByKeyword`` with mode ``"update"``.

    Parameters
    ----------
    download_path : str, optional
        Kept for interface compatibility; this method does not use it.

    Returns
    -------
    bool
        True when every comic was processed, False when at least one
        ``_GetContentByKeyword`` call reported a failure.
    """
    all_succeeded = True

    # Read every comic currently stored in the database.
    db_handle = EntertainmentSpider()
    sql = "SELECT * FROM EntertainmentDB.ComicName;"
    rows = db_handle._ComicSelect(sql)

    for row in rows:
        # NOTE(review): column 0 is used as the comic ID, column 1 as the
        # name and column 5 as the site URL - confirm against the schema.
        keyword = row[1]
        url = row[5]

        # Create the comic handle for this site and configure it.
        self.GetComicHandle(url)
        self.ComicHandle.id = row[0]
        self.ComicHandle._set_info(url, None, None)

        # A non-empty queue means the previous comic is still being
        # processed; wait for it before starting the next one.
        while not priority_queue.empty():
            print("queue size : %d" %(priority_queue.qsize()))
            time.sleep(5)

        if not self.ComicHandle._GetContentByKeyword(keyword, "update"):
            print("Download %s failed!" %(keyword))
            # The original assigned to the loop variable ``result`` here,
            # so the failure was lost and True was always returned.
            all_succeeded = False

    return all_succeeded
#启动线程 print("begin StartThread") StartFictionThread(20) #下载模式 if sys.argv[2] == "DownloadAll": EntertainmentAPi.GetFictionByKeyword() #更新模式,增加新的章节 elif sys.argv[2] == "UpdateChapter": EntertainmentAPi.UpdateFictionChapter("/mnt/TecentCloud") while True: if not priority_queue.empty(): print("threads conunt :%d" %threading.active_count()) print("queue size : %d" %(priority_queue.qsize())) time.sleep(5) continue else: break #关闭所有线程 #for t in threads: # t.join() print('finish: ', now() - start) print("download finish") #if not EntertainmentAPi.SaveToDatabase('/home/txz/download',dct_img_book): # print("download failed!") #EntertainmentAPi.ParseContent(content)
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search the site for ``keyword`` and queue every chapter of each hit.

    The search form is posted GBK-encoded. For every comic in the result
    list the detail page is fetched, the chapter links are collected and
    one ``base.Job`` per chapter is pushed onto ``priority_queue``.

    Parameters
    ----------
    keyword : str
        Text to search for.
    mode : str
        ``"download"``: skip comics already present in
        ``EntertainmentDB.ComicName``, download the cover, insert the
        comic row and remember its ID in ``self.id``.
        ``"update"``: refresh the ``Time`` column of the comic whose ID is
        already stored in ``self.id``.
    download_path : str, optional
        Root directory for downloaded files; when None the cover image is
        not downloaded.

    Returns
    -------
    bool
        False when the search page, a detail page or the chapter list of
        a hit cannot be obtained, True otherwise.
    """
    self.keyword = keyword
    self.download_path = download_path

    # Post the search form.
    url_keyword = self._url + '/e/search/'
    keyword_encode = keyword.encode('gbk', 'strict')
    button_encode = "搜索漫画".encode('gbk', 'strict')
    params = {
        'key': keyword_encode,
        'button': button_encode,
    }
    params = parse.urlencode(params).encode("gbk")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'gbk')
    if content_keyword is None:
        return False

    a_result = content_keyword.find_all('p', {'class': 'fl cover'})

    # Each hit carries the comic title and the link to its detail page.
    for hit in a_result:
        comic_title = hit.a.img['alt']

        if mode == "download":
            # Skip comics that are already in the database.
            # NOTE(review): the scraped title is interpolated straight into
            # the SQL text; a title containing a double quote breaks the
            # statement. The DB helper's parameter API is not visible here,
            # so this is flagged rather than changed.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(comic_title)
            if self._EntertainmentSelect(sql):
                print("%s 已经下载过,请查看数据库" % comic_title )
                continue

        # Wait until the previous comic has been fully processed.
        while not priority_queue.empty():
            print("threads conunt :%d" %threading.active_count())
            print("queue size : %d" %(priority_queue.qsize()))
            if threading.active_count() < 10:
                StartComicThread(10)
            time.sleep(60)

        self.keyword = comic_title
        print(self.keyword)
        url_keyword_content = self._url + "/" + hit.a['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'gbk')
        if soup_keyword_content is None:
            return False

        # Collect the chapter links. The page lists them from the last
        # chapter to the first, hence the reversed() when queueing below.
        chapter_box = soup_keyword_content.find(
            'div', {'class': 'plist pnormal', 'id': 'play_0'})
        if chapter_box is None or chapter_box.ul is None:
            # Without the chapter container there is nothing to queue;
            # report failure instead of raising AttributeError.
            return False
        a_book = []
        for data_content in chapter_box.ul:
            a = data_content.find('a')
            # Text nodes are str subclasses, so find() on them returns -1.
            if a is not None and a != -1:
                a_book.append(a)

        if mode == "download":
            a_author = soup_keyword_content.find('meta', {'property': 'og:novel:author'})
            a_category = soup_keyword_content.find('meta', {'property': 'og:novel:category'})
            a_img = soup_keyword_content.find('meta', {'property': 'og:image'})
            a_introduce = soup_keyword_content.find('p', {'id': 'intro'})
            IsFinish = soup_keyword_content.find('meta', {'property': 'og:novel:status'})
            # '连载中' marks a comic that is still being serialised.
            a_isfinish = 0 if IsFinish['content'] == '连载中' else 1

            # Download the cover, retrying up to five times.
            if download_path is not None:
                path = '%s/Comics/%s/' %(download_path, self.keyword)
                for i in range(5):
                    if not BaseRequest.DownloadData(a_img['content'], path, "封面.jpg"):
                        print("download %s failed %d time" % ("封面.jpg", i))
                    else:
                        print("download %s%s success" % (path,"封面.jpg"))
                        break

            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            # Store the comic row; string values are pre-wrapped in quotes.
            sql_dict = collections.OrderedDict()
            sql_dict['Name'] = "\"" + self.keyword + "\""
            sql_dict['WatchNum'] = 0
            sql_dict['Website'] = "\"" + self._url + "\""
            sql_dict['ChapterNum'] = len(a_book)
            sql_dict['IsFinish'] = a_isfinish
            sql_dict['Introduce'] = "\"" + a_introduce.a.contents[0] + "\""
            sql_dict['Author'] = "\"" + a_author['content'] + "\""
            sql_dict['Img'] = "\"" + src + "\""
            sql_dict['Type'] = "\"" + a_category['content'] + "\""
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""

            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("inster ComicName table failed!")
                continue

            # Read back the ID of the row that was just inserted.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(comic_title)
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue

        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" %(now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" %(sql))

        # Queue one job per chapter, numbered from the first chapter.
        for count, chapter in enumerate(reversed(a_book), 1):
            href = chapter['href']
            title = chapter['title']
            url_a_book = self._url + href
            job_data = {"url": url_a_book, "title": title, "href": href, "count": count}
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}
            priority_queue.put(base.Job(2, dic_queue, self._url))

    return True
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search the site for ``keyword`` and queue every chapter of each hit.

    The search form is posted Big5-encoded. For every comic in the result
    list the detail page is fetched, its metadata and chapter links are
    scraped and one ``base.Job`` per chapter is pushed onto
    ``priority_queue``.

    Parameters
    ----------
    keyword : str
        Text to search for.
    mode : str
        ``"download"``: skip comics already present in
        ``EntertainmentDB.ComicName``, download the cover, insert the
        comic row and remember its ID in ``self.id``.
        ``"update"``: refresh the ``Time`` column of the comic whose ID is
        already stored in ``self.id``.
    download_path : str, optional
        Root directory for downloaded files; when None the cover image is
        not downloaded.

    Returns
    -------
    bool
        False when the search page, a detail page, the metadata cells or
        the chapter table of a hit cannot be obtained, True otherwise.
    """
    self.keyword = keyword
    self.download_path = download_path

    # Post the search form.
    url_keyword = self._url + '/search.html'
    keyword_encode = keyword.encode('big5', 'strict')
    params = {
        'keyword': keyword_encode,
        'searchtype': 'all',
    }
    params = parse.urlencode(params).encode("big5")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'big5')
    if content_keyword is None:
        return False

    a_result = content_keyword.find_all('span', {'class': 'covertxt'})

    for hit in a_result:
        # The first sibling of the marker span carries title and link.
        data_next_siblings = hit.find_next_siblings()
        if not data_next_siblings:
            # Malformed result entry: nothing to follow.
            continue
        link = data_next_siblings[0]

        if mode == "download":
            # Skip comics that are already in the database.
            # NOTE(review): the scraped title is interpolated straight into
            # the SQL text; a title containing a double quote breaks the
            # statement. The DB helper's parameter API is not visible here,
            # so this is flagged rather than changed.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(link['title'])
            if self._EntertainmentSelect(sql):
                print(link['title'])
                continue

        # Wait until the previous comic has been fully processed.
        while not priority_queue.empty():
            print("threads conunt :%d" %threading.active_count())
            print("queue size : %d" %(priority_queue.qsize()))
            if threading.active_count() < 10:
                StartComicThread(10)
            time.sleep(60)

        self.keyword = link['title']
        print(self.keyword)
        url_keyword_content = self._url + "/" + link['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'big5')
        if soup_keyword_content is None:
            return False

        # Comic row; string values are pre-wrapped in quotes.
        sql_dict = collections.OrderedDict()
        sql_dict['Name'] = "\"" + self.keyword + "\""
        sql_dict['WatchNum'] = 0
        sql_dict['Website'] = "\"" + self._url + "\""

        # find_all() returns an empty list, never None, when nothing
        # matches, so test the length actually needed (cells 1, 3 and 5).
        info_cells = soup_keyword_content.find_all('td', {'width': 276})
        if len(info_cells) < 6:
            return False
        sql_dict['Type'] = "\"" + info_cells[1].a.contents[0].strip() + "\""
        sql_dict['Author'] = "\"" + info_cells[3].contents[1].strip() + "\""
        # Only the "/image/chap9.gif" status icon marks a finished comic.
        status_icon = info_cells[5].contents[4]['src'].strip()
        sql_dict['IsFinish'] = 1 if status_icon == "/image/chap9.gif" else 0

        intro_tables = soup_keyword_content.find_all(
            'table', {'width': 688, 'cellspacing': "8"})
        if intro_tables:
            sql_dict['Introduce'] = "\"" + intro_tables[0].tr.td.contents[0].strip() + "\""
        else:
            # Quoted empty string, matching how every other text column is
            # pre-quoted. NOTE(review): confirm against _EntertainmentInsert.
            sql_dict['Introduce'] = "\"\""

        cover_imgs = soup_keyword_content.find_all(
            'img', {'width': '240', 'height': '320'})
        a_img = ''
        if cover_imgs:
            a_img = self._url + cover_imgs[0]['src']

        # Collect the chapter links from the chapter table.
        chapter_tables = soup_keyword_content.find_all(
            'table', {'width': '688', 'align': 'center'})
        if not chapter_tables or chapter_tables[0].tbody is None:
            return False
        a_book = []
        for data_content in chapter_tables[0].tbody:
            for data_td in data_content:
                a = data_td.find('a')
                # find() on plain text (str) returns -1 instead of None.
                if a is not None and a != -1:
                    a_book.append(a)

        if mode == "download":
            # Download the cover, retrying up to five times.
            if download_path is not None:
                path = '%s/Comics/%s/' %(download_path, self.keyword)
                for i in range(5):
                    if not BaseRequest.DownloadData(a_img, path, "封面.jpg"):
                        print("download %s failed %d time" % ("封面.jpg", i))
                    else:
                        print("download %s%s success" % (path,"封面.jpg"))
                        break

            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            sql_dict['Img'] = "\"" + src + "\""
            sql_dict['ChapterNum'] = len(a_book)
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""

            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("inster ComicName table failed!")
                continue

            # Read back the ID of the row that was just inserted.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(self.keyword)
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue

        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" %(now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" %(sql))

        # Queue one job per chapter, in page order.
        for count, chapter in enumerate(a_book, 1):
            href = chapter['href']
            title = chapter.contents[0]
            url_a_book = self._url + href
            job_data = {"url": url_a_book, "title": title, "href": href, "count": count}
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}
            priority_queue.put(base.Job(2, dic_queue, self._url))

    return True
elif sys.argv[1] == "Fiction": #启动线程 print("begin StartThread") StartFictionThread(20) #下载模式 if sys.argv[2] == "DownloadAll": EntertainmentAPi.GetFictionByKeyword() #更新模式,增加新的章节 elif sys.argv[2] == "UpdateChapter": EntertainmentAPi.UpdateFictionChapter("/mnt/TecentCloud") if sys.argv[2] == "DownloadAll": num = priority_queue.qsize() size = 0 count = 1 processpool = [] task = [] for count in range(num + 1): if not priority_queue.empty(): task.append(priority_queue.get()) if (count % 115 == 0 or count == num) and count != 0: print("size:", size) process = multiprocessing.Process(target=ComicProcesses, args=(task, ))
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search the site for ``keyword`` and queue every chapter of each hit.

    The search URL carries the GBK-quoted keyword. The response is either
    a result table (one row per book) or the detail page of a single book.
    For every book found, the detail page is fetched, the chapter list is
    read and one ``base.Job`` per chapter is pushed onto
    ``priority_queue``.

    The chapter list is read in both modes. Previously it was only built
    inside the ``"download"`` branch, which left ``a_book`` unbound when
    ``mode`` was ``"update"``.

    Parameters
    ----------
    keyword : str
        Text to search for.
    mode : str
        ``"download"``: skip books already present in
        ``EntertainmentDB.tbl_fiction_name``, download the cover, insert
        the book row and remember its ID in ``self.id``.
        ``"update"``: refresh ``add_time`` of the book whose ID is already
        stored in ``self.id``.
    download_path : str, optional
        Root directory for downloaded files; when None the cover image is
        not downloaded.

    Returns
    -------
    bool
        False when the search page yields no usable result or a detail
        page / chapter list cannot be obtained, True otherwise.
    """
    self.keyword = keyword
    self.download_path = download_path

    # Request the search page.
    url_keyword = self._url + '/modules/article/soshu.php?searchkey=' + parse.quote(
        keyword, encoding='gbk', errors='replace')
    content_keyword = BaseRequest.GetUrlSoup(url_keyword, 'gbk')
    if content_keyword is None:
        return False

    # Normalise both response shapes into a list of {"name", "url"}.
    if content_keyword.find('caption'):
        # Result table: one row per matching book.
        a_result = content_keyword.find_all('tr', {'id': 'nr'})
        # find_all() returns an empty list, never None, on no match.
        if not a_result:
            return False
        find_result = [
            {"name": row.td.a.contents[0], "url": row.td.a['href']}
            for row in a_result
        ]
    else:
        # Single hit: the response already is the book's detail page.
        a_url = content_keyword.find('meta', {'property': 'og:url'})
        if a_url is None:
            return False
        a_name = content_keyword.find('meta', {'property': 'og:novel:book_name'})
        if a_name is None:
            return False
        find_result = [{"name": a_name["content"], "url": a_url['content']}]

    for result in find_result:
        if mode == "download":
            # Skip books that are already in the database.
            # NOTE(review): the scraped name is interpolated straight into
            # the SQL text; a name containing a double quote breaks the
            # statement. The DB helper's parameter API is not visible here,
            # so this is flagged rather than changed.
            sql = "SELECT * FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (
                result["name"])
            if self._EntertainmentSelect(sql):
                print(result["name"])
                continue

        # Wait until the previous book has been fully processed.
        while not priority_queue.empty():
            print("threads conunt :%d" % threading.active_count())
            print("queue size : %d" % (priority_queue.qsize()))
            if threading.active_count() < 10:
                StartFictionThread(10)
            time.sleep(60)

        self.keyword = result["name"]
        soup_keyword_content = BaseRequest.GetUrlSoup(result["url"], 'gbk')
        if soup_keyword_content is None:
            return False

        # Chapter list: one <dd> per chapter inside div#list.
        a_list = soup_keyword_content.find('div', {'id': 'list'})
        if a_list is None or a_list.dl is None:
            return False
        a_book = a_list.dl.find_all('dd')

        if mode == "download":
            a_name = soup_keyword_content.find(
                'meta', {'property': 'og:novel:book_name'})
            a_introduce = soup_keyword_content.find(
                'meta', {'property': 'og:description'})
            a_image = soup_keyword_content.find('meta', {'property': 'og:image'})
            a_category = soup_keyword_content.find(
                'meta', {'property': 'og:novel:category'})
            a_author = soup_keyword_content.find(
                'meta', {'property': 'og:novel:author'})
            a_status = soup_keyword_content.find(
                'meta', {'property': 'og:novel:status'})

            # Download the cover, retrying up to five times.
            if download_path is not None:
                path = '%s/Fiction/%s/' % (download_path, self.keyword)
                for i in range(5):
                    if not BaseRequest.DownloadData(
                            a_image['content'], path, "封面.jpg"):
                        print("download %s failed %d time" % ("封面.jpg", i))
                    else:
                        print("download %s%s success" % (path, "封面.jpg"))
                        break

            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Fiction/" + self.keyword + "/" + "封面.jpg"

            # Store the book row; string values are pre-wrapped in quotes.
            sql_dict = collections.OrderedDict()
            sql_dict['name'] = "\"" + a_name['content'] + "\""
            sql_dict['watch_count'] = 0
            sql_dict['website'] = "\"" + self._url + "\""
            sql_dict['chapter_count'] = len(a_book)
            sql_dict['introduce'] = "\"" + a_introduce['content'] + "\""
            sql_dict['author'] = "\"" + a_author['content'] + "\""
            sql_dict['cover_img_src'] = "\"" + src + "\""
            sql_dict['type'] = "\"" + a_category['content'] + "\""
            sql_dict['add_time'] = "\"" + time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime()) + "\""
            # A status containing '连载中' means still being serialised.
            if "连载中" in a_status['content']:
                sql_dict['is_finish'] = 0
            else:
                sql_dict['is_finish'] = 1

            if not self._EntertainmentInsert('tbl_fiction_name', sql_dict):
                print("inster tbl_fiction_name table failed!")
                continue

            # Read back the ID of the row that was just inserted.
            sql = "SELECT ID FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (
                a_name['content'])
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue

        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime()) + "\""
            # NOTE(review): this statement filters on pk_id while the SELECT
            # above reads a column named ID - confirm the real column name.
            sql = "update EntertainmentDB.tbl_fiction_name set add_time = %s where pk_id = %d;" % (
                now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        # Queue one job per chapter, in page order.
        for count, chapter in enumerate(a_book, 1):
            href = chapter.a['href']
            title = chapter.a.contents[0]
            url_a_book = self._url + href
            job_data = {
                "ID": self.id,
                "url": url_a_book,
                "title": title,
                "href": href,
                "count": count
            }
            if mode == "download":
                dic_queue = {
                    "type": "download",
                    "subtype": "download",
                    "self": self,
                    "data": job_data
                }
            elif mode == "update":
                dic_queue = {
                    "type": "download",
                    "subtype": "update",
                    "self": self,
                    "data": job_data
                }
            priority_queue.put(base.Job(2, dic_queue, self._url))

    return True