def _DownloadImg(self, chapter_num, download_path, dct_img_book):
    """Download the cover image and the comic pages of one chapter.

    Parameters
    ----------
    chapter_num : int
        Chapter number.
    download_path : str or unicode
        Directory to store the files in.
    dct_img_book : dict
        Data to store, in the form
        {
            'href': '',             # URL of the chapter
            'title': '',            # name of the chapter
            'src': '',              # URL of the chapter cover
            'download_url': [list], # URLs of all pages in the chapter
        }

    Returns
    -------
    success: True
    failed : False
    """
    # Title and cover path.
    title = dct_img_book['title']
    src = dct_img_book['src']
    download_path = '%s/comics/%s/%s/' % (download_path, self.keyword, title)

    # Download the cover image.
    count = 0
    file_name = 'fengmian.jpg'
    if not BaseRequest.DownloadData(src, download_path, file_name):
        print("download fengmian.jpg failed")
        return False

    # Download the comic pages; file names count up from 1.
    for url in dct_img_book['download_url']:
        count += 1
        file_name = '%d.jpg' % (count)
        for i in range(5):
            if not BaseRequest.DownloadData(url, download_path, file_name):
                print("download %s failed %d time" % (file_name, i + 1))
            else:
                break

    # Save the chapter information to the chapter table.
    sql_dict = collections.OrderedDict()
    sql_dict['ChapterNum'] = chapter_num            # chapter number
    sql_dict['ChapterName'] = "\"" + title + "\""   # chapter name
    sql_dict['PicNum'] = count                      # number of images
    sql_dict['Dept_ID'] = self.id                   # foreign key into the comic name table

    # Insert the row into the chapter table.
    if not self._ComicInsert('ComicChapter', sql_dict):
        print("insert ComicChapter table failed!")
    return True
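# The 5-attempt download loop above recurs throughout this module; a minimal
# sketch of a shared retry helper (hypothetical, not part of this codebase --
# it only assumes BaseRequest.DownloadData returns True on success):
def download_with_retry(url, path, file_name, attempts=5):
    """Try BaseRequest.DownloadData up to `attempts` times; True on success."""
    for i in range(attempts):
        if BaseRequest.DownloadData(url, path, file_name):
            return True
        print("download %s failed %d time" % (file_name, i + 1))
    return False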
# "async" became a reserved word in Python 3.7, so the handler carries a
# trailing underscore (PEP 8 convention for names that clash with keywords).
def async_(self, _json, _socket):
    """Asynchronous (concurrent) business handling.

    :param _json: incoming data; must be JSON
    :param _socket:
    :return:
    """
    # Instantiate the request.
    r = BaseRequest.new(_json, self, _socket)
    # Run the business logic.
    r.doIt()
def async_(self, dic, _socket):
    """Asynchronous (concurrent) business handling.

    :param dic: incoming data; must be a dict
    :param _socket:
    :return:
    """
    # Instantiate the request.
    r = BaseRequest.new(dic, _socket)
    # Remember the request tag.
    self.saveRequestTag(_socket, r.tag)
    # Run the business logic.
    r.doIt()
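# A minimal usage sketch for the handler above (hypothetical, not part of this
# codebase): a server loop step that decodes one frame from a client socket
# and dispatches it. `service` is assumed to be an instance of the class that
# defines async_, and the wire format is assumed to be UTF-8 JSON.
import json

def serve_once(service, _socket):
    payload = _socket.recv(65536)             # raw bytes from the client
    dic = json.loads(payload.decode("utf8"))  # business data as a dict
    service.async_(dic, _socket)              # concurrent business handling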
def _UpdataChapter(self, result, download_path=None):
    """Fetch chapters newer than the stored count and record them in kkmh_content.

    Parameters
    ----------
    result : tuple
        Database row for the comic: result[0] is its id, result[1] its
        name, result[4] the number of chapters already stored.
    download_path : str or unicode, optional
        Directory to store the files in.

    Yields
    ------
    dct_img_book : dict
        One entry per new chapter; the generator stops on failure.
    """
    keyword = result[1]
    chapter_num = result[4]
    self.id = result[0]

    # Request the search page for the keyword.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/e/search/'
    keyword_encode = keyword.encode('gbk', 'strict')
    button_encode = "搜索漫画".encode('gbk', 'strict')  # form button value expected by the site
    params = {
        'key': keyword_encode,
        'button': button_encode,
    }
    params = parse.urlencode(params).encode("gbk")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'gbk')
    if content_keyword is None:
        return None
    a_result = content_keyword.find_all('p', {'class': 'fl cover'})

    # Pull out the id keyword so we can visit each search result.
    for data in a_result:
        # The comic name must match exactly.
        if data.a.img['alt'] != keyword:
            continue
        url_keyword_content = self._url + "/" + data.a['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'gbk')
        if soup_keyword_content is None:
            return None

        # Collect the links to all chapters. The page lists them from the
        # latest chapter to the first, so walk them in page order while
        # counting the chapter number down.
        book = soup_keyword_content.find('div', {'class': 'plist pnormal', 'id': 'play_0'})
        a_book = []
        for data_content in book.ul:
            a = data_content.find('a')
            if a is not None and a != -1:
                a_book.append(a)
        now_chapter_num = len(a_book)
        for book in a_book:
            print(now_chapter_num, chapter_num)
            if now_chapter_num <= chapter_num:
                return None
            href = book['href']
            title = book['title']
            lst_img_book = []
            dct_img_book = {}
            title = title.replace(' ', '')

            # Download the content of the current chapter.
            url_a_book = self._url + href
            soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'gbk')
            if soup_a_book is None:
                return None
            url_list = url_a_book.split("/")
            comic_id = url_list[-2]
            chapter_id = url_list[-1][0:-5]
            for i in range(20):
                keyword_encode = parse.urlencode({"": self.keyword})
                title_encode = parse.urlencode({"": title})
                # Page file names are zero-padded to four digits.
                download_url = ("http://mhpic5.lineinfo.cn/mh160tuku/s/"
                                + keyword_encode[1:] + "_" + comic_id + "/"
                                + title_encode[1:] + "_" + chapter_id
                                + "/%04d.jpg" % (i + 1))
                lst_img_book.append(download_url)

            # Store the data in a structure for later saving.
            dct_img_book = {'href': href, 'title': title,
                            'chapter': now_chapter_num, 'download_url': lst_img_book}
            self.lst_kkmh_content.append(dct_img_book)
            now_chapter_num = now_chapter_num - 1
            yield dct_img_book
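# Why parse.urlencode({"": s})[1:] above: urlencode renders the single pair as
# "=<percent-encoded s>", so slicing off the leading "=" leaves only the
# encoded keyword. A more direct equivalent (sketch; the example string is
# illustrative only):
from urllib import parse

assert parse.urlencode({"": "一拳超人"})[1:] == parse.quote_plus("一拳超人")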
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search for the keyword and record the results in kkmh_content.

    Parameters
    ----------
    keyword : str or unicode
        Text to search for.
    mode : str or unicode
        download : download the comic
        update   : refresh its images
    download_path : str or unicode, optional
        Directory to store the files in.

    Returns
    -------
    success: True
    failed : False
    """
    # Request the search page for the keyword.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/e/search/'
    keyword_encode = keyword.encode('gbk', 'strict')
    button_encode = "搜索漫画".encode('gbk', 'strict')  # form button value expected by the site
    params = {
        'key': keyword_encode,
        'button': button_encode,
    }
    params = parse.urlencode(params).encode("gbk")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'gbk')
    if content_keyword is None:
        return False
    a_result = content_keyword.find_all('p', {'class': 'fl cover'})

    # Pull out the id keyword so we can visit each search result.
    for data in a_result:
        if mode == "download":
            # Skip comics that have already been downloaded.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (data.a.img['alt'])
            if self._EntertainmentSelect(sql):
                print("%s already downloaded; check the database" % data.a.img['alt'])
                continue
            # Wait for the previous comic to finish downloading.
            while True:
                if not priority_queue.empty():
                    print("threads count :%d" % threading.active_count())
                    print("queue size : %d" % (priority_queue.qsize()))
                    if threading.active_count() < 10:
                        StartComicThread(10)
                    time.sleep(60)
                    continue
                else:
                    break

        self.keyword = data.a.img['alt']
        print(self.keyword)
        url_keyword_content = self._url + "/" + data.a['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'gbk')
        if soup_keyword_content is None:
            return False

        # Comic metadata for the database.
        sql_dict = collections.OrderedDict()
        sql_dict['Name'] = "\"" + self.keyword + "\""   # name
        sql_dict['WatchNum'] = 0                        # view count
        sql_dict['Website'] = "\"" + self._url + "\""   # website

        # Collect the links to all chapters. The page lists them from the
        # latest chapter to the first, so iterate in reverse when queueing.
        book = soup_keyword_content.find('div', {'class': 'plist pnormal', 'id': 'play_0'})
        a_book = []
        for data_content in book.ul:
            a = data_content.find('a')
            if a is not None and a != -1:
                a_book.append(a)

        if mode == "download":
            a_author = soup_keyword_content.find('meta', {'property': 'og:novel:author'})
            a_category = soup_keyword_content.find('meta', {'property': 'og:novel:category'})
            a_img = soup_keyword_content.find('meta', {'property': 'og:image'})
            a_introduce = soup_keyword_content.find('p', {'id': 'intro'})
            IsFinish = soup_keyword_content.find('meta', {'property': 'og:novel:status'})
            if IsFinish['content'] == '连载中':
                a_isfinish = 0
            else:
                a_isfinish = 1

            # Download the comic cover.
            for i in range(5):
                if download_path is not None:
                    path = '%s/Comics/%s/' % (download_path, self.keyword)
                    if not BaseRequest.DownloadData(a_img['content'], path, "封面.jpg"):
                        print("download %s failed %d time" % ("封面.jpg", i + 1))
                    else:
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            # Store the comic information in the database.
            sql_dict = collections.OrderedDict()
            sql_dict['Name'] = "\"" + self.keyword + "\""       # name
            sql_dict['WatchNum'] = 0                            # view count
            sql_dict['Website'] = "\"" + self._url + "\""       # website
            sql_dict['ChapterNum'] = len(a_book)                # total number of chapters
            sql_dict['IsFinish'] = a_isfinish                   # finished or still serializing
            sql_dict['Introduce'] = "\"" + a_introduce.a.contents[0] + "\""  # synopsis
            sql_dict['Author'] = "\"" + a_author['content'] + "\""           # author
            sql_dict['Img'] = "\"" + src + "\""                 # cover image
            sql_dict['Type'] = "\"" + a_category['content'] + "\""           # genre
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("insert ComicName table failed!")
                continue

            # Fetch the unique comic id.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (data.a.img['alt'])
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" % (now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        count = 1
        for book in reversed(a_book):
            href = book['href']
            title = book['title']

            # Queue the current chapter for download.
            url_a_book = self._url + href
            data = {"url": url_a_book, "title": title, "href": href, "count": count}
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": data}
            priority_queue.put(base.Job(2, dic_queue, self._url))
            count += 1
    return True
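# The polling loop above (block until the previous comic's jobs drain) repeats
# across the scrapers in this module; a factored sketch under the same
# assumptions (priority_queue, StartComicThread, and the 10-thread cap come
# from this codebase; the helper itself is hypothetical):
def wait_for_queue_drain(poll_seconds=60, max_threads=10):
    while not priority_queue.empty():
        print("threads count :%d" % threading.active_count())
        print("queue size : %d" % priority_queue.qsize())
        if threading.active_count() < max_threads:
            StartComicThread(max_threads)  # top the worker pool back up
        time.sleep(poll_seconds)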
def _UpdataChapter(self, result, download_path=None):
    """Fetch chapters newer than the stored count and record them in kkmh_content.

    Parameters
    ----------
    result : tuple
        Database row for the comic: result[0] is its id, result[1] its
        name, result[4] the number of chapters already stored.
    download_path : str or unicode, optional
        Directory to store the files in.

    Yields
    ------
    dct_img_book : dict
        One entry per new chapter; the generator stops on failure.
    """
    keyword = result[1]
    chapter_num = result[4]
    self.id = result[0]

    # Request the search page for the keyword.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/search.html'
    keyword_encode = keyword.encode('big5', 'strict')
    params = {
        'keyword': keyword_encode,
        'searchtype': 'all',
    }
    params = parse.urlencode(params).encode("big5")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'big5')
    if content_keyword is None:
        return None
    a_result = content_keyword.find_all('span', {'class': 'covertxt'})

    # Pull out the id keyword so we can visit each search result.
    for data in a_result:
        data_next_siblings = data.find_next_siblings()
        print(data_next_siblings[0]['title'], keyword)
        if data_next_siblings[0]['title'] != keyword:
            continue
        url_keyword_content = self._url + "/" + data_next_siblings[0]['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'big5')
        if soup_keyword_content is None:
            return None

        # Collect the links to all chapters. This site lists them from the
        # first chapter to the latest, so walk them in reverse while
        # counting the chapter number down.
        save_content = soup_keyword_content.find_all('table', {'width': '688', 'align': 'center'})
        if save_content is None:
            return None
        a_book = []
        for data_content in save_content[0].tbody:
            for data_td in data_content:
                a = data_td.find('a')
                if a is not None and a != -1:
                    a_book.append(a)
        now_chapter_num = len(a_book)
        for book in reversed(a_book):
            print(now_chapter_num, chapter_num)
            if now_chapter_num <= chapter_num:
                return None
            href = book['href']
            title = book.contents[0]
            lst_img_book = []
            dct_img_book = {}

            # Download the content of the current chapter.
            url_a_book = self._url + href
            soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'big5')
            if soup_a_book is None:
                return None
            content_img_book = soup_a_book.find_all('img', {'oncontextmenu': 'return false'})
            img_num = soup_a_book.find_all('option', value=True)
            for num in range(len(img_num)):
                # Page file names are zero-padded to three digits.
                img = content_img_book[0]['src']
                img = img[0:-7] + "%03d.jpg" % (num + 1)
                lst_img_book.append(img)

            # Store the data in a structure for later saving.
            dct_img_book = {'href': href, 'title': title,
                            'chapter': now_chapter_num, 'download_url': lst_img_book}
            self.lst_kkmh_content.append(dct_img_book)
            now_chapter_num = now_chapter_num - 1
            yield dct_img_book
def run():
    while True:
        # Exit the thread after 300 seconds without data.
        try:
            chunk = priority_queue.get(block=True, timeout=300)
        except queue.Empty:
            return
        chunkData = chunk.description

        # Download a comic chapter.
        if chunkData["type"] == "download":
            data = chunkData["data"]
            subtype = chunkData["subtype"]
            selfComic = chunkData["self"]
            title = data["title"]
            url_a_book = data["url"]
            count = data["count"]
            href = data["href"]
            lst_img_book = []
            dct_img_book = {}
            if "http://www.kuaikanmanhua.com" in url_a_book:
                soup_a_book = BaseRequest.GetUrlSoup(url_a_book)
                if soup_a_book is not None:
                    print(count, title)
                    # Collect and save the image URLs of this chapter.
                    content_img_book = soup_a_book.find_all('img', {'class': 'kklazy', 'title': title})
                    for img_book in content_img_book:
                        lst_img_book.append(img_book['data-kksrc'].replace('amp;', ''))
            elif "http://www.cartoonmad.com" in url_a_book:
                title = title.replace(' ', '')
                soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'big5')
                if soup_a_book is not None:
                    print(count, title)
                    content_img_book = soup_a_book.find_all('img', {'oncontextmenu': 'return false'})
                    img_num = soup_a_book.find_all('option', value=True)
                    for num in range(len(img_num)):
                        # Page file names are zero-padded to three digits.
                        img = content_img_book[0]['src']
                        img = img[0:-7] + "%03d.jpg" % (num + 1)
                        lst_img_book.append(img)
            elif "http://www.mh160.com" in url_a_book:
                title = title.replace(' ', '')
                soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'gbk')
                if soup_a_book is not None:
                    print(count, title)
                    url_list = url_a_book.split("/")
                    comic_id = url_list[-2]
                    chapter_id = url_list[-1][0:-5]
                    for i in range(20):
                        keyword_encode = parse.urlencode({"": selfComic.keyword})
                        title_encode = parse.urlencode({"": title})
                        # Page file names are zero-padded to four digits.
                        download_url = ("http://mhpic5.lineinfo.cn/mh160tuku/s/"
                                        + keyword_encode[1:] + "_" + comic_id + "/"
                                        + title_encode[1:] + "_" + chapter_id
                                        + "/%04d.jpg" % (i + 1))
                        lst_img_book.append(download_url)
            else:
                print("%s download failed" % (title))

            # Store the data in a structure for later saving.
            dct_img_book = {'href': href, 'title': title, 'chapter': count, 'download_url': lst_img_book}
            dic_queue = {}
            if subtype == "download":
                dic_queue = {"type": "insert", "data": dct_img_book, "self": selfComic}
            elif subtype == "update":
                dic_queue = {"type": "update", "data": dct_img_book, "self": selfComic}
            if not selfComic._InsertImg(dct_img_book['chapter'], dct_img_book, selfComic.download_path):
                print("insert %s failed" % (dct_img_book["title"]))

        # Insert into the database.
        elif chunkData["type"] == "insert":
            dct_img_book = chunkData["data"]
            selfComic = chunkData["self"]
            if not selfComic._InsertImg(dct_img_book['chapter'], dct_img_book, selfComic.download_path):
                print("insert %s failed" % (dct_img_book["title"]))
            del dct_img_book["download_url"][:]

        # Update the database.
        elif chunkData["type"] == "update":
            dct_img_book = chunkData["data"]
            cComics = Comics()
            if not cComics._UpdateImg(dct_img_book):
                print("update %s failed" % (dct_img_book["title"]))
            print("update")
            print(dct_img_book["chapter"], dct_img_book["title"])
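# StartComicThread is called elsewhere in this module but not defined in this
# excerpt; a plausible sketch (an assumption, not the project's actual
# implementation) that starts daemon workers running run() to drain
# priority_queue:
def StartComicThread(count):
    for _ in range(count):
        t = threading.Thread(target=run, daemon=True)  # dies with the process
        t.start()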
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search for the keyword and record the results in kkmh_content.

    Parameters
    ----------
    keyword : str or unicode
        Text to search for.
    mode : str or unicode
        download : download the comic
        update   : refresh its images
    download_path : str or unicode, optional
        Directory to store the files in.

    Returns
    -------
    success: True
    failed : False
    """
    # Request the search page for the keyword.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/search.html'
    keyword_encode = keyword.encode('big5', 'strict')
    params = {
        'keyword': keyword_encode,
        'searchtype': 'all',
    }
    params = parse.urlencode(params).encode("big5")
    content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'big5')
    if content_keyword is None:
        return False
    a_result = content_keyword.find_all('span', {'class': 'covertxt'})

    # Pull out the id keyword so we can visit each search result.
    for data in a_result:
        data_next_siblings = data.find_next_siblings()
        if mode == "download":
            # Skip comics that have already been downloaded.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (data_next_siblings[0]['title'])
            if self._EntertainmentSelect(sql):
                print(data_next_siblings[0]['title'])
                continue
            # Wait for the previous comic to finish downloading.
            while True:
                if not priority_queue.empty():
                    print("threads count :%d" % threading.active_count())
                    print("queue size : %d" % (priority_queue.qsize()))
                    if threading.active_count() < 10:
                        StartComicThread(10)
                    time.sleep(60)
                    continue
                else:
                    break

        self.keyword = data_next_siblings[0]['title']
        print(self.keyword)
        url_keyword_content = self._url + "/" + data_next_siblings[0]['href']
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'big5')
        if soup_keyword_content is None:
            return False

        # Comic metadata for the database.
        sql_dict = collections.OrderedDict()
        sql_dict['Name'] = "\"" + self.keyword + "\""   # name
        sql_dict['WatchNum'] = 0                        # view count
        sql_dict['Website'] = "\"" + self._url + "\""   # website
        save_content = soup_keyword_content.find_all('td', {'width': 276})
        if save_content is None:
            return False
        sql_dict['Type'] = "\"" + save_content[1].a.contents[0].strip() + "\""
        sql_dict['Author'] = "\"" + save_content[3].contents[1].strip() + "\""
        # chap9.gif marks a finished comic; chap1.gif (or anything else) an ongoing one.
        if save_content[5].contents[4]['src'].strip() == "/image/chap9.gif":
            sql_dict['IsFinish'] = 1
        else:
            sql_dict['IsFinish'] = 0
        save_content = soup_keyword_content.find_all('table', {'width': 688, 'cellspacing': "8"})
        if save_content is not None:
            sql_dict['Introduce'] = "\"" + save_content[0].tr.td.contents[0].strip() + "\""
        else:
            sql_dict['Introduce'] = ''
        save_content = soup_keyword_content.find_all('img', {'width': '240', 'height': '320'})
        a_img = ''
        if save_content is not None:
            a_img = self._url + save_content[0]['src']

        # Collect the links to all chapters. This site lists them from the
        # first chapter to the latest.
        save_content = soup_keyword_content.find_all('table', {'width': '688', 'align': 'center'})
        if save_content is None:
            return False
        a_book = []
        for data_content in save_content[0].tbody:
            for data_td in data_content:
                a = data_td.find('a')
                if a is not None and a != -1:
                    a_book.append(a)

        if mode == "download":
            # Download the comic cover.
            for i in range(5):
                if download_path is not None:
                    path = '%s/Comics/%s/' % (download_path, self.keyword)
                    if not BaseRequest.DownloadData(a_img, path, "封面.jpg"):
                        print("download %s failed %d time" % ("封面.jpg", i + 1))
                    else:
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            # Store the comic information in the database.
            sql_dict['Img'] = "\"" + src + "\""             # cover image
            sql_dict['ChapterNum'] = len(a_book)            # total number of chapters
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("insert ComicName table failed!")
                continue

            # Fetch the unique comic id.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (self.keyword)
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" % (now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        count = 1
        for book in a_book:
            href = book['href']
            title = book.contents[0]

            # Queue the current chapter for download.
            url_a_book = self._url + href
            data = {"url": url_a_book, "title": title, "href": href, "count": count}
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": data}
            priority_queue.put(base.Job(2, dic_queue, self._url))
            count += 1
    return True
def _UpdataChapter(self, result, download_path=None):
    """Fetch chapters newer than the stored count and record them in kkmh_content.

    Parameters
    ----------
    result : tuple
        Database row for the comic: result[0] is its id, result[1] its
        name, result[4] the number of chapters already stored.
    download_path : str or unicode, optional
        Directory to store the files in.

    Yields
    ------
    dct_img_book : dict
        One entry per new chapter; the generator stops on failure.
    """
    keyword = result[1]
    chapter_num = result[4]
    self.id = result[0]

    # Request the search page for the keyword.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/web/topic/search?keyword' + parse.urlencode({"": keyword})
    content_keyword = BaseRequest.GetUrlContent(url_keyword)
    if not content_keyword:
        return None

    # Parse the returned content.
    content_keyword_json = json.loads(content_keyword.decode("utf8"))
    if not content_keyword_json:
        return None

    # Pull out the id keyword so we can visit each search result.
    for data in content_keyword_json['data']['topic']:
        # The comic name must match exactly.
        if data['title'] != keyword:
            continue
        url_keyword_content = self._url + '/web/topic/' + str(data['id'])
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
        if soup_keyword_content is None:
            return None

        # Collect the links to all chapters. The page lists them from the
        # latest chapter to the first, so walk them in page order while
        # counting the chapter number down.
        a_book = soup_keyword_content.find_all('a', {'class': 'article-img'})
        now_chapter_num = len(a_book)
        for book in a_book:
            print(now_chapter_num, chapter_num)
            if now_chapter_num <= chapter_num:
                return None
            href = book['href']
            title = book['title']
            lst_img_book = []
            dct_img_book = {}

            # Download the content of the current chapter.
            url_a_book = self._url + href
            soup_a_book = BaseRequest.GetUrlSoup(url_a_book)
            if soup_a_book is None:
                return None

            # Collect and save the image URLs of this chapter.
            content_img_book = soup_a_book.find_all('img', {'class': 'kklazy', 'title': title})
            for img_book in content_img_book:
                lst_img_book.append(img_book['data-kksrc'].replace('amp;', ''))

            # Store the data in a structure for later saving.
            dct_img_book = {'href': href, 'title': title,
                            'chapter': now_chapter_num, 'download_url': lst_img_book}
            self.lst_kkmh_content.append(dct_img_book)
            now_chapter_num = now_chapter_num - 1
            yield dct_img_book
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search for the keyword and record the results in kkmh_content.

    Parameters
    ----------
    keyword : str or unicode
        Text to search for.
    mode : str or unicode
        download : download the comic
        update   : refresh its images
    download_path : str or unicode, optional
        Directory to store the files in.

    Returns
    -------
    success: True
    failed : False
    """
    # Request the search page for the keyword.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/web/topic/search?keyword' + parse.urlencode({"": keyword})
    content_keyword = BaseRequest.GetUrlContent(url_keyword)
    if content_keyword is None:
        return False

    # Parse the returned content.
    content_keyword_json = json.loads(content_keyword.decode("utf8"))
    if not content_keyword_json:
        return False

    # Pull out the id keyword so we can visit each search result.
    for data in content_keyword_json['data']['topic']:
        if mode == "download":
            # Skip comics that have already been downloaded.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (data['title'])
            if self._EntertainmentSelect(sql):
                print(data['title'])
                continue
            # Wait for the previous comic to finish downloading.
            while True:
                if not priority_queue.empty():
                    print("threads count :%d" % threading.active_count())
                    print("queue size : %d" % (priority_queue.qsize()))
                    if threading.active_count() < 10:
                        StartComicThread(10)
                    time.sleep(60)
                    continue
                else:
                    break

        self.keyword = data['title']
        url_keyword_content = self._url + '/web/topic/' + str(data['id'])
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
        if soup_keyword_content is None:
            return False

        # Collect the links to all chapters. The page lists them from the
        # latest chapter to the first, so iterate in reverse when queueing.
        a_book = soup_keyword_content.find_all('a', {'class': 'article-img'})

        if mode == "download":
            a_author = soup_keyword_content.find('div', {'class': 'author-nickname'})
            a_introduce = soup_keyword_content.find('div', {'class': 'switch-content'})
            a_img = soup_keyword_content.find('img', {'class': 'kk-img'})

            # Download the comic cover.
            for i in range(5):
                if download_path is not None:
                    path = '%s/Comics/%s/' % (download_path, self.keyword)
                    if not BaseRequest.DownloadData(a_img['src'], path, "封面.jpg"):
                        print("download %s failed %d time" % ("封面.jpg", i + 1))
                    else:
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

            # Store the comic information in the database.
            sql_dict = collections.OrderedDict()
            sql_dict['Name'] = "\"" + data['title'] + "\""      # name
            sql_dict['WatchNum'] = 0                            # view count
            sql_dict['Website'] = "\"" + self._url + "\""       # website
            sql_dict['ChapterNum'] = len(a_book)                # total number of chapters
            sql_dict['IsFinish'] = 0                            # finished or still serializing
            sql_dict['Introduce'] = "\"" + a_introduce.p.contents[0].replace('\"', '') + "\""  # synopsis
            sql_dict['Author'] = "\"" + a_author.contents[0] + "\""  # author
            sql_dict['Img'] = "\"" + src + "\""                 # cover image
            sql_dict['Type'] = "\"" + self.type + "\""          # genre
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("insert ComicName table failed!")
                continue

            # Fetch the unique comic id.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (data['title'])
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" % (now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        count = 1
        for book in reversed(a_book):
            href = book['href']
            title = book['title']
            src = book.img['src']

            # Queue the current chapter for download.
            url_a_book = self._url + href
            data = {"url": url_a_book, "title": title, "src": src, "href": href, "count": count}
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": data}
            priority_queue.put(base.Job(2, dic_queue, self._url))
            count += 1

    p.spawn(run)
    p.join()
    return True
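# `p` above is not defined in this excerpt; given the p.spawn(run) / p.join()
# calls, it is presumably a module-level gevent pool. A sketch of the assumed
# setup (an assumption, not the project's confirmed configuration):
import gevent.pool

p = gevent.pool.Pool(10)  # caps concurrently running download greenlets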
def _GetContentByKeyword(self, keyword):
    """Search for the keyword and record the results in kkmh_content.

    Parameters
    ----------
    keyword : str or unicode
        Text to search for.

    Yields
    ------
    dct_img_book : dict
        One entry per chapter; the generator stops on failure.
    """
    # Request the search page for the keyword.
    self.keyword = keyword
    url_keyword = self._url + '/web/topic/search?keyword' + parse.urlencode({"": keyword})
    content_keyword = BaseRequest.GetUrlContent(url_keyword)
    if not content_keyword:
        return None

    # Parse the returned content.
    content_keyword_json = json.loads(content_keyword.decode("utf8"))
    if not content_keyword_json:
        return None

    # Pull out the id keyword so we can visit the first search result.
    url_keyword_content = self._url + '/web/topic/' + str(content_keyword_json['data']['topic'][0]['id'])
    soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
    if soup_keyword_content is None:
        return None

    # Store the comic information in the database.
    sql_dict = collections.OrderedDict()
    sql_dict['Name'] = "\"" + self.keyword + "\""   # name
    sql_dict['Num'] = 0                             # number
    sql_dict['Website'] = "\"" + self._url + "\""   # website
    sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
    if not self._ComicInsert('ComicName', sql_dict):
        print("insert ComicName table failed!")

    # Fetch the unique comic id.
    sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (self.keyword)
    max_id = self._ComicSelect(sql)
    self.id = max_id[0][0]

    # Collect the links to all chapters. The page lists them from the
    # latest chapter to the first, so iterate in reverse.
    a_book = soup_keyword_content.find_all('a', {'class': 'article-img'})
    for book in reversed(a_book):
        href = book['href']
        title = book['title']
        src = book.img['src']
        lst_img_book = []
        dct_img_book = {}

        # Download the content of the current chapter.
        url_a_book = self._url + href
        soup_a_book = BaseRequest.GetUrlSoup(url_a_book)
        if soup_a_book is None:
            return None

        # Collect and save the image URLs of this chapter.
        content_img_book = soup_a_book.find_all('img', {'class': 'kklazy', 'title': title})
        for img_book in content_img_book:
            lst_img_book.append(img_book['data-kksrc'].replace('amp;', ''))

        # Store the data in a structure for later saving.
        dct_img_book = {'href': href, 'title': title, 'src': src, 'download_url': lst_img_book}
        self.lst_kkmh_content.append(dct_img_book)
        yield dct_img_book
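# A minimal usage sketch for the generator above. The class name Kkmh and the
# search keyword are assumptions for illustration; _DownloadImg is the method
# defined earlier in this module and is assumed to live on the same class.
comic = Kkmh()
for chapter_num, dct_img_book in enumerate(comic._GetContentByKeyword("妖神记"), start=1):
    comic._DownloadImg(chapter_num, "/data", dct_img_book)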
def _InsertFiction(self, chapter_num, dct_img_book, download_path=None):
    """Save one fiction chapter to disk and record it in the chapter table.

    Parameters
    ----------
    chapter_num : int
        Chapter number.
    dct_img_book : dict
        Data to store, in the form
        {
            'href': '',     # URL of the chapter
            'title': '',    # name of the chapter
            'chapter': 0,   # chapter number
            'content': '',  # chapter text
        }
    download_path : str or unicode, optional
        Directory to store the files in.

    Returns
    -------
    success: True
    failed : False
    """
    # Title and storage path.
    title = dct_img_book['title']
    content = dct_img_book['content']
    ID = self.id
    chapter_num = dct_img_book['chapter']
    path = '%s/Fiction/%s/' % (download_path, self.keyword)

    # Save the chapter text; files are named after the chapter title.
    file_name = '%s.txt' % (title)
    BaseRequest.SaveData(path, file_name, content)

    # Save the chapter information to the chapter table.
    sql_dict = collections.OrderedDict()
    sql_dict['ChapterNum'] = chapter_num            # chapter number
    sql_dict['ChapterName'] = "\"" + title + "\""   # chapter name
    sql_dict['ContentUrl'] = ("\"" + "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Fiction/"
                              + self.keyword + "/" + file_name + "\"")  # URL of the chapter text
    sql_dict['Dept_ID'] = ID                        # foreign key into the fiction name table

    # Inserting too fast can fail; retry up to 10 times, waiting 2 s between tries.
    for i in range(10):
        if not self._EntertainmentInsert('FictionChapter', sql_dict):
            print("insert FictionChapter table failed!")
            time.sleep(2)
        else:
            break
    return True
def _InsertImg(self, chapter_num, dct_img_book, download_path=None):
    """Download the pages of one chapter and record them in the database.

    Parameters
    ----------
    chapter_num : int
        Chapter number.
    dct_img_book : dict
        Data to store, in the form
        {
            'href': '',             # URL of the chapter
            'title': '',            # name of the chapter
            'chapter': 0,           # chapter number
            'download_url': [list], # URLs of all pages in the chapter
        }
    download_path : str or unicode, optional
        Directory to store the files in.

    Returns
    -------
    success: True
    failed : False
    """
    # Title and storage path.
    title = dct_img_book['title']
    ID = self.id
    chapter_num = dct_img_book['chapter']
    path = '%s/Comics/%s/%s/' % (download_path, self.keyword, title)

    # Download the pages; file names count up from 1.
    success_count = 0
    count = 1
    for url in dct_img_book['download_url']:
        file_name = '%d.jpg' % (count)
        if not BaseRequest.DownloadData(url, path, file_name):
            print("download %s failed" % (file_name))
        else:
            success_count = success_count + 1
        count += 1

    # Save the chapter information to the chapter table.
    sql_dict = collections.OrderedDict()
    sql_dict['ChapterNum'] = chapter_num            # chapter number
    sql_dict['ChapterName'] = "\"" + title + "\""   # chapter name
    sql_dict['PicNum'] = success_count              # number of images
    sql_dict['Dept_ID'] = ID                        # foreign key into the comic name table

    # Inserting too fast can fail; retry up to 10 times, waiting 2 s between tries.
    for i in range(10):
        if not self._EntertainmentInsert('ComicChapter', sql_dict):
            print("insert ComicChapter table failed!")
            time.sleep(2)
        else:
            break

    # Build one row per page, counting up from 1, for a batch insert.
    key = "Page_Num, Comic_ID, Comic_ChapterNum, Img_src"
    value = []
    for i in range(success_count):
        file_name = '%d.jpg' % (i + 1)
        src = ("https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/"
               + self.keyword + "/" + title + "/" + file_name)
        value.append((i + 1, ID, chapter_num, src))

    # Inserting too fast can fail; retry up to 10 times, waiting 2 s between tries.
    for i in range(10):
        if not self._EntertainmentInsertMany('ComicImgSrc', key, value):
            print("insert ComicImgSrc table failed!")
            time.sleep(2)
        else:
            break
    return True
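# _EntertainmentInsertMany is used above but not shown in this excerpt; a
# plausible sketch of the batched insert it performs. The driver, the
# self.cursor/self.conn attributes, and the error handling are assumptions
# for illustration (MySQLdb/pymysql-style executemany):
def _EntertainmentInsertMany(self, table, key, value):
    if not value:
        return True
    placeholders = ", ".join(["%s"] * len(value[0]))
    sql = "INSERT INTO EntertainmentDB.%s(%s) VALUES (%s);" % (table, key, placeholders)
    try:
        self.cursor.executemany(sql, value)  # value: list of row tuples
        self.conn.commit()
        return True
    except Exception:
        self.conn.rollback()
        return False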
def run(self):
    IsHaveData = True
    while IsHaveData:
        # Exit the thread after 60 seconds without data.
        try:
            chunk = self.queue.get(block=True, timeout=60)
        except queue.Empty:
            IsHaveData = False
            continue
        chunkData = chunk.description

        # Download a fiction chapter.
        if chunkData["type"] == "download":
            data = chunkData["data"]
            subtype = chunkData["subtype"]
            selfFiction = chunkData["self"]
            title = data["title"]
            url_a_book = data["url"]
            count = data["count"]
            href = data["href"]
            ID = data["ID"]
            dct_book = {}
            soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'gbk')
            if soup_a_book is not None:
                print(count, title)
                # Extract and save the chapter text.
                if "http://www.biquge.com.tw" in url_a_book:
                    content_book = soup_a_book.find('div', {'id': 'content'})
                    content = ""
                    for x in content_book.contents:
                        if "qidian" not in str(x) and "http" not in str(x):
                            content = content + ''.join(str(x))
                    # Normalize quotes and no-break-space residue.
                    content = content.replace("\"", "“")
                    content = content.replace("'", "‘")
                    content = content.replace("\\xC2\\xA0", " ")
                    content = content.replace("\xa0", " ")

                    # Store the data in a structure for later saving.
                    dct_book = {
                        'href': href,
                        'title': title,
                        'chapter': count,
                        'content': content
                    }
                    dic_queue = {}
                    if subtype == "download":
                        dic_queue = {
                            "type": "insert",
                            "data": dct_book,
                            "self": selfFiction
                        }
                    elif subtype == "update":
                        dic_queue = {
                            "type": "update",
                            "data": dct_book,
                            "self": selfFiction
                        }
                    self.queue.put(Job(1, dic_queue))
            else:
                print("%s download failed" % (title))

        # Insert into the database.
        elif chunkData["type"] == "insert":
            dct_book = chunkData["data"]
            selfFiction = chunkData["self"]
            if not selfFiction._InsertFiction(dct_book['chapter'], dct_book,
                                              selfFiction.download_path):
                print("insert %s failed" % (dct_book["title"]))
def _UpdataChapter(self, result):
    """Fetch chapters newer than the stored count and yield them for saving.

    Parameters
    ----------
    result : tuple
        Database row for the book: result[0] is its id, result[1] its
        name, result[4] the number of chapters already stored.

    Yields
    ------
    dct_book : dict
        One entry per new chapter; the generator stops on failure.
    """
    keyword = result[1]
    chapter_num = result[4]
    self.id = result[0]

    # Request the search page for the keyword.
    self.keyword = keyword
    url_keyword = self._url + '/modules/article/soshu.php?searchkey=' + parse.quote(
        keyword, encoding='gbk', errors='replace')
    content_keyword = BaseRequest.GetUrlSoup(url_keyword, 'gbk')
    if content_keyword is None:
        return None

    # Parse the returned content. A result table means a search listing;
    # otherwise the search jumped straight to a book page.
    find_result = []
    if content_keyword.find('caption'):
        a_result = content_keyword.find_all('tr', {'id': 'nr'})
        if a_result is None:
            return None
        for result in a_result:
            find_result.append({
                "name": result.td.a.contents[0],
                "url": result.td.a['href']
            })
    else:
        a_url = content_keyword.find('meta', {'property': 'og:url'})
        if a_url is None:
            return None
        a_name = content_keyword.find('meta', {'property': 'og:novel:book_name'})
        if a_name is None:
            return None
        find_result.append({
            "name": a_name["content"],
            "url": a_url['content']
        })

    # Pull out the id keyword so we can visit each search result.
    for result in find_result:
        # The book name must match exactly.
        if result["name"] != keyword:
            continue
        soup_keyword_content = BaseRequest.GetUrlSoup(result["url"], 'gbk')
        if soup_keyword_content is None:
            return None

        # Collect the links to all chapters. This site lists them from the
        # first chapter to the latest, so walk them in reverse while
        # counting the chapter number down.
        a_list = soup_keyword_content.find('div', {'id': 'list'})
        a_book = a_list.dl.find_all('dd')
        now_chapter_num = len(a_book)
        for book in reversed(a_book):
            print(now_chapter_num, chapter_num)
            if now_chapter_num <= chapter_num:
                return None
            href = book.a['href']
            title = book.a.contents[0]
            url_a_book = self._url + href
            dct_book = {}
            soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'gbk')
            if soup_a_book is None:
                return None

            # Extract and save the chapter text.
            content_book = soup_a_book.find('div', {'id': 'content'})
            content = ""
            for x in content_book.contents:
                if "qidian" not in str(x) and "http" not in str(x):
                    content = content + ''.join(str(x))
            # Normalize quotes and no-break-space residue.
            content = content.replace("\"", "“")
            content = content.replace("'", "‘")
            content = content.replace("\\xC2\\xA0", " ")
            content = content.replace("\xa0", " ")
            dct_book = {
                'href': href,
                'title': title,
                'chapter': now_chapter_num,
                'content': content
            }
            now_chapter_num = now_chapter_num - 1
            yield dct_book
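# The replace chain above (normalize quotes and no-break-space residue)
# appears in both the fiction worker and this updater; a factored sketch
# (hypothetical helper, not part of this codebase):
def clean_fiction_text(content):
    for old, new in (("\"", "“"), ("'", "‘"), ("\\xC2\\xA0", " "), ("\xa0", " ")):
        content = content.replace(old, new)
    return content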
def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search for the keyword and queue the chapters for download.

    Parameters
    ----------
    keyword : str or unicode
        Text to search for.
    mode : str or unicode
        download : download the fiction
        update   : refresh its chapters
    download_path : str or unicode, optional
        Directory to store the files in.

    Returns
    -------
    success: True
    failed : False
    """
    # Request the search page for the keyword.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/modules/article/soshu.php?searchkey=' + parse.quote(
        keyword, encoding='gbk', errors='replace')
    content_keyword = BaseRequest.GetUrlSoup(url_keyword, 'gbk')
    if content_keyword is None:
        return False

    # Parse the returned content. A result table means a search listing;
    # otherwise the search jumped straight to a book page.
    find_result = []
    if content_keyword.find('caption'):
        a_result = content_keyword.find_all('tr', {'id': 'nr'})
        if a_result is None:
            return False
        for result in a_result:
            find_result.append({
                "name": result.td.a.contents[0],
                "url": result.td.a['href']
            })
    else:
        a_url = content_keyword.find('meta', {'property': 'og:url'})
        if a_url is None:
            return False
        a_name = content_keyword.find('meta', {'property': 'og:novel:book_name'})
        if a_name is None:
            return False
        find_result.append({
            "name": a_name["content"],
            "url": a_url['content']
        })

    for result in find_result:
        if mode == "download":
            # Skip fiction that has already been downloaded.
            sql = "SELECT * FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (result["name"])
            if self._EntertainmentSelect(sql):
                print(result["name"])
                continue
            # Wait for the previous fiction to finish downloading.
            while True:
                if not priority_queue.empty():
                    print("threads count :%d" % threading.active_count())
                    print("queue size : %d" % (priority_queue.qsize()))
                    if threading.active_count() < 10:
                        StartFictionThread(10)
                    time.sleep(60)
                    continue
                else:
                    break

        self.keyword = result["name"]
        soup_keyword_content = BaseRequest.GetUrlSoup(result["url"], 'gbk')
        if soup_keyword_content is None:
            return False
        a_name = soup_keyword_content.find('meta', {'property': 'og:novel:book_name'})
        a_introduce = soup_keyword_content.find('meta', {'property': 'og:description'})
        a_image = soup_keyword_content.find('meta', {'property': 'og:image'})
        a_category = soup_keyword_content.find('meta', {'property': 'og:novel:category'})
        a_author = soup_keyword_content.find('meta', {'property': 'og:novel:author'})
        a_url = soup_keyword_content.find('meta', {'property': 'og:novel:read_url'})
        a_status = soup_keyword_content.find('meta', {'property': 'og:novel:status'})
        a_list = soup_keyword_content.find('div', {'id': 'list'})
        a_book = a_list.dl.find_all('dd')

        if mode == "download":
            # Download the cover image.
            for i in range(5):
                if download_path is not None:
                    path = '%s/Fiction/%s/' % (download_path, self.keyword)
                    if not BaseRequest.DownloadData(a_image['content'], path, "封面.jpg"):
                        print("download %s failed %d time" % ("封面.jpg", i + 1))
                    else:
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Fiction/" + self.keyword + "/" + "封面.jpg"

            # Store the fiction information in the database.
            sql_dict = collections.OrderedDict()
            sql_dict['name'] = "\"" + a_name['content'] + "\""  # name
            sql_dict['watch_count'] = 0                         # view count
            sql_dict['website'] = "\"" + self._url + "\""       # website
            sql_dict['chapter_count'] = len(a_book)             # total number of chapters
            sql_dict['introduce'] = "\"" + a_introduce['content'] + "\""  # synopsis
            sql_dict['author'] = "\"" + a_author['content'] + "\""        # author
            sql_dict['cover_img_src'] = "\"" + src + "\""       # cover image
            sql_dict['type'] = "\"" + a_category['content'] + "\""        # genre
            sql_dict['add_time'] = "\"" + time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime()) + "\""   # download time
            if "连载中" in a_status['content']:
                sql_dict['is_finish'] = 0                       # still serializing
            else:
                sql_dict['is_finish'] = 1
            if not self._EntertainmentInsert('tbl_fiction_name', sql_dict):
                print("insert tbl_fiction_name table failed!")
                continue

            # Fetch the unique fiction id.
            sql = "SELECT ID FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (a_name['content'])
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            sql = "update EntertainmentDB.tbl_fiction_name set add_time = %s where pk_id = %d;" % (
                now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))

        count = 1
        for book in a_book:
            href = book.a['href']
            title = book.a.contents[0]

            # Queue the current chapter for download.
            url_a_book = self._url + href
            data = {
                "ID": self.id,
                "url": url_a_book,
                "title": title,
                "href": href,
                "count": count
            }
            if mode == "download":
                dic_queue = {
                    "type": "download",
                    "subtype": "download",
                    "self": self,
                    "data": data
                }
            elif mode == "update":
                dic_queue = {
                    "type": "download",
                    "subtype": "update",
                    "self": self,
                    "data": data
                }
            priority_queue.put(base.Job(2, dic_queue, self._url))
            count += 1
    return True