def _GetContentByKeyword(self, keyword, mode, download_path=None):
    """Search for content by keyword and record what is found in the
    kkmh_content structure.

    Parameters
    ----------
    keyword : str or unicode
        Text to search for.
    mode : str or unicode
        "download": download the comic.
        "update": refresh the images.
    download_path : str or unicode, optional
        Local directory to store the downloads in.

    Returns
    -------
    success : True
    failed : False
    """
    # Request the keyword search page.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/web/topic/search?keyword' + parse.urlencode({"": keyword})
    content_keyword = BaseRequest.GetUrlContent(url_keyword)
    if content_keyword is None:
        return False
    # Parse the returned content.
    content_keyword_json = json.loads(content_keyword.decode("utf8"))
    if not content_keyword_json:
        return False
    # Take the id of each hit and visit the matching topic page.
    for data in content_keyword_json['data']['topic']:
        if mode == "download":
            # Skip comics that have already been downloaded.
            sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (data['title'])
            if self._EntertainmentSelect(sql):
                print(data['title'])
                continue
            # Wait until the previous comic has finished downloading.
            while not priority_queue.empty():
                print("thread count: %d" % threading.active_count())
                print("queue size : %d" % (queue.qsize()))
                if threading.active_count() < 10:
                    StartComicThread(10)
                time.sleep(60)
        self.keyword = data['title']
        url_keyword_content = self._url + '/web/topic/' + str(data['id'])
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
        if soup_keyword_content is None:
            return False
        # Collect the links to every chapter; the page lists them from the
        # last chapter to the first, so they are iterated in reverse below.
        a_book = soup_keyword_content.find_all('a', {'class': 'article-img'})
        if mode == "download":
            a_author = soup_keyword_content.find('div', {'class': 'author-nickname'})
            a_introduce = soup_keyword_content.find('div', {'class': 'switch-content'})
            a_img = soup_keyword_content.find('img', {'class': 'kk-img'})
            # Download the comic cover ("封面.jpg" = cover.jpg), retrying up
            # to five times.
            if download_path is not None:
                path = '%s/Comics/%s/' % (download_path, self.keyword)
                for i in range(5):
                    if not BaseRequest.DownloadData(a_img['src'], path, "封面.jpg"):
                        print("download %s failed, attempt %d" % ("封面.jpg", i))
                    else:
                        print("download %s%s success" % (path, "封面.jpg"))
                        break
            src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"
            # Store the comic's metadata in the database.
            sql_dict = collections.OrderedDict()
            sql_dict['Name'] = "\"" + data['title'] + "\""      # name
            sql_dict['WatchNum'] = 0                            # view count
            sql_dict['Website'] = "\"" + self._url + "\""       # site URL
            sql_dict['ChapterNum'] = len(a_book)                # total number of chapters
            sql_dict['IsFinish'] = 0                            # finished or not
            sql_dict['Introduce'] = "\"" + a_introduce.p.contents[0].replace('\"', '') + "\""  # synopsis
            sql_dict['Author'] = "\"" + a_author.contents[0] + "\""  # author
            sql_dict['Img'] = "\"" + src + "\""                 # cover image URL
            sql_dict['Type'] = "\"" + self.type + "\""          # comic genre
            sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
            if not self._EntertainmentInsert('ComicName', sql_dict):
                print("insert into ComicName table failed!")
                continue
            # Fetch the comic's unique ID.
            sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (data['title'])
            max_id = self._EntertainmentSelect(sql)
            if max_id:
                self.id = max_id[0][0]
            else:
                print("get max_id failed!")
                continue
        elif mode == "update":
            now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # update time
            sql = "update EntertainmentDB.ComicName set Time = %s where ID = %d;" % (now_Time, self.id)
            if not self._EntertainmentUpdate(sql):
                print("%s update failed!" % (sql))
        count = 1
        for book in reversed(a_book):
            href = book['href']
            title = book['title']
            src = book.img['src']
            # Queue the current chapter for download ("job_data" replaces the
            # original reuse of the loop variable name "data").
            url_a_book = self._url + href
            job_data = {"url": url_a_book, "title": title, "src": src, "href": href, "count": count}
            if mode == "download":
                dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
            elif mode == "update":
                dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}
            priority_queue.put(base.Job(2, dic_queue, self._url))
            count += 1
    p.spawn(run)
    p.join()
    return True
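# ---------------------------------------------------------------------------
# Hedged sketch, not part of the original module: the queries above are built
# with "%" string formatting plus manual quote stripping, which breaks on
# titles containing double quotes and is open to SQL injection. If the
# _Entertainment* helpers wrap a DB-API connection, the lookup could instead
# hand the parameters to the driver; `connection` below is a hypothetical
# pymysql-style connection, shown for illustration only.
def _example_select_comic_id(connection, title):
    """Illustrative only: fetch a comic ID with a parameterized query."""
    sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name = %s;"
    with connection.cursor() as cursor:
        cursor.execute(sql, (title,))  # the driver escapes `title` itself
        row = cursor.fetchone()
    return row[0] if row else None
# ---------------------------------------------------------------------------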
def _UpdataChapter(self, result, download_path=None):
    """Update the newest chapters and record the returned content in the
    kkmh_content structure.

    Parameters
    ----------
    result : sequence
        A ComicName row: result[0] is the comic ID, result[1] the name and
        result[4] the number of chapters already stored.
    download_path : str or unicode, optional
        Local directory to store the downloads in.

    Yields
    ------
    dict -- one dct_img_book entry per new chapter; stops early on failure.
    """
    keyword = result[1]
    chapter_num = result[4]
    self.id = result[0]
    # Request the keyword search page.
    self.keyword = keyword
    self.download_path = download_path
    url_keyword = self._url + '/web/topic/search?keyword' + parse.urlencode({"": keyword})
    content_keyword = BaseRequest.GetUrlContent(url_keyword)
    if content_keyword is None:
        return
    # Parse the returned content.
    content_keyword_json = json.loads(content_keyword.decode("utf8"))
    if not content_keyword_json:
        return
    # Take the id of each hit and visit the matching topic page.
    for data in content_keyword_json['data']['topic']:
        # Only an exact title match identifies the comic.
        if data['title'] != keyword:
            continue
        url_keyword_content = self._url + '/web/topic/' + str(data['id'])
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
        if soup_keyword_content is None:
            return
        # Collect the links to every chapter; the page lists them from the
        # last chapter to the first, so iterate in page order and count the
        # chapter number down from the latest.
        a_book = soup_keyword_content.find_all('a', {'class': 'article-img'})
        now_chapter_num = len(a_book)
        for book in a_book:
            print(now_chapter_num, chapter_num)
            if now_chapter_num <= chapter_num:
                return
            href = book['href']
            title = book['title']
            lst_img_book = []
            dct_img_book = {}
            # Fetch the current chapter's page.
            url_a_book = self._url + href
            soup_a_book = BaseRequest.GetUrlSoup(url_a_book)
            if soup_a_book is None:
                return
            # Collect and store the image URLs of the chapter.
            content_img_book = soup_a_book.find_all('img', {'class': 'kklazy', 'title': title})
            for img_book in content_img_book:
                # Undo the HTML escaping ("&amp;" -> "&") in the lazy-load URL.
                lst_img_book.append(img_book['data-kksrc'].replace('amp;', ''))
            # Record the chapter for later saving.
            dct_img_book = {'href': href, 'title': title, 'chapter': now_chapter_num, 'download_url': lst_img_book}
            self.lst_kkmh_content.append(dct_img_book)
            now_chapter_num = now_chapter_num - 1
            yield dct_img_book
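# ---------------------------------------------------------------------------
# Hedged usage sketch, not from the original module: _UpdataChapter is a
# generator, so callers drive the update by iterating it. `spider` stands in
# for an instance of this class and `row` for a ComicName record shaped like
# (ID, Name, ..., ChapterNum) as selected above; both names are illustrative.
def _example_consume_update(spider, row, download_path=None):
    """Illustrative only: iterate the update generator chapter by chapter."""
    for chapter in spider._UpdataChapter(row, download_path):
        print("new chapter %d: %s (%d images)"
              % (chapter['chapter'], chapter['title'],
                 len(chapter['download_url'])))
# ---------------------------------------------------------------------------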
def _GetContentByKeyword(self, keyword):
    """Search for content by keyword and record what is found in the
    kkmh_content structure.

    Parameters
    ----------
    keyword : str or unicode
        Text to search for.

    Yields
    ------
    dict -- one dct_img_book entry per chapter; stops early on failure.
    """
    # Request the keyword search page.
    self.keyword = keyword
    url_keyword = self._url + '/web/topic/search?keyword' + parse.urlencode({"": keyword})
    content_keyword = BaseRequest.GetUrlContent(url_keyword)
    if content_keyword is None:
        return
    # Parse the returned content.
    content_keyword_json = json.loads(content_keyword.decode("utf8"))
    if not content_keyword_json:
        return
    # Take the id of the first hit and visit the matching topic page.
    url_keyword_content = self._url + '/web/topic/' + str(content_keyword_json['data']['topic'][0]['id'])
    soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
    if soup_keyword_content is None:
        return
    # Store the comic's metadata in the database.
    sql_dict = collections.OrderedDict()
    sql_dict['Name'] = "\"" + self.keyword + "\""   # name
    sql_dict['Num'] = 0                             # number
    sql_dict['Website'] = "\"" + self._url + "\""   # site URL
    sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
    if not self._ComicInsert('ComicName', sql_dict):
        print("insert into ComicName table failed!")
    # Fetch the comic's unique ID.
    sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (self.keyword)
    max_id = self._ComicSelect(sql)
    self.id = max_id[0][0]
    # Collect the links to every chapter; the page lists them from the last
    # chapter to the first, so iterate in reverse.
    a_book = soup_keyword_content.find_all('a', {'class': 'article-img'})
    for book in reversed(a_book):
        href = book['href']
        title = book['title']
        src = book.img['src']
        lst_img_book = []
        dct_img_book = {}
        # Fetch the current chapter's page.
        url_a_book = self._url + href
        soup_a_book = BaseRequest.GetUrlSoup(url_a_book)
        if soup_a_book is None:
            return
        # Collect and store the image URLs of the chapter.
        content_img_book = soup_a_book.find_all('img', {'class': 'kklazy', 'title': title})
        for img_book in content_img_book:
            # Undo the HTML escaping ("&amp;" -> "&") in the lazy-load URL.
            lst_img_book.append(img_book['data-kksrc'].replace('amp;', ''))
        # Record the chapter for later saving.
        dct_img_book = {'href': href, 'title': title, 'src': src, 'download_url': lst_img_book}
        self.lst_kkmh_content.append(dct_img_book)
        yield dct_img_book
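# ---------------------------------------------------------------------------
# Sketch of the search-URL construction used above, for clarity (illustrative,
# not part of the original module): parse.urlencode({"": keyword}) renders as
# "=<quoted keyword>", so appending it to ".../search?keyword" happens to
# yield ".../search?keyword=<quoted keyword>". The conventional spelling
# below produces the same URL.
from urllib import parse

def _example_search_url(base_url, keyword):
    """Illustrative only: build the keyword-search URL explicitly."""
    return base_url + '/web/topic/search?' + parse.urlencode({"keyword": keyword})
# ---------------------------------------------------------------------------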