Example #1
    def _UpdataChapter(self, result, download_path=None):
        """更新最新章节,然后将返回的内容记录在kkmh_content结构中

        Parameters
        ----------
        keyword : str or unicode
            搜索文字
        Returns
        -------
        success: dict[list]--self.kkmh_content
        failed : None
        """

        keyword     = result[1]
        chapter_num = result[4]
        self.id     = result[0]
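        # NOTE: `result` is assumed to be a database row shaped roughly like
        # (id, name, ..., ..., chapter_count, ...), based on the indices used above.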

        # Request the keyword search page
        self.keyword       = keyword
        self.download_path = download_path

        url_keyword        = self._url + '/e/search/'

        keyword_encode = keyword.encode('gbk', 'strict')
        button_encode  = "搜索漫画".encode('gbk', 'strict')
        params = {  
            'key':keyword_encode,  
            'button':button_encode,  
        }
        params = parse.urlencode(params).encode("gbk")
        content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'gbk')
        if content_keyword is None:
            return None

        a_result = content_keyword.find_all('p',{'class':'fl cover'})

        # Extract the id keyword so the matching result page can be visited
        for data in a_result:
            # Skip results whose cover title does not exactly match the keyword
            
            if data.a.img['alt'] != keyword:
                continue

            url_keyword_content  = self._url + "/" + data.a['href']
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'gbk')
            if soup_keyword_content is None:
                return None

            # Collect the URLs of all chapters; the page lists them from the latest chapter to the first, so the loop has to run in reverse
            book = soup_keyword_content.find('div',{'class':'plist pnormal','id':'play_0'})
            a_book = []
            for data_content in book.ul:
                # children of the <ul> are a mix of Tags and NavigableStrings; on a
                # string, .find() is str.find() and returns -1 when there is no match
                a = data_content.find('a')
                if a is not None and a != -1:
                    a_book.append(a)

            now_chapter_num = len(a_book)
            for book in a_book:
                print(now_chapter_num, chapter_num)
                if now_chapter_num <= chapter_num:
                    return None
                
                href  = book['href']
                title = book['title']
                lst_img_book = []
                dct_img_book = {}

                title = title.replace(' ','')
                # Download the content of the current chapter
                url_a_book  = self._url + href

                soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'gbk')
                if soup_a_book is None:
                    return None

                url_list = url_a_book.split("/")
                comic_id = url_list[-2]
                chapter_id = url_list[-1][0:-5]
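                # The loop below assembles CDN URLs of the (assumed) form
                #   http://mhpic5.lineinfo.cn/mh160tuku/s/<urlencoded keyword>_<comic_id>/<urlencoded title>_<chapter_id>/0001.jpg
                # range(20) implicitly assumes a chapter never has more than 20 pages.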
                
                for i in range(20):
                    download_url   = "http://mhpic5.lineinfo.cn/mh160tuku/s/"
                    keyword_encode = parse.urlencode({"": self.keyword})
                    title_encode   = parse.urlencode({"": title})

                    # zero-pad the image number to four digits: 0001.jpg, 0002.jpg, ...
                    name = '/' + str(i + 1).zfill(4) + ".jpg"

                    download_url = (download_url + keyword_encode[1:] + "_" + comic_id + "/"
                                    + title_encode[1:] + "_" + chapter_id + name)
                    lst_img_book.append(download_url)


                # Store the data in the structure so it can be saved later
                dct_img_book = {'href':href, 'title':title, 'chapter':now_chapter_num, 'download_url':lst_img_book}
                self.lst_kkmh_content.append(dct_img_book)

                now_chapter_num = now_chapter_num - 1
                
                yield dct_img_book
Example #2
    def _UpdataChapter(self, result, download_path=None):
        """更新最新章节,然后将返回的内容记录在kkmh_content结构中

        Parameters
        ----------
        keyword : str or unicode
            搜索文字
        Returns
        -------
        success: dict[list]--self.kkmh_content
        failed : None
        """

        keyword     = result[1]
        chapter_num = result[4]
        self.id     = result[0]
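        # NOTE: `result` is assumed to be a database row shaped roughly like
        # (id, name, ..., ..., chapter_count, ...), based on the indices used above.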

        # Request the keyword search page
        self.keyword       = keyword
        self.download_path = download_path

        url_keyword        = self._url + '/search.html'
        keyword_encode = keyword.encode('big5', 'strict')
        params = {  
            'keyword':keyword_encode,  
            'searchtype':'all',  
        }
        params = parse.urlencode(params).encode("big5")

        content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'big5')
        if content_keyword is None:
            return None

        a_result = content_keyword.find_all('span',{'class':'covertxt'})

        # Extract the id keyword so the matching result page can be visited
        for data in a_result:
            data_next_siblings = data.find_next_siblings()

            print(data_next_siblings[0]['title'], keyword)
            if data_next_siblings[0]['title'] != keyword:
                continue

            url_keyword_content  = self._url + "/" + data_next_siblings[0]['href']
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'big5')
            if soup_keyword_content is None:
                return None

            # Collect the URLs of all chapters; the page lists them from the latest chapter to the first, so the loop has to run in reverse
            save_content = soup_keyword_content.find_all('table',{'width':'688', 'align':'center'})
            if not save_content:
                return None

            a_book = []
            for data_content in save_content[0].tbody:
                for data_td in data_content:
                    a = data_td.find('a')
                    if a is not None and a != -1:
                        a_book.append(a)

            now_chapter_num = len(a_book)
            for book in reversed(a_book):
                print(now_chapter_num, chapter_num)
                if now_chapter_num <= chapter_num:
                    return None

                href  = book['href']
                title = book.contents[0]
                lst_img_book = []
                dct_img_book = {}

                # Download the content of the current chapter
                url_a_book  = self._url + href
                soup_a_book = BaseRequest.GetUrlSoup(url_a_book, 'big5')
                if soup_a_book is None:
                    return None

                content_img_book = soup_a_book.find_all('img',{'oncontextmenu':'return false'})
                img_num = soup_a_book.find_all('option', value=True)
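                # Assumption: the page-selector <option> elements correspond one-to-one
                # with the chapter's image pages, so their count gives the page total.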
                for num in range(len(img_num)):
                    # strip the trailing 'NNN.jpg' (7 characters) from the first image's
                    # src and re-append a zero-padded page number
                    img = content_img_book[0]['src']
                    img = img[:-7]
                    img = img + str(num + 1).zfill(3) + ".jpg"

                    lst_img_book.append(img)



                # Store the data in the structure so it can be saved later
                dct_img_book = {'href':href, 'title':title, 'chapter':now_chapter_num, 'download_url':lst_img_book}
                self.lst_kkmh_content.append(dct_img_book)

                now_chapter_num = now_chapter_num - 1
                
                yield dct_img_book
Example #3
    def _GetContentByKeyword(self, keyword, mode, download_path=None):
        """通过关键字查找到需要的内容,然后将返回的内容记录在kkmh_content结构中

        Parameters
        ----------
        keyword : str or unicode
            搜索文字

        mode    : str or unicode 
            download : 下载
            updata   :更新图片
        Returns
        -------
        success: dict[list]--self.kkmh_content
        failed : None
        """

        # Request the keyword search page
        self.keyword       = keyword
        self.download_path = download_path
        url_keyword        = self._url + '/e/search/'

        keyword_encode = keyword.encode('gbk', 'strict')
        button_encode  = "搜索漫画".encode('gbk', 'strict')
        params = {  
            'key':keyword_encode,  
            'button':button_encode,  
        }
        params = parse.urlencode(params).encode("gbk")
        content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'gbk')
        if content_keyword is None:
            return False

        a_result = content_keyword.find_all('p',{'class':'fl cover'})
        # Extract the id keyword so the matching result page can be visited
        for data in a_result:
            
            if mode == "download":
                # Check whether this comic has already been downloaded
                sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(data.a.img['alt'])
                if self._EntertainmentSelect(sql):
                    print("%s has already been downloaded; see the database" % data.a.img['alt'])
                    continue
            
            # Wait until the previous comic has finished downloading
            while True:
                if not priority_queue.empty():
                    print("threads conunt :%d" %threading.active_count())
                    print("queue size : %d" %(priority_queue.qsize()))
                    if threading.active_count() < 10:
                        StartComicThread(10)  
                    time.sleep(60)
                    continue
                else:
                    break
            

            self.keyword         = data.a.img['alt']
            print(self.keyword)
            url_keyword_content  = self._url + "/" + data.a['href']
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'gbk')
            if soup_keyword_content is None:
                return False

            # Store the comic's information in the database
            sql_dict = collections.OrderedDict()
            sql_dict['Name']      = "\"" + self.keyword + "\""          # name
            sql_dict['WatchNum']  = 0                                   # view count
            sql_dict['Website']   = "\"" + self._url + "\""             # site URL

            # Collect the URLs of all chapters; the page lists them from the latest chapter to the first, so the loop has to run in reverse
            book = soup_keyword_content.find('div',{'class':'plist pnormal','id':'play_0'})
            a_book = []
            for data_content in book.ul:
                a = data_content.find('a')
                if a is not None and a != -1:
                    a_book.append(a)

            if mode == "download":

                a_author    = soup_keyword_content.find('meta', {'property':'og:novel:author'})
                a_category  = soup_keyword_content.find('meta', {'property':'og:novel:category'})
                a_img       = soup_keyword_content.find('meta', {'property':'og:image'})
                a_introduce = soup_keyword_content.find('p', {'id':'intro'})
                IsFinish    = soup_keyword_content.find('meta', {'property':'og:novel:status'})
                if IsFinish['content'] == '连载中':   # '连载中' means the comic is still ongoing
                    a_isfinish = 0
                else:
                    a_isfinish = 1
                
                # Download the comic's cover
                for i in range(5):
                    if download_path is not None:
                        path = '%s/Comics/%s/' %(download_path, self.keyword)
                        if not BaseRequest.DownloadData(a_img['content'], path, "封面.jpg"):
                            print("download %s failed %d time" % ("封面.jpg", i))
                        else:
                            print("download %s%s success" % (path,"封面.jpg"))
                            break
                src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

                # Store the comic's information in the database
                sql_dict = collections.OrderedDict()
                sql_dict['Name']      = "\"" + self.keyword + "\""          # name
                sql_dict['WatchNum']  = 0                                   # view count
                sql_dict['Website']   = "\"" + self._url + "\""             # site URL
                sql_dict['ChapterNum']= len(a_book)                         # total number of chapters
                sql_dict['IsFinish']  = a_isfinish                          # finished or not
                sql_dict['Introduce'] = "\"" + a_introduce.a.contents[0] + "\""   # synopsis
                sql_dict['Author']    = "\"" + a_author['content'] + "\""   # author
                sql_dict['Img']       = "\"" + src + "\""                   # cover image
                sql_dict['Type']      = "\"" + a_category['content'] + "\"" # genre
                sql_dict['Time']      = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\"" # download time
                
                if not self._EntertainmentInsert('ComicName', sql_dict):
                    print("inster ComicName table failed!")
                    continue

                # Get the comic's unique ID
                sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(data.a.img['alt'])
                max_id = self._EntertainmentSelect(sql)
                if max_id:
                    self.id = max_id[0][0]
                else:
                    print("get max_id failed!")
                    continue
                
            elif mode == "update":
                now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\"" # download time
                sql = "update EntertainmentDB.ComicName set Time = %s  where ID = %d;" %(now_Time, self.id)
                if not self._EntertainmentUpdate(sql):
                    print("%s update failed!" %(sql))

            count = 1
            for book in reversed(a_book):
                href  = book['href']
                title = book['title']

                # Push the current chapter onto the queue
                url_a_book  = self._url + href

                data = {"url": url_a_book, "title":title, "href":href, "count": count}
                if mode == "download":
                    dic_queue = {"type": "download", "subtype": "download", "self":self, "data":data}
                elif mode == "update":
                    dic_queue = {"type": "download", "subtype": "update", "self":self, "data":data}

                priority_queue.put(base.Job(2,dic_queue,self._url))
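                # The job just queued carries priority 2; a worker thread (started via
                # StartComicThread above) is assumed to pop it and perform the actual
                # download or update.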

                count += 1

        return True
Example #4
    def _GetContentByKeyword(self, keyword, mode, download_path=None):
        """通过关键字查找到需要的内容,然后将返回的内容记录在kkmh_content结构中

        Parameters
        ----------
        keyword : str or unicode
            搜索文字

        mode    : str or unicode 
            download : 下载
            updata   :更新图片
        Returns
        -------
        success: dict[list]--self.kkmh_content
        failed : None
        """

        # Request the keyword search page
        self.keyword       = keyword
        self.download_path = download_path
        url_keyword        = self._url + '/search.html'

        keyword_encode = keyword.encode('big5', 'strict')
        params = {  
            'keyword':keyword_encode,  
            'searchtype':'all',  
        }
        params = parse.urlencode(params).encode("big5")

        content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'big5')
        if content_keyword is None:
            return False

        a_result = content_keyword.find_all('span',{'class':'covertxt'})

        # Extract the id keyword so the matching result page can be visited
        for data in a_result:
            data_next_siblings = data.find_next_siblings()
            
            if mode == "download":
                # Check whether this comic has already been downloaded
                sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(data_next_siblings[0]['title'])
                if self._EntertainmentSelect(sql):
                    print(data_next_siblings[0]['title'])
                    continue
            
            # Wait until the previous comic has finished downloading
            while True:
                if not priority_queue.empty():
                    print("threads conunt :%d" %threading.active_count())
                    print("queue size : %d" %(priority_queue.qsize()))
                    if threading.active_count() < 10:
                        StartComicThread(10)  
                    time.sleep(60)
                    continue
                else:
                    break
            

            self.keyword         = data_next_siblings[0]['title']
            print(self.keyword)
            url_keyword_content  = self._url + "/" + data_next_siblings[0]['href']
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'big5')
            if soup_keyword_content is None:
                return False

            # Store the comic's information in the database
            sql_dict = collections.OrderedDict()
            sql_dict['Name']      = "\"" + self.keyword + "\""          # name
            sql_dict['WatchNum']  = 0                                   # view count
            sql_dict['Website']   = "\"" + self._url + "\""             # site URL

            save_content = soup_keyword_content.find_all('td',{'width':276})
            if not save_content:
                return False
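            # Layout assumption: the detail page's <td width="276"> cells hold the metadata
            # at fixed positions: index 1 is the category, 3 the author, and 5 the status
            # icon (chap1.gif = ongoing, chap9.gif = finished), as read below.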

            sql_dict['Type']     = "\"" + save_content[1].a.contents[0].strip() + "\""
            sql_dict['Author']   = "\"" + save_content[3].contents[1].strip() + "\""
            a_IsFinish = 0
            if save_content[5].contents[4]['src'].strip() == "/image/chap1.gif":
                sql_dict['IsFinish'] = 0
            elif save_content[5].contents[4]['src'].strip() == "/image/chap9.gif":
                sql_dict['IsFinish'] = 1
            else:
                sql_dict['IsFinish'] = 0

            save_content = soup_keyword_content.find_all('table',{'width':688,'cellspacing':"8"})
            if save_content:
                sql_dict['Introduce'] = "\"" + save_content[0].tr.td.contents[0].strip() + "\""
            else:
                sql_dict['Introduce'] = ''

            save_content = soup_keyword_content.find_all('img',{'width':'240','height':'320'})
            a_img = ''
            if save_content:
                a_img = self._url + save_content[0]['src']
                

            # Collect the URLs of all chapters; the page lists them from the latest chapter to the first, so the loop has to run in reverse
            save_content = soup_keyword_content.find_all('table',{'width':'688', 'align':'center'})
            if not save_content:
                return False

            a_book = []
            for data_content in save_content[0].tbody:
                for data_td in data_content:
                    a = data_td.find('a')
                    if a is not None and a != -1:
                        a_book.append(a)

            if mode == "download":

                # Download the comic's cover
                for i in range(5):
                    if download_path is not None:
                        path = '%s/Comics/%s/' %(download_path, self.keyword)
                        if not BaseRequest.DownloadData(a_img, path, "封面.jpg"):
                            print("download %s failed %d time" % ("封面.jpg", i))
                        else:
                            print("download %s%s success" % (path,"封面.jpg"))
                            break

                src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

                # Store the comic's information in the database
                sql_dict['Img']       = "\"" + src + "\""
                sql_dict['ChapterNum']= len(a_book)                         # total number of chapters
                sql_dict['Time']      = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\"" # download time

                
                if not self._EntertainmentInsert('ComicName', sql_dict):
                    print("inster ComicName table failed!")
                    continue

                # Get the comic's unique ID
                sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(self.keyword)
                max_id = self._EntertainmentSelect(sql)
                if max_id:
                    self.id = max_id[0][0]
                else:
                    print("get max_id failed!")
                    continue
                
            elif mode == "update":
                now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\"" # download time
                sql = "update EntertainmentDB.ComicName set Time = %s  where ID = %d;" %(now_Time, self.id)
                if not self._EntertainmentUpdate(sql):
                    print("%s update failed!" %(sql))

            count = 1
            for book in a_book:
                href  = book['href']
                title = book.contents[0]

                # Push the current chapter onto the queue
                url_a_book  = self._url + href

                data = {"url": url_a_book, "title":title, "href":href, "count": count}
                if mode == "download":
                    dic_queue = {"type": "download", "subtype": "download", "self":self, "data":data}
                elif mode == "update":
                    dic_queue = {"type": "download", "subtype": "update", "self":self, "data":data}

                priority_queue.put(base.Job(2,dic_queue,self._url))

                count += 1

        return True