Ejemplo n.º 1
0
    def GetComicByKeyword(self):
        """Start the download of every comic in the built-in list.

        For each entry the method first waits until ``priority_queue`` is
        empty, then builds the comic handle for the entry's URL with
        ``GetComicHandle``, passes the URL to the handle via ``_set_info`` and
        asks the handle to search for the comic by name in "download" mode.

        The method takes no arguments besides ``self`` and has no return
        value; a failed search is only reported on standard output.
        """
        pending = [
            {
                "name": '西行纪',
                "url": "http://www.kuaikanmanhua.com",
                "download": "/mnt/TecentCloud"
            },
        ]
        print(pending)

        for entry in pending:
            # Poll once a minute until the queue is drained; while waiting,
            # call StartComicThread(10) whenever fewer than 10 threads are
            # active.
            while not priority_queue.empty():
                print("threads conunt :%d" % threading.active_count())
                print("queue size : %d" % (priority_queue.qsize()))
                if threading.active_count() < 10:
                    StartComicThread(10)
                time.sleep(60)

            title = entry['name']
            site = entry['url']

            print("%s start download" % (title))
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

            # Create the comic handle for this URL
            self.GetComicHandle(site)

            # Hand the URL (and two unused slots) to the handle
            self.ComicHandle._set_info(site, None, None)

            # Search by name and start the download
            started = self.ComicHandle._GetContentByKeyword(
                title, "download", entry['download'])
            if not started:
                print("Download %s failed!" % (title))
Ejemplo n.º 2
0
    def UpdateComicPicture(self, download_path=None):
        """Re-run every comic stored in the database in "update" mode.

        Image addresses sometimes expire, so the pictures have to be
        refreshed.  All rows of ``EntertainmentDB.ComicName`` are read and,
        for each row, the comic handle is rebuilt for the stored URL and its
        ``_GetContentByKeyword`` is called with mode ``"update"``.

        Parameters
        ----------
        download_path : str, optional
            Accepted for interface compatibility; this method does not use it.

        Returns
        -------
        bool
            ``True`` when ``_GetContentByKeyword`` succeeded for every row,
            ``False`` when it failed for at least one.
        """
        results = True

        # Read every comic currently stored in the database
        ComicHandle = EntertainmentSpider()
        sql = "SELECT * FROM EntertainmentDB.ComicName;"
        results_tup = ComicHandle._ComicSelect(sql)

        for row in results_tup:
            # Row layout as used here: column 0 is the comic ID, column 1 the
            # search keyword and column 5 the site URL.
            keyword = row[1]
            url = row[5]

            # Create the comic handle for this URL
            self.GetComicHandle(url)

            # Comic ID, read back by the handle in "update" mode
            self.ComicHandle.id = row[0]

            # Hand the URL to the handle
            self.ComicHandle._set_info(url, None, None)

            # A non-empty queue means the previous comic is still being
            # processed; wait for it before starting the next one.
            while not priority_queue.empty():
                print("queue size : %d" %(priority_queue.qsize()))
                time.sleep(5)

            # Refresh the database entry and queue the chapters
            if not self.ComicHandle._GetContentByKeyword(keyword, "update"):
                print("Download %s failed!" %(keyword))
                # Record the failure in the value that is returned; the
                # original assigned to the loop variable by mistake, so the
                # method always returned True.
                results = False

        return results
Ejemplo n.º 3
0
        #启动线程
        print("begin StartThread")
        StartFictionThread(20)

        #下载模式
        if sys.argv[2] == "DownloadAll":
            EntertainmentAPi.GetFictionByKeyword()

        #更新模式,增加新的章节
        elif sys.argv[2] == "UpdateChapter":
            EntertainmentAPi.UpdateFictionChapter("/mnt/TecentCloud")

    while True:
        if not priority_queue.empty():
            print("threads conunt :%d" %threading.active_count())
            print("queue size : %d" %(priority_queue.qsize()))
            time.sleep(5)
            continue
        else:
            break

    
    #关闭所有线程
    #for t in threads:
    #    t.join()

    print('finish: ', now() - start)
    print("download finish")
    #if not EntertainmentAPi.SaveToDatabase('/home/txz/download',dct_img_book):
    #    print("download failed!")
    #EntertainmentAPi.ParseContent(content)
Ejemplo n.º 4
0
    def _GetContentByKeyword(self, keyword, mode, download_path=None):
        """Search the site for ``keyword`` and queue the chapters of every hit.

        The search form at ``<self._url>/e/search/`` is posted GBK-encoded.
        For each search result the comic page is fetched, its chapter links
        are collected and one job per chapter is put on ``priority_queue``.

        Parameters
        ----------
        keyword : str
            Text to search for.
        mode : str
            ``"download"``: skip comics already present in ``ComicName``,
            download the cover, insert a new ``ComicName`` row and queue the
            chapters with subtype ``"download"``.
            ``"update"``: refresh the ``Time`` column of the row identified by
            ``self.id`` (set by the caller) and queue the chapters with
            subtype ``"update"``.
        download_path : str, optional
            Root directory for the cover image; when ``None`` no cover is
            downloaded.

        Returns
        -------
        bool
            ``False`` when the search page or a comic page could not be
            fetched, or when a comic page has no chapter list; ``True``
            otherwise (a failed insert or ID lookup is only printed and the
            comic is skipped).
        """
        # Request the search page
        self.keyword = keyword
        self.download_path = download_path
        url_keyword = self._url + '/e/search/'

        keyword_encode = keyword.encode('gbk', 'strict')
        button_encode = "搜索漫画".encode('gbk', 'strict')
        params = {
            'key': keyword_encode,
            'button': button_encode,
        }
        params = parse.urlencode(params).encode("gbk")
        content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'gbk')
        if content_keyword is None:
            return False

        # Every search hit carries the comic name and the link to its page
        for hit in content_keyword.find_all('p', {'class': 'fl cover'}):
            comic_name = hit.a.img['alt']

            if mode == "download":
                # Skip comics that are already in the database.
                # NOTE(review): the name comes from the scraped page and is
                # interpolated into the SQL text; switch to a parameterized
                # query if the select helper supports one.
                sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
                if self._EntertainmentSelect(sql):
                    print("%s 已经下载过,请查看数据库" % comic_name)
                    continue

            # Wait until the previous comic's jobs have left the queue
            while not priority_queue.empty():
                print("threads conunt :%d" % threading.active_count())
                print("queue size : %d" % (priority_queue.qsize()))
                if threading.active_count() < 10:
                    StartComicThread(10)
                time.sleep(60)

            self.keyword = comic_name
            print(self.keyword)
            url_keyword_content = self._url + "/" + hit.a['href']
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'gbk')
            if soup_keyword_content is None:
                return False

            # Collect the chapter links.  A page without the expected
            # container is reported as a failure instead of raising
            # AttributeError.
            chapter_box = soup_keyword_content.find(
                'div', {'class': 'plist pnormal', 'id': 'play_0'})
            if chapter_box is None or chapter_box.ul is None:
                print("%s: chapter list not found" % self.keyword)
                return False

            a_book = []
            for node in chapter_box.ul:
                a = node.find('a')
                # Text nodes are str subclasses, so .find() is str.find() and
                # yields an int (-1 or an index); only keep real tags.
                if a is not None and not isinstance(a, int):
                    a_book.append(a)

            if mode == "download":
                a_author = soup_keyword_content.find('meta', {'property': 'og:novel:author'})
                a_category = soup_keyword_content.find('meta', {'property': 'og:novel:category'})
                a_img = soup_keyword_content.find('meta', {'property': 'og:image'})
                a_introduce = soup_keyword_content.find('p', {'id': 'intro'})
                IsFinish = soup_keyword_content.find('meta', {'property': 'og:novel:status'})
                # '连载中' is mapped to 0, any other status to 1
                a_isfinish = 0 if IsFinish['content'] == '连载中' else 1

                # Download the cover, trying up to five times
                if download_path is not None:
                    path = '%s/Comics/%s/' % (download_path, self.keyword)
                    for i in range(5):
                        if BaseRequest.DownloadData(a_img['content'], path, "封面.jpg"):
                            print("download %s%s success" % (path, "封面.jpg"))
                            break
                        print("download %s failed %d time" % ("封面.jpg", i))
                src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

                # Store the comic's metadata; the insertion order of the keys
                # is kept exactly as before.
                sql_dict = collections.OrderedDict()
                sql_dict['Name'] = "\"" + self.keyword + "\""           # name
                sql_dict['WatchNum'] = 0                                # watch counter
                sql_dict['Website'] = "\"" + self._url + "\""           # site URL
                sql_dict['ChapterNum'] = len(a_book)                    # number of chapters
                sql_dict['IsFinish'] = a_isfinish                       # finished flag
                sql_dict['Introduce'] = "\"" + a_introduce.a.contents[0] + "\""  # introduction
                sql_dict['Author'] = "\"" + a_author['content'] + "\""  # author
                sql_dict['Img'] = "\"" + src + "\""                     # cover image
                sql_dict['Type'] = "\"" + a_category['content'] + "\""  # category
                sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time

                if not self._EntertainmentInsert('ComicName', sql_dict):
                    print("inster ComicName table failed!")
                    continue

                # Read back the ID of the row that was just inserted
                sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
                max_id = self._EntertainmentSelect(sql)
                if max_id:
                    self.id = max_id[0][0]
                else:
                    print("get max_id failed!")
                    continue

            elif mode == "update":
                now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # update time
                sql = "update EntertainmentDB.ComicName set Time = %s  where ID = %d;" % (now_Time, self.id)
                if not self._EntertainmentUpdate(sql):
                    print("%s update failed!" % (sql))

            # The page lists chapters from last to first, so walk the list
            # backwards and number the chapters from 1.
            for count, chapter in enumerate(reversed(a_book), start=1):
                href = chapter['href']
                title = chapter['title']

                # Put the chapter on the queue
                url_a_book = self._url + href

                job_data = {"url": url_a_book, "title": title, "href": href, "count": count}
                if mode == "download":
                    dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
                elif mode == "update":
                    dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}

                priority_queue.put(base.Job(2, dic_queue, self._url))

        return True
Ejemplo n.º 5
0
    def _GetContentByKeyword(self, keyword, mode, download_path=None):
        """Search the site for ``keyword`` and queue the chapters of every hit.

        The search form at ``<self._url>/search.html`` is posted Big5-encoded.
        For each search result the comic page is fetched, its metadata and
        chapter links are parsed and one job per chapter is put on
        ``priority_queue``.

        Parameters
        ----------
        keyword : str
            Text to search for.
        mode : str
            ``"download"``: skip comics already present in ``ComicName``,
            download the cover, insert a new ``ComicName`` row and queue the
            chapters with subtype ``"download"``.
            ``"update"``: refresh the ``Time`` column of the row identified by
            ``self.id`` (set by the caller) and queue the chapters with
            subtype ``"update"``.
        download_path : str, optional
            Root directory for the cover image; when ``None`` no cover is
            downloaded.

        Returns
        -------
        bool
            ``False`` when the search page or a comic page could not be
            fetched, or when the comic page lacks the detail cells or the
            chapter table; ``True`` otherwise (a failed insert or ID lookup is
            only printed and the comic is skipped).
        """
        # Request the search page
        self.keyword = keyword
        self.download_path = download_path
        url_keyword = self._url + '/search.html'

        keyword_encode = keyword.encode('big5', 'strict')
        params = {
            'keyword': keyword_encode,
            'searchtype': 'all',
        }
        params = parse.urlencode(params).encode("big5")

        content_keyword = BaseRequest.PostUrlSoup(url_keyword, params, 'big5')
        if content_keyword is None:
            return False

        # The element following each "covertxt" span carries the comic's
        # title and the link to its page.
        for hit in content_keyword.find_all('span', {'class': 'covertxt'}):
            siblings = hit.find_next_siblings()
            if not siblings:
                # Nothing follows the span, so there is no link to follow
                continue
            link = siblings[0]
            comic_name = link['title']

            if mode == "download":
                # Skip comics that are already in the database.
                # NOTE(review): the name comes from the scraped page and is
                # interpolated into the SQL text; switch to a parameterized
                # query if the select helper supports one.
                sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (comic_name)
                if self._EntertainmentSelect(sql):
                    print(comic_name)
                    continue

            # Wait until the previous comic's jobs have left the queue
            while not priority_queue.empty():
                print("threads conunt :%d" % threading.active_count())
                print("queue size : %d" % (priority_queue.qsize()))
                if threading.active_count() < 10:
                    StartComicThread(10)
                time.sleep(60)

            self.keyword = comic_name
            print(self.keyword)
            url_keyword_content = self._url + "/" + link['href']
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content, 'big5')
            if soup_keyword_content is None:
                return False

            # Comic metadata; the insertion order of the keys is kept exactly
            # as before.
            sql_dict = collections.OrderedDict()
            sql_dict['Name'] = "\"" + self.keyword + "\""    # name
            sql_dict['WatchNum'] = 0                         # watch counter
            sql_dict['Website'] = "\"" + self._url + "\""    # site URL

            # find_all() returns an empty list rather than None, so check the
            # length before indexing cells 1, 3 and 5.
            info_cells = soup_keyword_content.find_all('td', {'width': 276})
            if len(info_cells) < 6:
                print("%s: comic details not found" % self.keyword)
                return False

            sql_dict['Type'] = "\"" + info_cells[1].a.contents[0].strip() + "\""
            sql_dict['Author'] = "\"" + info_cells[3].contents[1].strip() + "\""
            # Only "/image/chap9.gif" marks the comic as finished (1); every
            # other icon, including "/image/chap1.gif", maps to 0.
            status_icon = info_cells[5].contents[4]['src'].strip()
            sql_dict['IsFinish'] = 1 if status_icon == "/image/chap9.gif" else 0

            intro_tables = soup_keyword_content.find_all(
                'table', {'width': 688, 'cellspacing': "8"})
            if intro_tables:
                sql_dict['Introduce'] = "\"" + intro_tables[0].tr.td.contents[0].strip() + "\""
            else:
                # Quoted empty string, matching how every other string value
                # in sql_dict is wrapped in double quotes.
                sql_dict['Introduce'] = "\"\""

            covers = soup_keyword_content.find_all('img', {'width': '240', 'height': '320'})
            a_img = self._url + covers[0]['src'] if covers else ''

            # Collect the chapter links from the chapter table
            chapter_tables = soup_keyword_content.find_all(
                'table', {'width': '688', 'align': 'center'})
            if not chapter_tables or chapter_tables[0].tbody is None:
                print("%s: chapter list not found" % self.keyword)
                return False

            a_book = []
            for row in chapter_tables[0].tbody:
                for cell in row:
                    a = cell.find('a')
                    # For text nodes .find() is str.find() and yields an int
                    # (-1 or an index); only keep real tags.
                    if a is not None and not isinstance(a, int):
                        a_book.append(a)

            if mode == "download":
                # Download the cover, trying up to five times; skipped when
                # no target directory or no cover image is available.
                if download_path is not None and a_img:
                    path = '%s/Comics/%s/' % (download_path, self.keyword)
                    for i in range(5):
                        if BaseRequest.DownloadData(a_img, path, "封面.jpg"):
                            print("download %s%s success" % (path, "封面.jpg"))
                            break
                        print("download %s failed %d time" % ("封面.jpg", i))

                src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

                # Complete the row and store it
                sql_dict['Img'] = "\"" + src + "\""
                sql_dict['ChapterNum'] = len(a_book)     # number of chapters
                sql_dict['Time'] = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time

                if not self._EntertainmentInsert('ComicName', sql_dict):
                    print("inster ComicName table failed!")
                    continue

                # Read back the ID of the row that was just inserted
                sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" % (self.keyword)
                max_id = self._EntertainmentSelect(sql)
                if max_id:
                    self.id = max_id[0][0]
                else:
                    print("get max_id failed!")
                    continue

            elif mode == "update":
                now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # update time
                sql = "update EntertainmentDB.ComicName set Time = %s  where ID = %d;" % (now_Time, self.id)
                if not self._EntertainmentUpdate(sql):
                    print("%s update failed!" % (sql))

            # Queue the chapters in page order, numbered from 1
            for count, chapter in enumerate(a_book, start=1):
                href = chapter['href']
                title = chapter.contents[0]

                # Put the chapter on the queue
                url_a_book = self._url + href

                job_data = {"url": url_a_book, "title": title, "href": href, "count": count}
                if mode == "download":
                    dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
                elif mode == "update":
                    dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}

                priority_queue.put(base.Job(2, dic_queue, self._url))

        return True
Ejemplo n.º 6
0
    elif sys.argv[1] == "Fiction":
        #启动线程
        print("begin StartThread")
        StartFictionThread(20)

        #下载模式
        if sys.argv[2] == "DownloadAll":
            EntertainmentAPi.GetFictionByKeyword()

        #更新模式,增加新的章节
        elif sys.argv[2] == "UpdateChapter":
            EntertainmentAPi.UpdateFictionChapter("/mnt/TecentCloud")

    if sys.argv[2] == "DownloadAll":
        num = priority_queue.qsize()

        size = 0
        count = 1

        processpool = []
        task = []
        for count in range(num + 1):

            if not priority_queue.empty():
                task.append(priority_queue.get())
                if (count % 115 == 0 or count == num) and count != 0:
                    print("size:", size)

                    process = multiprocessing.Process(target=ComicProcesses,
                                                      args=(task, ))
Ejemplo n.º 7
0
    def _GetContentByKeyword(self, keyword, mode, download_path=None):
        """Search the site for ``keyword`` and queue the chapters of every hit.

        The search URL is built with the keyword percent-encoded as GBK.  The
        response is either a result table (recognised by a ``caption``
        element) or a single book page described by ``og:`` meta tags.  For
        each hit the book page is fetched, its chapter list is read and one
        job per chapter is put on ``priority_queue``.

        Parameters
        ----------
        keyword : str
            Text to search for.
        mode : str
            ``"download"``: skip books already present in
            ``tbl_fiction_name``, download the cover, insert a new row and
            queue the chapters with subtype ``"download"``.
            ``"update"``: refresh ``add_time`` of the row identified by
            ``self.id`` and queue the chapters with subtype ``"update"``.
        download_path : str, optional
            Root directory for the cover image; when ``None`` no cover is
            downloaded.

        Returns
        -------
        bool
            ``False`` when the search page or a book page could not be
            fetched, when a single-book response lacks its meta tags, or when
            a book page has no chapter list; ``True`` otherwise (a failed
            insert or ID lookup is only printed and the book is skipped).
        """
        # Request the search page
        self.keyword = keyword
        self.download_path = download_path

        url_keyword = self._url + '/modules/article/soshu.php?searchkey=' + parse.quote(
            keyword, encoding='gbk', errors='replace')
        content_keyword = BaseRequest.GetUrlSoup(url_keyword, 'gbk')
        if content_keyword is None:
            return False

        # Parse the response into a list of {"name", "url"} hits
        find_result = []
        if content_keyword.find('caption'):
            # Result table: one row per book.  find_all() returns an empty
            # list when nothing matches, which simply yields no hits.
            for row in content_keyword.find_all('tr', {'id': 'nr'}):
                find_result.append({
                    "name": row.td.a.contents[0],
                    "url": row.td.a['href']
                })
        else:
            # Single book page
            a_url = content_keyword.find('meta', {'property': 'og:url'})
            if a_url is None:
                return False

            a_name = content_keyword.find('meta',
                                          {'property': 'og:novel:book_name'})
            if a_name is None:
                return False
            find_result.append({
                "name": a_name["content"],
                "url": a_url['content']
            })

        for result in find_result:
            if mode == "download":
                # Skip books that are already in the database.
                # NOTE(review): the name comes from the scraped page and is
                # interpolated into the SQL text; switch to a parameterized
                # query if the select helper supports one.
                sql = "SELECT * FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (
                    result["name"])
                if self._EntertainmentSelect(sql):
                    print(result["name"])
                    continue

            # Wait until the previous book's jobs have left the queue
            while not priority_queue.empty():
                print("threads conunt :%d" % threading.active_count())
                print("queue size : %d" % (priority_queue.qsize()))
                if threading.active_count() < 10:
                    StartFictionThread(10)
                time.sleep(60)

            # The book page and its chapter list are needed in both modes.
            # They used to be read only inside the "download" branch, which
            # left a_book unbound in "update" mode.
            self.keyword = result["name"]
            soup_keyword_content = BaseRequest.GetUrlSoup(
                result["url"], 'gbk')
            if soup_keyword_content is None:
                return False

            a_list = soup_keyword_content.find('div', {'id': 'list'})
            if a_list is None or a_list.dl is None:
                print("%s: chapter list not found" % self.keyword)
                return False
            a_book = a_list.dl.find_all('dd')

            if mode == "download":
                a_name = soup_keyword_content.find(
                    'meta', {'property': 'og:novel:book_name'})
                a_introduce = soup_keyword_content.find(
                    'meta', {'property': 'og:description'})
                a_image = soup_keyword_content.find('meta',
                                                    {'property': 'og:image'})
                a_category = soup_keyword_content.find(
                    'meta', {'property': 'og:novel:category'})
                a_author = soup_keyword_content.find(
                    'meta', {'property': 'og:novel:author'})
                a_status = soup_keyword_content.find(
                    'meta', {'property': 'og:novel:status'})

                # Download the cover, trying up to five times
                if download_path is not None:
                    path = '%s/Fiction/%s/' % (download_path, self.keyword)
                    for i in range(5):
                        if BaseRequest.DownloadData(
                                a_image['content'], path, "封面.jpg"):
                            print("download %s%s success" % (path, "封面.jpg"))
                            break
                        print("download %s failed %d time" % ("封面.jpg", i))

                src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Fiction/" + self.keyword + "/" + "封面.jpg"

                # Store the book's metadata; the insertion order of the keys
                # is kept exactly as before.
                sql_dict = collections.OrderedDict()
                sql_dict['name'] = "\"" + a_name['content'] + "\""  # name
                sql_dict['watch_count'] = 0  # watch counter
                sql_dict['website'] = "\"" + self._url + "\""  # site URL
                sql_dict['chapter_count'] = len(a_book)  # number of chapters
                sql_dict[
                    'introduce'] = "\"" + a_introduce['content'] + "\""  # introduction
                sql_dict['author'] = "\"" + a_author['content'] + "\""  # author
                sql_dict['cover_img_src'] = "\"" + src + "\""  # cover image
                sql_dict['type'] = "\"" + a_category['content'] + "\""  # category
                sql_dict['add_time'] = "\"" + time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime()) + "\""  # download time
                # A status containing '连载中' is stored as 0, anything else as 1
                if "连载中" in a_status['content']:
                    sql_dict['is_finish'] = 0
                else:
                    sql_dict['is_finish'] = 1

                if not self._EntertainmentInsert('tbl_fiction_name', sql_dict):
                    print("inster tbl_fiction_name table failed!")
                    continue

                # Read back the ID of the row that was just inserted.
                # NOTE(review): this query selects column "ID" while the
                # update statement below filters on "pk_id"; confirm which
                # column name the table really has.
                sql = "SELECT ID FROM EntertainmentDB.tbl_fiction_name WHERE name=\"%s\";" % (
                    a_name['content'])
                max_id = self._EntertainmentSelect(sql)
                if max_id:
                    self.id = max_id[0][0]
                else:
                    print("get max_id failed!")
                    continue

            elif mode == "update":
                now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S",
                                                time.localtime()) + "\""  # update time
                sql = "update EntertainmentDB.tbl_fiction_name set add_time = %s  where pk_id = %d;" % (
                    now_Time, self.id)
                if not self._EntertainmentUpdate(sql):
                    print("%s update failed!" % (sql))

            # Queue the chapters in page order, numbered from 1
            for count, book in enumerate(a_book, start=1):
                href = book.a['href']
                title = book.a.contents[0]

                # Put the chapter on the queue
                url_a_book = self._url + href
                data = {
                    "ID": self.id,
                    "url": url_a_book,
                    "title": title,
                    "href": href,
                    "count": count
                }
                if mode == "download":
                    dic_queue = {
                        "type": "download",
                        "subtype": "download",
                        "self": self,
                        "data": data
                    }
                elif mode == "update":
                    dic_queue = {
                        "type": "download",
                        "subtype": "update",
                        "self": self,
                        "data": data
                    }

                priority_queue.put(base.Job(2, dic_queue, self._url))

        return True