Example #1
    def __init__(self):
        self.downloader = aszwDownloader.Downloader()
        self.parser = aszwParser.Parser()

        # Cookie pool
        self.cookies = dbController.dbc('bookwarehouse').getCookies()
        # Proxy pool
        self.proxies = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                         {'User-agent': 'Mr.Zhang'})
Example #2
    def initDatebase(self):
        '''
        Initialize the database: crawl the entire book site into the database.
        :return:
        '''
        # Initialize the parser and downloader modules
        parser = aszwParser.Parser()
        downloader = aszwDownloader.Downloader()
        cookies = self.getCookies()

        # Crawl the list-page URLs from the 傲视中文网 book listings
        list_url = parser.find_list_urls()
        # Iterate over the list pages to collect the book index URLs
        for list_page in list_url:
            # Parse the book URLs out of the list page
            books_urls = parser.find_books_urls(list_page)
            # Visit each book's home page
            for book_url in books_urls:
                # Parse the home page for the chapter URL list, title, category and author
                sections_url, title, category, auth = parser.find_section_urls(
                    book_url)
                book = {}
                book['name'] = title
                book['category'] = category
                book['auth'] = auth
                book['wordage'] = -1
                book['book_url'] = book_url
                book['source'] = 1
                chapters = []
                for section_url in sections_url:
                    # Pick a random cookie from the pool to download the page
                    cookie = random.choice(cookies)
                    # Download the chapter page and parse out the chapter title and body
                    html_cont = downloader.m_download(section_url, cookie)
                    new_data = parser.parser_Section(html_cont)
                    # Store the chapter title and URL in chapters
                    chapter = {
                        'chapter_name': new_data['section_title'],
                        'chapter_url': section_url
                    }
                    chapters.append(chapter)
                # Attach the chapter list to the book record
                book['chapters'] = chapters
                self.insetBook(book)
                self.book_warehouse.append(book)
Example #3
    def getBook(self, id):
        '''
        Download a book to the server.
        :param id:
        :return:
        '''

        # Fetch the book URL from the database
        cursor = self.db.cursor()
        sql = "select * from books where id = %s"
        cursor.execute(sql, (id,))
        row = cursor.fetchone()
        book_url = row[7]

        # Initialize the parser and downloader modules
        parser = aszwParser.Parser()
        downloader = aszwDownloader.Downloader()
        cookies = self.getCookies()
        user_agent = self.getUserAgent()

        proxy_list = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                       {'User-agent': 'Mr.Zhang'})

        # Parse the home page for the chapter URL list, title, category and author
        sections_url, title, category, auth = parser.find_section_urls(
            book_url)
        print('----Crawling book:', title)
        chapters = []

        # Skip if the book file already exists
        if os.path.exists("/home/ubuntu/book/" + title + "_" + auth + ".txt"):
            print(title + " has already been downloaded...")
            return

        # Writer used to collect and output the book content
        outputer = aszwWriter.Writer(len(sections_url), title, auth)

        # Parse chapter info into chapters
        def parseSection(section_url):
            try:
                # Pick a random cookie from the pool to download the page
                cookie = random.choice(cookies)
                proxy = random.choice(proxy_list)
                # Download the chapter page and parse out the chapter title and body
                html_cont = downloader.m_download(
                    section_url,
                    cookie=cookie,
                    user_agent=random.choice(user_agent),
                    proxy=proxy)
                new_data = parser.parser_Section(html_cont)

                # print('Successfully crawled chapter', new_data['section_title'])

                # Collect the chapter content so it can be written to file once crawling finishes
                outputer.collect_data(new_data)

                # Use variables from the enclosing scope
                nonlocal threads, chapters

                # Store the chapter title and URL in chapters
                chapter = {
                    'chapter_name': new_data['section_title'],
                    'chapter_url': section_url,
                    'chapter_context': new_data['text']
                }
                chapters.append(chapter)
            except Exception as e:
                print(e)
            finally:
                # The decrement must live in finally: if this function crashes,
                # threads would otherwise never be decremented
                # Thread exits; decrement the thread count
                threads -= 1

        # Parse chapter contents with multiple threads
        threads = 0
        print('Number of chapters to parse:', len(sections_url))
        while sections_url:
            while sections_url and threads < 40:
                # print(threads)
                threads += 1
                section_url = sections_url.pop()
                _thread.start_new_thread(parseSection, (section_url, ))

        while threads > 0:
            # Busy-wait until every worker thread has finished
            # print('crawled', sum1, 'entered crawling', sum2)
            # print(threads)
            pass

        print('Writing the book to local storage')
        # Write the book content to file
        print(outputer.output_html())
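
The manual threads counter, the _thread.start_new_thread calls and the spin-wait loops above implement a bounded concurrent download: at most 40 chapter requests are in flight at once. As a minimal sketch (not the repository's actual code), the same pattern could be expressed with concurrent.futures.ThreadPoolExecutor; it reuses the downloader, parser, cookies, user_agent and proxy_list objects that getBook already sets up, and the download_chapters helper name is hypothetical.

from concurrent.futures import ThreadPoolExecutor, as_completed
import random

def download_chapters(sections_url, downloader, parser, cookies,
                      user_agent, proxy_list, max_workers=40):
    '''Download and parse chapter pages concurrently; return them as a list.'''

    def fetch(section_url):
        # Randomize cookie / user agent / proxy per request, as getBook does
        html_cont = downloader.m_download(
            section_url,
            cookie=random.choice(cookies),
            user_agent=random.choice(user_agent),
            proxy=random.choice(proxy_list))
        new_data = parser.parser_Section(html_cont)
        return {
            'chapter_name': new_data['section_title'],
            'chapter_url': section_url,
            'chapter_context': new_data['text']
        }

    chapters = []
    # The pool caps concurrency at max_workers, replacing the manual
    # thread counter and the busy-wait loops
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(fetch, url) for url in sections_url]
        for future in as_completed(futures):
            try:
                chapters.append(future.result())
            except Exception as e:
                print(e)
    return chapters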
Example #4
    def initDatebaseContext(self):
        '''
        Initialize the database: crawl the entire book site, including chapter contents, into the database.
        :return:
        '''
        # Initialize the parser and downloader modules
        parser = aszwParser.Parser()
        downloader = aszwDownloader.Downloader()
        cookies = self.getCookies()
        user_agent = self.getUserAgent()
        proxy_list = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                       {'User-agent': 'Mr.Zhang'})

        # Crawl the list-page URLs from the 傲视中文网 book listings
        list_url = parser.find_list_urls()
        # Iterate over the list pages to collect the book index URLs
        for list_page in list_url:
            # Parse the book URLs out of the list page
            books_urls = parser.find_books_urls(list_page)
            # Visit each book's home page
            for book_url in books_urls:
                # Parse the home page for the chapter URL list, title, category and author
                try:
                    # Skip books that are already in the database
                    if self.checkBookExist(book_url):
                        continue

                    sections_url, title, category, auth = parser.find_section_urls(
                        book_url)
                    book = {}
                    print('----Crawling book:', title)
                    book['name'] = title
                    book['category'] = category
                    book['auth'] = auth
                    book['wordage'] = -1
                    book['book_url'] = book_url
                    book['source'] = 1
                    chapters = []

                    # Total word count of the book
                    wordage = 0

                    # Parse chapter info into chapters
                    def parseSection(section_url, i):
                        # Pick a random cookie from the pool to download the page
                        cookie = random.choice(cookies)
                        proxy = random.choice(proxy_list)
                        # Download the chapter page and parse out the chapter title and body
                        html_cont = downloader.m_download(
                            section_url,
                            cookie=cookie,
                            user_agent=random.choice(user_agent),
                            proxy=proxy)
                        new_data = parser.parser_Section(html_cont)

                        try:
                            # Use variables from the enclosing scope
                            nonlocal threads, chapters, wordage

                            # Store the chapter title and URL in chapters
                            chapter = {
                                'chapter_name': i,
                                'chapter_url': section_url,
                                'context': new_data['text']
                            }
                            chapters.append(chapter)

                            wordage += len(
                                new_data['text']
                            ) - new_data['text'].count("    ") * 4 - 25
                        except Exception as e:
                            print(section_url, '-----failed to fetch chapter content')
                        finally:
                            # Thread exits; decrement the thread count
                            threads -= 1

                    # Fetch chapters with multiple threads (at most 20 at a time)
                    threads = 0
                    # Ordinal key assigned to each chapter URL
                    i = 1
                    while sections_url:
                        while sections_url and threads < 20:
                            threads += 1
                            # Take chapter URLs in order; pop(i) would skip entries
                            # and overrun the shrinking list
                            section_url = sections_url.pop(0)
                            _thread.start_new_thread(parseSection, (
                                section_url,
                                self.i2a(i),
                            ))
                            i += 1
                    # for section_url in sections_url:
                    #     # Pick a random cookie from the pool to download the page
                    #     cookie = cookies[random.randint(0, 10)]
                    #     proxy = random.choice(proxy_list)
                    #     # Download the chapter page and parse out the chapter title and body
                    #     html_cont = downloader.m_download(section_url,cookie=cookie,user_agent=random.choice(user_agent),proxy=proxy)
                    #     new_data = parser.parser_Section(html_cont)
                    #     # Store the chapter title and URL in chapters
                    #     chapter = {'chapter_name': new_data['section_title'], 'chapter_url': section_url}
                    #     chapters.append(chapter)
                    # Attach the chapter list to the book record
                    book['chapters'] = chapters
                    # Store the book's word count in the record
                    book['wordage'] = wordage
                    self.insetBook(book)
                    self.book_warehouse.append(book)
                except Exception as e:
                    s = sys.exc_info()
                    print("Error '%s' happened on line %d" %
                          (s[1], s[2].tb_lineno))
                    # Re-queue the book URL so it gets retried later
                    books_urls.append(book_url)
Example #5
    def initDatebase(self):
        '''
        Initialize the database: crawl the entire book site into the database.
        :return:
        '''
        # Initialize the parser and downloader modules
        parser = aszwParser.Parser()
        downloader = aszwDownloader.Downloader()
        cookies = self.getCookies()
        user_agent = self.getUserAgent()
        proxy_list = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                       {'User-agent': 'Mr.Zhang'})

        # Crawl the list-page URLs from the 傲视中文网 book listings
        list_url = parser.find_list_urls()
        # Iterate over the list pages to collect the book index URLs
        for list_page in list_url:
            # Parse the book URLs out of the list page
            books_urls = parser.find_books_urls(list_page)
            # Visit each book's home page
            for book_url in books_urls:
                # Parse the home page for the chapter URL list, title, category and author
                sections_url, title, category, auth = parser.find_section_urls(
                    book_url)
                book = {}
                print('----Crawling book:', title)
                book['name'] = title
                book['category'] = category
                book['auth'] = auth
                book['wordage'] = -1
                book['book_url'] = book_url
                book['source'] = 1
                chapters = []

                # Skip if the book file already exists
                if os.path.exists("/home/ubuntu/book/" + title + "_" + auth +
                                  ".txt"):
                    print(title + " has already been downloaded...")
                    continue

                outputer = aszwWriter.Writer(len(sections_url), title, auth)

                # Parse chapter info into chapters
                def parseSection(section_url):
                    # Pick a random cookie from the pool to download the page
                    cookie = random.choice(cookies)
                    proxy = random.choice(proxy_list)
                    # Download the chapter page and parse out the chapter title and body
                    html_cont = downloader.m_download(
                        section_url,
                        cookie=cookie,
                        user_agent=random.choice(user_agent),
                        proxy=proxy)
                    new_data = parser.parser_Section(html_cont)

                    # Use variables from the enclosing scope
                    nonlocal threads, chapters

                    # Store the chapter title and URL in chapters
                    chapter = {
                        'chapter_name': new_data['section_title'],
                        'chapter_url': section_url
                    }
                    chapters.append(chapter)

                    # Collect the chapter content so it can be written to file once crawling finishes
                    outputer.collect_data(new_data)

                    # Thread exits; decrement the thread count
                    threads -= 1

                # Fetch chapters with multiple threads (at most 40 at a time)
                threads = 0
                i = 1
                while sections_url:
                    while sections_url and threads < 40:
                        threads += 1
                        section_url = sections_url.pop()
                        _thread.start_new_thread(parseSection, (section_url, ))
                        i += 1

                # Busy-wait until every worker thread has finished; without this
                # the book could be saved before all chapters are collected
                while threads > 0:
                    pass
                # for section_url in sections_url:
                #     # Pick a random cookie from the pool to download the page
                #     cookie = cookies[random.randint(0, 10)]
                #     proxy = random.choice(proxy_list)
                #     # Download the chapter page and parse out the chapter title and body
                #     html_cont = downloader.m_download(section_url,cookie=cookie,user_agent=random.choice(user_agent),proxy=proxy)
                #     new_data = parser.parser_Section(html_cont)
                #     # Store the chapter title and URL in chapters
                #     chapter = {'chapter_name': new_data['section_title'], 'chapter_url': section_url}
                #     chapters.append(chapter)
                # Attach the chapter list to the book record
                book['chapters'] = chapters
                self.insetBook(book)
                self.book_warehouse.append(book)

                print('Writing the book to local storage')
                # Write the book content to file
                print(outputer.output_html())
Example #6
    def __init__(self):
        self.downloader = aszwDownloader.Downloader()
        self.parser = aszwParser.Parser()
        self.cookies = dbController.dbc('bookwarehouse').getCookies()
Example #7
def testfind_section_urls():
    url = 'https://www.23zw.me/olread/79/79709/index.html'
    parser = aszwParser.Parser()
    parser.find_section_urls(url)
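
This smoke test only exercises the call without checking its result. A slightly fuller variant, assuming find_section_urls returns the (sections_url, title, category, auth) tuple used in the examples above and that the aszwParser module is importable, might look like this:

import aszwParser

def testfind_section_urls():
    url = 'https://www.23zw.me/olread/79/79709/index.html'
    parser = aszwParser.Parser()
    sections_url, title, category, auth = parser.find_section_urls(url)
    # Basic sanity checks on the parsed book index page
    assert sections_url, 'expected at least one chapter URL'
    assert title and auth, 'expected a non-empty title and author'
    print(title, auth, len(sections_url), 'chapters')

if __name__ == '__main__':
    testfind_section_urls()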