    def test_typical_urls(self, test_urls, expect_failed_urls_number,
                          expect_finished_urls_number):
        downloader = Downloader(Config(DEFAULT_INI_PATH))
        self.fast_download(downloader)

        result = downloader.get_result(test_urls)
        result.show_time_cost()
        assert len(result.get_failed_urls()) == expect_failed_urls_number
        assert len(result.get_finished_urls()) == expect_finished_urls_number
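
# A minimal standalone sketch of the flow the test above exercises; it is not part of
# the test suite and assumes Downloader, Config and DEFAULT_INI_PATH are imported the
# same way the tests do. The url below is only a placeholder.
if __name__ == '__main__':
    downloader = Downloader(Config(DEFAULT_INI_PATH))
    result = downloader.get_result(['https://example.com/'])
    result.show_time_cost()
    print('finished:', len(result.get_finished_urls()),
          'failed:', len(result.get_failed_urls()))
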
    def test_customize_url_manager(self, test_urls, expect_failed_urls_number,
                                   expect_finished_urls_number):
        url_manager = UrlManager()
        for url in test_urls:
            url_manager.add_url(url)
        downloader = Downloader(Config(DEFAULT_INI_PATH))
        self.fast_download(downloader)

        result = downloader.get_result(test_urls, url_manager)
        assert len(result.get_failed_urls()) == expect_failed_urls_number
        assert len(result.get_finished_urls()) == expect_finished_urls_number
    def start(self):
        t1 = time.time()
        downloader = Downloader(
            Config("hardworking_av_studio.ini", HARDWORKING_CONFIG_DICT,
                   HARDWORKING_CONFIG_SCHEMA))
        urls = ['https://www.dmmsee.zone/studio/0']
        urls.extend([
            'https://www.dmmsee.zone/studio/{}{}'.format(i, word)
            for i in range(1, 40) for word in ' ' + string.ascii_lowercase
        ])
        urls.extend([
            'https://www.dmmsee.zone/studio/{}'.format(i)
            for i in range(40, 400)
        ])
        print(" config ".center(shutil.get_terminal_size().columns - 1, '*'))
        downloader.config.list_config()
        print(" download urls ".center(shutil.get_terminal_size().columns - 1,
                                       '*'))
        self.result = downloader.get_result(urls)
        self.result.show_time_cost()
        self.result.show_urls_status()
        print(" retry failed urls ".center(
            shutil.get_terminal_size().columns - 1, '*'))
        self.result.retry_failed_urls()
        self.result.show_urls_status()

        if os.path.exists("hardworking_av_studio.txt"):
            os.remove("hardworking_av_studio.txt")

        print(" analyzing result ".center(
            shutil.get_terminal_size().columns - 1, '*'))
        tmp_time = time.time()
        analyzing_result_process_number = downloader.config.get_config(
            'multi', 'analyzing_result_process_number')
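        # Fan the finished urls out round-robin: worker i handles every
        # analyzing_result_process_number-th url and reports each processed
        # url on the queue so the tqdm loop below can track progress.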
        queue = SimpleQueue()
        for i in range(analyzing_result_process_number):
            Process(target=self.screen_by_mini_date,
                    args=(self.result.get_finished_urls()
                          [i::analyzing_result_process_number],
                          queue)).start()
        for i in tqdm(range(len(self.result.get_finished_urls())),
                      total=len(self.result.get_finished_urls()),
                      desc="analyzing result",
                      unit="result",
                      postfix={"process": analyzing_result_process_number}):
            queue.get()
        print("\nanalysis completed... time cost {:.2f}s".format(time.time() -
                                                                 tmp_time))
        print(" result ".center(shutil.get_terminal_size().columns, '*'))
        print("The result has been written to the current folder:",
              os.path.join(os.getcwd(), "hardworking_av_studio.txt"))
        print("total time cost {:.2f}s".format(time.time() - t1))
        return True
    def test_change_config(self):
        downloader = Downloader(Config(DEFAULT_INI_PATH))
        customize_dict = {
            'multi': {
                'process_number': 1,
                'thread_number': 1,
                'delay': 2.0
            },
            'customize': {
                'use': 1,
                'char': 'a'
            }
        }
        customize_ini_path = './tests/config/customize_config.ini'
        config = Config(customize_ini_path, customize_dict)
        downloader.change_config(config)
    def test_chinese_support(self):
        test_gb2312_urls = ["https://www.biqukan.com/50_50758/"]
        downloader = Downloader(Config(DEFAULT_INI_PATH))
        downloader.enable_chinese_transcoding()
        result = downloader.get_result(test_gb2312_urls)
        downloader.disable_chinese_transcoding()
        assert len(result.get_urls_detail_dict()) == 1
class NovelDownloader:
    def __init__(self):
        self.novel_name = None
        self.downloader = Downloader(
            Config("novel_downloader.ini", NOVEL_DOWNLOADER_CONFIG_DICT))
        self.downloader.enable_chinese_transcoding()

    def find_novel(self):
        while True:
            self.novel_name = input('请输入小说名称: ')
            t1 = time.time()
            url = SERVER + self.novel_name
            req = self.downloader.get_req(url)
            soup = BeautifulSoup(req.text, features='lxml')
            soup.prettify()
            s2 = soup.find_all(class_='s2')
            if len(s2) >= 2:
                text = s2[1].text
                href = next(s2[1].children).get('href')
                for span in s2:
                    if span.text == self.novel_name:
                        text = span.text
                        href = next(span.children).get('href')
                        break
                self.novel_name = text
                print("搜索到小说: 《{}》, 耗时: {:.2f}s".format(
                    novel_downloader.novel_name,
                    time.time() - t1))
                return href
            print("查无此小说, 请重试")

    def get_chapter_url_list(self, url: str) -> list:
        def novel_start(chapter_name):
            # Matches the first chapter heading, e.g. "第一章" / "第1章" / "第01章".
            start_pattern = '第' + '[0零]*' + '[零01一]' + '章'
            return re.match(start_pattern, chapter_name)

        req = self.downloader.get_req(url)
        soup = BeautifulSoup(req.text, features='lxml')
        soup.prettify()
        div = soup.find_all('div', class_='listmain')
        a_soup = BeautifulSoup(str(div[0]), features='lxml')
        a = a_soup.find_all('a')

        chapter_url_list = []
        start_flag = False
        for each in a:
            # Ignore links before the first chapter heading, then collect
            # every subsequent chapter link.
            if not start_flag and (novel_start(each.get_text()) is not None):
                start_flag = True
                chapter_url_list.append(parse.urljoin(url, each.get('href')))
            elif start_flag:
                chapter_url_list.append(parse.urljoin(url, each.get('href')))
        return chapter_url_list

    @staticmethod
    def fill_novel_dict(urls, novel_dict, queue):
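        # Strip the advertising fragments and watermarks that biqukan.com
        # injects into chapter bodies, and turn <br/> tags into line breaks.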
        advertising_pattern = r'\r|&1t;p>|&1t;/p>|&1t;i>|&1t;/i>' \
                              r'|手机阅读地址:http://m.biqukan.com,数据和书签与电脑站同步,无广告清新阅读!' \
                              r'|水印广告测试' \
                              r'|手机阅读:m.biqukan.com' \
                              r'|百度搜索“笔趣看小说网”' \
                              r'|;\[笔趣看  www.biqukan.com\]'
        end_line_pattern = r'<br {0,1}/>'
        for url in urls:
            req = novel_dict[url]
            if isinstance(req, Exception):
                # A failed download is stored as its exception; keep its repr,
                # report progress for it and skip content extraction.
                novel_dict[url] = repr(req)
                queue.put(url)
                continue

            text = re.sub(advertising_pattern, '', req.text)
            text = re.sub(end_line_pattern, '\r\n', text)

            soup = BeautifulSoup(text, features='lxml')
            soup.prettify()

            texts = soup.find_all('div', class_='showtxt')
            h1 = soup.find('h1')

            # Cut the chapter text just before the trailing biqukan.com
            # watermark when one is present.
            match = re.search("https://www.biqukan.com", texts[0].text)
            if match is not None:
                bi_qu_kan_advertising = match.start() - 1
            else:
                bi_qu_kan_advertising = -1
            novel_dict[url] = (h1.text + '\r\n' +
                               texts[0].text[:bi_qu_kan_advertising])
            queue.put(url)
        return True

    def get_novel_dict(self, chapter_url_list: list) -> dict:
        result = self.downloader.get_result(chapter_url_list)
        result.show_time_cost()
        result.show_urls_status()
        print(" 重试失败章节 ".center(shutil.get_terminal_size().columns - 7, '*'))
        result.retry_failed_urls()
        result.show_urls_status()

        print(" 分离章节内容 ".center(shutil.get_terminal_size().columns - 7, '*'))
        process_number = self.downloader.config.get_config(
            "multi", "process_number")
        if process_number > 2:
            process_number = int(process_number // 1.5)
        queue = SimpleQueue()
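        # Fan out round-robin: worker i processes every process_number-th
        # chapter via the slice chapter_url_list[i::process_number].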
        for i in range(process_number):
            Process(target=self.fill_novel_dict,
                    args=(chapter_url_list[i::process_number],
                          result.get_urls_detail_dict(), queue)).start()
        for i in tqdm(range(len(chapter_url_list)),
                      total=len(chapter_url_list),
                      desc="分离章节内容",
                      unit="章节",
                      postfix={"process": process_number}):
            queue.get()

        return result.get_urls_detail_dict()

    def start(self):
        print(" 配置文件 ".center(shutil.get_terminal_size().columns - 5, '*'))
        self.downloader.config.list_config()
        print(" 搜索小说 ".center(shutil.get_terminal_size().columns - 5, '*'))
        novel_chapter_url = self.find_novel()
        t1 = time.time()
        chapter_url_list = self.get_chapter_url_list(novel_chapter_url)
        print("分析章节列表完成, 耗时: {:.2f}s".format(time.time() - t1))
        print(" 下载小说 ".center(shutil.get_terminal_size().columns - 5, '*'))
        novel_dict = self.get_novel_dict(chapter_url_list)
        print(" 小说写入文件 ".center(shutil.get_terminal_size().columns - 7, '*'))
        with open(self.novel_name + '.txt', 'w+', encoding='utf-8') as file:
            for chapter_url in chapter_url_list:
                file.write(novel_dict[chapter_url])
        print('小说写入完成, 总耗时: {:.2f}s'.format(time.time() - t1))
        print('小说位置: ', os.path.join(os.getcwd(), self.novel_name + '.txt'))
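
# A plausible entry point for NovelDownloader (an assumption; the source does not show
# one): create the downloader and run the interactive search-and-download flow.
if __name__ == '__main__':
    novel_downloader = NovelDownloader()
    novel_downloader.start()
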
    def test_get_result_empty_urls(self):
        downloader = Downloader(Config(DEFAULT_INI_PATH))
        assert downloader.get_result(test_empty_urls) is None
class Brain():
    def __init__(self, crawler, close_callback):
        logger.debug('New Brain')
        self.spider = crawler.spider
        self.running = False
        self.closing = False
        self.inprogress = set()
        self.close_callback = close_callback
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self.scheduler = Scheduler(crawler)

    def start(self):
        self.scheduler.start()
        self.scraper.start(self.spider)
        self.downloader.start()
        reactor.callLater(0, self.next)

    @defer.inlineCallbacks
    def run(self):
        if self.running:
            raise "Already running"
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def try_close(self):
        logger.debug('Trying to close brain')
        if (self.closing and not self.inprogress) or self.scraper.try_close():
            logger.debug('Brain can be closed now')
            self.closing.callback(None)

    def next(self):
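        # One tick of the engine loop: keep pulling requests from the
        # scheduler until something is busy or the queue is empty, then
        # seed the spider's entrypoint request if it has not been used yet.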
        logger.debug("Brain next Event loop !")
        while not self.is_busy():
            if not self.from_scheduler():
                break
        if self.spider.entrypoint and not self.is_busy():
            self.crawl(self.spider.entrypoint)
            self.spider.entrypoint = None

    def is_busy(self):
        ret = (not self.running or self.closing or self.downloader.is_busy()
               or self.scraper.is_busy())
        if ret:
            logger.debug("Brain is busy!")
        return ret

    def from_scheduler(self):
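        # Dequeue the next request, track it as in-progress, and chain
        # download -> scrape; once the chain settles, drop the request from
        # the in-progress set, retry closing and schedule another tick.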
        request = self.scheduler.dequeue_request()
        logger.debug('Brain calling next request from scheduler')
        if not request:
            return None
        self.inprogress.add(request)
        d = self.downloader.download(request)
        d.addErrback(self.downloader_error)
        d.addCallback(self.scraper.enqueue_scrape, request)
        d.addErrback(self.scrapper_error)

        def _then(response):
            reactor.callLater(0, self.next)
            self.inprogress.remove(request)
            return response

        d.addBoth(_then)
        d.addBoth(lambda _: self.try_close())
        return d.addBoth(lambda _: reactor.callLater(0, self.next))

    @staticmethod
    def scrapper_error(err):
        logger.error(f"After scrapper.enqueue_scrape: {err}")
        return err

    @staticmethod
    def downloader_error(err):
        logger.error(f"After downloader.download: {err}")
        return err

    def crawl(self, request):
        logger.debug(f"Crawling for {request.url}")
        self.scheduler.enqueue_request(request)
        reactor.callLater(0, self.next)

    @staticmethod
    def close():
        logger.debug('Close Brain')

    def stop(self):
        logger.debug('Stop Brain')
        if not self.running:
            return None
        self.running = False
        self.stop_all()
        return self._closewait.callback(None)

    def stop_all(self):
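        # Begin shutdown: once in-flight work has drained, close the
        # downloader, scraper and scheduler in turn, logging (but not
        # propagating) errors, then invoke the close callback.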
        if self.closing:
            return self.closing
        self.closing = defer.Deferred()
        self.try_close()
        d = self.closing
        d.addBoth(lambda _: self.downloader.close())
        d.addErrback(
            lambda _: logger.error('ERROR in BRAIN after downloader.close'))
        d.addBoth(lambda _: self.scraper.close())
        d.addErrback(
            lambda _: logger.error('ERROR in BRAIN after scraper.close'))
        d.addBoth(lambda _: self.scheduler.close())
        d.addErrback(
            lambda _: logger.error('ERROR in BRAIN after scheduler.close'))
        d.addBoth(lambda _: self.close_callback())
        return d