def test_typical_urls(self, test_urls, expect_failed_urls_number,
                      expect_finished_urls_number):
    downloader = Downloader(Config(DEFAULT_INI_PATH))
    self.fast_download(downloader)
    result = downloader.get_result(test_urls)
    result.show_time_cost()
    assert len(result.get_failed_urls()) == expect_failed_urls_number
    assert len(result.get_finished_urls()) == expect_finished_urls_number
def test_customize_url_manager(self, test_urls, expect_failed_urls_number,
                               expect_finished_urls_number):
    url_manager = UrlManager()
    for url in test_urls:
        url_manager.add_url(url)
    downloader = Downloader(Config(DEFAULT_INI_PATH))
    self.fast_download(downloader)
    result = downloader.get_result(test_urls, url_manager)
    assert len(result.get_failed_urls()) == expect_failed_urls_number
    assert len(result.get_finished_urls()) == expect_finished_urls_number
def start(self):
    t1 = time.time()
    downloader = Downloader(
        Config("hardworking_av_studio.ini", HARDWORKING_CONFIG_DICT,
               HARDWORKING_CONFIG_SCHEMA))
    urls = ['https://www.dmmsee.zone/studio/0']
    urls.extend([
        'https://www.dmmsee.zone/studio/{}{}'.format(i, word)
        for i in range(1, 40) for word in ' ' + string.ascii_lowercase
    ])
    urls.extend([
        'https://www.dmmsee.zone/studio/{}'.format(i) for i in range(40, 400)
    ])
    print(" config ".center(shutil.get_terminal_size().columns - 1, '*'))
    downloader.config.list_config()
    print(" download urls ".center(shutil.get_terminal_size().columns - 1, '*'))
    self.result = downloader.get_result(urls)
    self.result.show_time_cost()
    self.result.show_urls_status()
    print(" retry failed urls ".center(
        shutil.get_terminal_size().columns - 1, '*'))
    self.result.retry_failed_urls()
    self.result.show_urls_status()
    if os.path.exists("hardworking_av_studio.txt"):
        os.remove("hardworking_av_studio.txt")
    print(" analyzing result ".center(
        shutil.get_terminal_size().columns - 1, '*'))
    tmp_time = time.time()
    analyzing_result_process_number = downloader.config.get_config(
        'multi', 'analyzing_result_process_number')
    queue = SimpleQueue()
    for i in range(analyzing_result_process_number):
        Process(target=self.screen_by_mini_date,
                args=(self.result.get_finished_urls()
                      [i::analyzing_result_process_number], queue)).start()
    for i in tqdm(range(len(self.result.get_finished_urls())),
                  total=len(self.result.get_finished_urls()),
                  desc="analyzing result",
                  unit="result",
                  postfix={"process": analyzing_result_process_number}):
        queue.get()
    print("\nanalysis completed... time cost {:.2f}s".format(time.time() -
                                                             tmp_time))
    print(" result ".center(shutil.get_terminal_size().columns, '*'))
    print("The result has been written to the current folder:",
          os.path.join(os.getcwd(), "hardworking_av_studio.txt"))
    print("total time cost {:.2f}s".format(time.time() - t1))
    return True
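# A minimal usage sketch for the start() method above. The class name
# HardworkingAvStudio is assumed here purely for illustration; the real class
# and __main__ block are not part of this snippet.
if __name__ == '__main__':
    studio = HardworkingAvStudio()  # hypothetical owner of start()/screen_by_mini_date()
    studio.start()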
def test_change_config(self):
    downloader = Downloader(Config(DEFAULT_INI_PATH))
    customize_dict = {
        'multi': {
            'process_number': 1,
            'thread_number': 1,
            'delay': 2.0
        },
        'customize': {
            'use': 1,
            'char': 'a'
        }
    }
    customize_ini_path = './tests/config/customize_config.ini'
    config = Config(customize_ini_path, customize_dict)
    downloader.change_config(config)
def test_chinese_support(self):
    test_gb2312_urls = ["https://www.biqukan.com/50_50758/"]
    downloader = Downloader(Config(DEFAULT_INI_PATH))
    downloader.enable_chinese_transcoding()
    result = downloader.get_result(test_gb2312_urls)
    downloader.disable_chinese_transcoding()
    assert len(result.get_urls_detail_dict()) == 1
class NovelDownloader:
    def __init__(self):
        self.novel_name = None
        self.downloader = Downloader(
            Config("novel_downloader.ini", NOVEL_DOWNLOADER_CONFIG_DICT))
        self.downloader.enable_chinese_transcoding()

    def find_novel(self):
        while True:
            self.novel_name = input('Enter the novel title: ')
            t1 = time.time()
            url = SERVER + self.novel_name
            req = self.downloader.get_req(url)
            soup = BeautifulSoup(req.text, features='lxml')
            soup.prettify()
            s2 = soup.find_all(class_='s2')
            if len(s2) >= 2:
                # Default to the first search hit, then prefer an exact title match.
                text = s2[1].text
                href = next(s2[1].children).get('href')
                for span in s2:
                    if span.text == self.novel_name:
                        text = span.text
                        href = next(span.children).get('href')
                        break
                self.novel_name = text
                print("Found novel 《{}》, time cost: {:.2f}s".format(
                    self.novel_name, time.time() - t1))
                return href
            print("No such novel found, please try again")

    def get_chapter_url_list(self, url: str) -> list:
        def novel_start(chapter_name):
            # Match the heading of the first chapter, e.g. "第一章" / "第001章".
            start_pattern = '第' + '[0零]*' + '[零01一]' + '章'
            return re.match(start_pattern, chapter_name)

        req = self.downloader.get_req(url)
        soup = BeautifulSoup(req.text, features='lxml')
        soup.prettify()
        div = soup.find_all('div', class_='listmain')
        a_soup = BeautifulSoup(str(div[0]), features='lxml')
        a = a_soup.find_all('a')
        chapter_url_list = []
        start_flag = False
        for each in a:
            if not start_flag and novel_start(each.get_text()) is not None:
                start_flag = True
                chapter_url_list.append(parse.urljoin(url, each.get('href')))
            elif start_flag:
                chapter_url_list.append(parse.urljoin(url, each.get('href')))
        return chapter_url_list

    @staticmethod
    def fill_novel_dict(urls, novel_dict, queue):
        advertising_pattern = r'\r|&1t;p>|&1t;/p>|&1t;i>|&1t;/i>' \
                              r'|手机阅读地址:http://m.biqukan.com,数据和书签与电脑站同步,无广告清新阅读!' \
                              r'|水印广告测试' \
                              r'|手机阅读:m.biqukan.com' \
                              r'|百度搜索“笔趣看小说网”' \
                              r'|;\[笔趣看 www.biqukan.com\]'
        end_line_pattern = r'<br {0,1}/>'
        for url in urls:
            req = novel_dict[url]
            if isinstance(req, Exception):
                # Keep the error text for this chapter and skip parsing it.
                novel_dict[url] = repr(req)
                queue.put(url)
                continue
            text = re.sub(advertising_pattern, '', req.text)
            text = re.sub(end_line_pattern, '\r\n', text)
            soup = BeautifulSoup(text, features='lxml')
            soup.prettify()
            texts = soup.find_all('div', class_='showtxt')
            h1 = soup.find('h1')
            span = re.search("https://www.biqukan.com", texts[0].text)
            if span is not None:
                bi_qu_kan_advertising = span.span()[0] - 1
            else:
                bi_qu_kan_advertising = -1
            novel_dict[url] = h1.text + '\r\n' + texts[0].text[:bi_qu_kan_advertising]
            queue.put(url)
        return True

    def get_novel_dict(self, chapter_url_list: list) -> dict:
        result = self.downloader.get_result(chapter_url_list)
        result.show_time_cost()
        result.show_urls_status()
        print(" retry failed chapters ".center(
            shutil.get_terminal_size().columns - 1, '*'))
        result.retry_failed_urls()
        result.show_urls_status()
        print(" split chapter content ".center(
            shutil.get_terminal_size().columns - 1, '*'))
        process_number = self.downloader.config.get_config(
            "multi", "process_number")
        process_number = int(
            process_number // 1.5) if process_number > 2 else process_number
        queue = SimpleQueue()
        for i in range(process_number):
            Process(target=self.fill_novel_dict,
                    args=(chapter_url_list[i::process_number],
                          result.get_urls_detail_dict(), queue)).start()
        for i in tqdm(range(len(chapter_url_list)),
                      total=len(chapter_url_list),
                      desc="splitting chapters",
                      unit="chapter",
                      postfix={"process": process_number}):
            queue.get()
        return result.get_urls_detail_dict()

    def start(self):
        print(" config ".center(shutil.get_terminal_size().columns - 1, '*'))
        self.downloader.config.list_config()
        print(" search novel ".center(shutil.get_terminal_size().columns - 1, '*'))
        novel_chapter_url = self.find_novel()
        t1 = time.time()
        chapter_url_list = self.get_chapter_url_list(novel_chapter_url)
        print("Chapter list parsed, time cost: {:.2f}s".format(time.time() - t1))
        print(" download novel ".center(shutil.get_terminal_size().columns - 1, '*'))
        novel_dict = self.get_novel_dict(chapter_url_list)
        print(" write novel to file ".center(
            shutil.get_terminal_size().columns - 1, '*'))
        with open(self.novel_name + '.txt', 'w+', encoding='utf-8') as file:
            for chapter_url in chapter_url_list:
                file.write(novel_dict[chapter_url])
        print('Novel written, total time cost: {:.2f}s'.format(time.time() - t1))
        print('Novel location: ',
              os.path.join(os.getcwd(), self.novel_name + '.txt'))
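# A minimal usage sketch. The original find_novel() referenced a module-level
# `novel_downloader` instance, which suggests an entry point roughly like the
# following (assumed here; the actual __main__ block is not part of this snippet).
if __name__ == '__main__':
    novel_downloader = NovelDownloader()
    novel_downloader.start()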
def test_get_result_empty_urls(self):
    downloader = Downloader(Config(DEFAULT_INI_PATH))
    assert downloader.get_result(test_empty_urls) is None
class Brain:
    def __init__(self, crawler, close_callback):
        logger.debug('New Brain')
        self.spider = crawler.spider
        self.running = False
        self.closing = False
        self.inprogress = set()
        self.close_callback = close_callback
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self.scheduler = Scheduler(crawler)

    def start(self):
        self.scheduler.start()
        self.scraper.start(self.spider)
        self.downloader.start()
        reactor.callLater(0, self.next)

    @defer.inlineCallbacks
    def run(self):
        if self.running:
            raise RuntimeError("Already running")
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def try_close(self):
        logger.debug('Trying to close brain')
        if (self.closing and not self.inprogress) or self.scraper.try_close():
            logger.debug('Brain can be closed now')
            self.closing.callback(None)

    def next(self):
        logger.debug("Brain next event loop!")
        while not self.is_busy():
            if not self.from_scheduler():
                break
        if self.spider.entrypoint and not self.is_busy():
            self.crawl(self.spider.entrypoint)
            self.spider.entrypoint = None

    def is_busy(self):
        ret = (not self.running or self.closing
               or self.downloader.is_busy() or self.scraper.is_busy())
        if ret:
            logger.debug("Brain is busy!")
        return ret

    def from_scheduler(self):
        request = self.scheduler.dequeue_request()
        logger.debug('Brain calling next request from scheduler')
        if not request:
            return None
        self.inprogress.add(request)
        d = self.downloader.download(request)
        d.addErrback(self.downloader_error)
        d.addCallback(self.scraper.enqueue_scrape, request)
        d.addErrback(self.scrapper_error)

        def _then(response):
            reactor.callLater(0, self.next)
            self.inprogress.remove(request)
            return response

        d.addBoth(_then)
        d.addBoth(lambda _: self.try_close())
        return d.addBoth(lambda _: reactor.callLater(0, self.next))

    @staticmethod
    def scrapper_error(err):
        logger.error(f"After scrapper.enqueue_scrape: {err}")
        return err

    @staticmethod
    def downloader_error(err):
        logger.error(f"After downloader.download: {err}")
        return err

    def crawl(self, request):
        logger.debug(f"Crawling for {request.url}")
        self.scheduler.enqueue_request(request)
        reactor.callLater(0, self.next)

    @staticmethod
    def close():
        logger.debug('Close Brain')

    def stop(self):
        logger.debug('Stop Brain')
        if not self.running:
            return None
        self.running = False
        self.stop_all()
        return self._closewait.callback(None)

    def stop_all(self):
        if self.closing:
            return self.closing
        self.closing = defer.Deferred()
        self.try_close()
        d = self.closing
        d.addBoth(lambda _: self.downloader.close())
        d.addErrback(
            lambda _: logger.error('ERROR in BRAIN after downloader.close'))
        d.addBoth(lambda _: self.scraper.close())
        d.addErrback(
            lambda _: logger.error('ERROR in BRAIN after scraper.close'))
        d.addBoth(lambda _: self.scheduler.close())
        d.addErrback(
            lambda _: logger.error('ERROR in BRAIN after scheduler.close'))
        d.addBoth(lambda _: self.close_callback())
        return d
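# A rough lifecycle sketch for Brain, kept as comments because the crawler,
# Downloader, Scraper and Scheduler collaborators are defined elsewhere in the
# project. Assumptions: `crawler` exposes a `.spider` with an `.entrypoint`
# request, and `close_callback` is invoked once everything has shut down.
#
#     def on_close():
#         logger.debug('Crawl finished')
#         reactor.stop()
#
#     brain = Brain(crawler, close_callback=on_close)
#     brain.start()                # starts scheduler, scraper and downloader
#     brain.run()                  # Deferred that fires after stop() is called
#     reactor.run()                # next() then drains the scheduler, seeding
#                                  # from spider.entrypoint when idle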