def test_parse_conf(self):
    """Ensure parse_conf yields a dict containing every required spider option."""
    conf_path = os.path.join(ROOT_PATH, './src/spider.conf')
    conf_dict = spider_util.parse_conf(conf_path)
    # Every one of these keys must be present (i.e. .get() must not return None).
    required_keys = (
        'output_directory',
        'url_list_file',
        'target_url',
        'max_depth',
        'crawl_interval',
        'crawl_timeout',
        'thread_count',
    )
    values = [conf_dict.get(key) for key in required_keys]
    self.assertFalse(None in values)
def main():
    """Entry point: parse CLI args, load spider config, and run the crawl.

    Exits with status 1 when any required conf option is missing, the URL
    seed list is empty, or a numeric option cannot be converted.
    """
    opts = parse_args()
    logging.info(opts)
    conf_path = opts.conf_file
    conf_dict = spider_util.parse_conf(conf_path)
    output_directory = conf_dict.get('output_directory')
    target_url = conf_dict.get('target_url')
    max_depth = conf_dict.get('max_depth')
    crawl_interval = conf_dict.get('crawl_interval')
    crawl_timeout = conf_dict.get('crawl_timeout')
    thread_count = conf_dict.get('thread_count')
    url_list_file = conf_dict.get('url_list_file')
    url_list = get_urls_from_file(url_list_file)
    # Bail out early if any required option is missing or there are no seed URLs.
    if None in [output_directory, url_list_file, target_url, max_depth,
                crawl_interval, crawl_timeout, thread_count] or not url_list:
        logging.error('conf options error')
        sys.exit(1)
    # Guard the numeric conversions: a malformed value (e.g. max_depth = "abc")
    # previously raised an uncaught ValueError; treat it as a conf error instead.
    try:
        thread_count = int(thread_count)
        crawl_timeout = float(crawl_timeout)
        crawl_interval = float(crawl_interval)
        max_depth = int(max_depth)
    except ValueError:
        logging.error('conf options error')
        sys.exit(1)
    page_saver = saver.PageSaver(output_directory, target_url)
    page_downloader = downloader.PageDownloader(crawl_interval, crawl_timeout)
    manager = spider_thread.ThreadManager(max_depth, url_list, thread_count,
                                          page_downloader, page_saver)
    # Block until all crawl threads finish.
    manager.wait_all_done()