Example #1
    def test_parse_conf(self):
        """parse_conf should return every required spider option."""
        conf_path = os.path.join(ROOT_PATH, 'src/spider.conf')
        conf_dict = spider_util.parse_conf(conf_path)

        output_directory = conf_dict.get('output_directory')
        target_url = conf_dict.get('target_url')
        max_depth = conf_dict.get('max_depth')
        crawl_interval = conf_dict.get('crawl_interval')
        crawl_timeout = conf_dict.get('crawl_timeout')
        thread_count = conf_dict.get('thread_count')
        url_list_file = conf_dict.get('url_list_file')

        # Every required option must be present in the parsed config.
        self.assertNotIn(None, [output_directory,
                                url_list_file,
                                target_url,
                                max_depth,
                                crawl_interval,
                                crawl_timeout,
                                thread_count])
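
Neither example shows spider_util.parse_conf itself. Below is a minimal sketch of what it could look like, assuming an INI-style spider.conf with a single [spider] section; the section name and the use of configparser are assumptions, not part of the original code.

import configparser


def parse_conf(conf_path):
    """Parse an INI-style conf file and return its options as a dict of strings."""
    # Assumption: all spider options live in one [spider] section.
    parser = configparser.ConfigParser()
    parser.read(conf_path)
    # Values stay strings here; callers such as main() convert types as needed.
    return dict(parser.items('spider'))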
Example #2
def main():
    """入口"""
    opts = parse_args()
    logging.info(opts)
    conf_path = opts.conf_file

    conf_dict = spider_util.parse_conf(conf_path)

    output_directory = conf_dict.get('output_directory')
    target_url = conf_dict.get('target_url')
    max_depth = conf_dict.get('max_depth')
    crawl_interval = conf_dict.get('crawl_interval')
    crawl_timeout = conf_dict.get('crawl_timeout')
    thread_count = conf_dict.get('thread_count')
    url_list_file = conf_dict.get('url_list_file')

    # Validate every required option before using any of them.
    if None in [output_directory,
                url_list_file,
                target_url,
                max_depth,
                crawl_interval,
                crawl_timeout,
                thread_count]:
        logging.error('missing required conf options')
        sys.exit(1)

    # Read the seed URLs only after url_list_file is known to be set.
    url_list = get_urls_from_file(url_list_file)
    if not url_list:
        logging.error('seed url list is empty')
        sys.exit(1)

    # parse_conf returns strings; convert the numeric options before use.
    thread_count = int(thread_count)
    crawl_timeout = float(crawl_timeout)
    crawl_interval = float(crawl_interval)
    max_depth = int(max_depth)
    page_saver = saver.PageSaver(output_directory, target_url)
    page_downloader = downloader.PageDownloader(crawl_interval, crawl_timeout)
    manager = spider_thread.ThreadManager(max_depth,
                                          url_list,
                                          thread_count,
                                          page_downloader,
                                          page_saver)
    manager.wait_all_done()
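
main() relies on get_urls_from_file to load the seed URLs, but that helper is not shown above. A minimal sketch, assuming url_list_file is a plain text file with one URL per line (the file format is an assumption):

def get_urls_from_file(url_list_file):
    """Read seed URLs from a text file, one URL per line, skipping blank lines."""
    # Assumption: plain-text file, one URL per line.
    with open(url_list_file) as url_file:
        return [line.strip() for line in url_file if line.strip()]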