    return config_file


if __name__ == "__main__":
    # config log info
    log.init_log("./log/spider", logging.DEBUG)
    # get command line args
    config_file = parseCmdLine(sys.argv)
    if config_file is None:
        sys.exit(PARSE_CMDLINE_ARGUMENTS_ERROR)
    # load config file
    logging.debug('load config file...')
    config_para_code, config_para_item = config_load.load_config(config_file)
    # check whether the config file loaded ok
    if config_para_code != 0:
        # on error, config_para_item carries the error message
        print 'load config file error', config_para_item
        logging.error('load config file error %s: %s' % (config_para_code, config_para_item))
        sys.exit(LOAD_CONFIGFILE_ERROR)
    # on success, config_para_item carries the config items
    logging.info('load config ok')
    conf_url_list_file = config_para_item['url_list_file']
    conf_thread_count = config_para_item['thread_count']
    # load seed file
    logging.debug('load seed file...')
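# The block above treats the second return value of config_load.load_config as
# an error message on failure and as a dict of config items on success.
# config_load itself is not shown here; the following is a minimal sketch of
# that contract, assuming an INI-style file parsed with ConfigParser and a
# single [spider] section (the section name is an assumption, not from the
# original source).
import ConfigParser


def load_config(config_file):
    """Return (0, item_dict) on success, (non-zero code, error message) on failure."""
    parser = ConfigParser.ConfigParser()
    # ConfigParser.read() returns the list of files it actually parsed
    if not parser.read(config_file):
        return 1, 'cannot read config file: %s' % config_file
    try:
        # flatten the [spider] section into a plain option -> string dict
        return 0, dict(parser.items('spider'))
    except ConfigParser.NoSectionError as err:
        return 2, str(err)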
    def test_load_config_failure(self):
        """
        load_config should return a non-zero code for a bad config file
        """
        config_file = "spider1.conf"
        resno, resinfo = config_load.load_config(config_file)
        self.assertNotEqual(resno, 0)
import argparse
import logging
import Queue

import log
import config_load
import seedfile_load
import spider


def main():
    """
    spider main function

    usage: python mini_spider [options]
    options:
        -c CONFIG_FILE_PATH, --config_file_path CONFIG_FILE_PATH
                              the spider config file path
        -h, --help            show this help message and exit
        -v, --version         show spider version and exit
    """
    # init log
    log.init_log("../log/mini_spider")
    # parse args
    parser = argparse.ArgumentParser(description="mini directional spider")
    parser.add_argument("-v", "--version", action="store_true",
                        help="show spider version and exit")
    parser.add_argument("-c", "--config_file_path", help="config file path")
    args = parser.parse_args()
    config_file_path = args.config_file_path
    if args.version:
        print "mini spider 0.1"
        return 0
    if config_file_path is None:
        usage = "usage: python mini_spider.py -c spider_conf_file_path"
        logging.info("the config path cannot be empty, " + usage)
        return -1
    # read conf
    ret, config_map = config_load.load_config(config_file_path)
    if ret != 0:
        return ret
    # init spiders to run with multiple threads
    urls_queue = Queue.Queue()
    crawled_urls_list = []
    code, urls_list = seedfile_load.get_urls(config_map.get('url_list_file', ''))
    if code != 0:
        return code
    if not urls_list:
        logging.error('the seed url list is empty.')
        return -1
    # seed every start url into the work queue at depth 0
    for url in urls_list:
        url_item = {'url': url, 'depth': 0}
        urls_queue.put(url_item)
    # config values may come back as strings; xrange needs an int
    thread_count = int(config_map.get('thread_count', 1))
    thread_list = []
    __init_output_dir(config_map.get('output_directory', '.'))
    for i in xrange(thread_count):
        spider_thread = spider.Spider(urls_queue,
                                      config_map.get('output_directory', '.'),
                                      config_map.get('max_depth', 1),
                                      config_map.get('crawl_interval', 1),
                                      config_map.get('crawl_timeout', 1),
                                      config_map.get('target_url', r'.*\.(gif|png|jpg|bmp)$'),
                                      crawled_urls_list,
                                      thread_count)
        thread_list.append(spider_thread)
        spider_thread.start()
    # wait for every crawl thread to finish before reporting
    for thread_item in thread_list:
        thread_item.join()
    tips = 'Finished crawling all pages'
    logging.info(tips)
    print tips
    return 0
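# main() pulls these keys out of config_map: url_list_file, output_directory,
# max_depth, crawl_interval, crawl_timeout, target_url and thread_count. A
# plausible spider.conf, assuming the INI-style format sketched above (the
# values are illustrative, not from the original source):
#
#     [spider]
#     url_list_file: ./urls
#     output_directory: ./output
#     max_depth: 1
#     crawl_interval: 1
#     crawl_timeout: 1
#     target_url: .*\.(gif|png|jpg|bmp)$
#     thread_count: 8

# main() also calls a module-private helper __init_output_dir that is not
# shown; a minimal sketch, assuming it only needs to make sure the output
# directory exists before the spider threads write into it:
import os


def __init_output_dir(output_dir):
    """create the output directory if it does not exist yet"""
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)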
    def test_load_config_success(self):
        """
        load_config should return code 0 for a valid config file
        """
        config_file = "spider.conf"
        resno, resinfo = config_load.load_config(config_file)
        self.assertEqual(resno, 0)
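# Both test methods above take self and use assertEqual/assertNotEqual, so
# they assume a unittest.TestCase wrapper; a minimal harness sketch (the class
# name TestConfigLoad is an assumption):
import unittest

import config_load


class TestConfigLoad(unittest.TestCase):
    """hosts the load_config success/failure cases shown above"""
    # test_load_config_success and test_load_config_failure go here


if __name__ == '__main__':
    unittest.main()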