def main():
    """Run the spider: read configuration, seed the queue, run the workers.

    Steps:
      1. Initialise logging and parse ``sys.argv``; the result is handed to
         ``conf_parser.conf_parser``, which is expected to populate the
         ``gl_value`` settings used below.
      2. Seed the URL queue from the file named by
         ``gl_value.URL_LIST_FILE``.  Only lines starting with ``http`` are
         used; any other line (blank line, comment, garbage) is skipped.
      3. Start ``gl_value.THREAD_COUNT`` ``SpiderThread`` workers, one per
         second, and block until every worker has finished.

    Returns early, after logging an error, when the configuration cannot be
    read or the URL list file cannot be opened.
    """
    log.init_log('./log/spider')
    conf = opt_parser.opt_parser(sys.argv)

    # Initialise the global settings from the configuration.
    try:
        conf_parser.conf_parser(conf)
    except UnboundLocalError as msg:
        logging.error("Read conf fail. Message: %s", msg)
        return

    # State shared by all worker threads.
    lock = threading.Lock()
    url_queue = Queue.Queue()
    crawed_urls = set()

    # Seed the queue from the URL list file.
    try:
        fp = open(gl_value.URL_LIST_FILE)
    except IOError as msg:
        logging.error("Open url file %s fail. Message: %s",
                      gl_value.URL_LIST_FILE, msg)
        return

    # The context manager guarantees the handle is closed even if building
    # a Url object raises.
    with fp:
        for start_point in fp:
            # Skip unusable lines instead of abandoning the rest of the
            # file: a single blank line must not discard later seeds.
            if not start_point.startswith('http'):
                continue
            # Drop the line terminator and any trailing slash.
            url_queue.put(url_info.Url(start_point.strip('/\n\r')))

    # Start the workers, pausing one second before each start.
    threads = []
    for _ in xrange(gl_value.THREAD_COUNT):
        thread = SpiderThread(url_queue, lock, crawed_urls)
        threads.append(thread)
        time.sleep(1)
        thread.start()
    logging.info("Staring spider thread...")

    # Wait for every worker to finish.
    for thread in threads:
        thread.join()
    logging.info("Spider work is done!")
    # Parenthesised so the line is valid both as a Python 2 print statement
    # and as a print() call; the output is identical.
    print("Spider work is done!")
def test_parser(self):
    """Check that conf_parser loads the expected settings into gl_value.

    Parses ``self.conf_file`` and compares each ``gl_value`` setting with
    the value expected for that configuration.  A ``ValueError`` raised
    while parsing is reported as a test failure; previously it was only
    printed, which let the test pass without checking anything.

    NOTE(review): relies on ``self`` offering the ``unittest.TestCase``
    API (``assertEqual`` / ``fail``) -- confirm against the enclosing class.
    """
    # Only the parse call can raise ValueError; keep the assertions outside
    # the try so their failures are never confused with a parse error.
    try:
        conf_parser.conf_parser(self.conf_file)
    except ValueError as msg:
        self.fail("Try to read conf fail. Message: %s" % msg)

    self.assertEqual("./urls", gl_value.URL_LIST_FILE)
    self.assertEqual("../output", gl_value.OUTPUT_DIRECTORY)
    self.assertEqual(float(8), gl_value.MAX_DEPTH)
    self.assertEqual(0.1, gl_value.CRAWL_INTERVAL)
    self.assertEqual(float(1), gl_value.CRAWL_TIMEOUT)
    self.assertEqual(12, gl_value.THREAD_COUNT)
    self.assertEqual(".*.(gif|png|jpg|bmp)$", gl_value.TARGET_URL)