Example #1
0
def main():
    """Entry point of the spider.

    Initialises logging, parses ``sys.argv`` into a configuration handle and
    loads it with ``conf_parser.conf_parser`` (per the original notes this is
    what sets the ``gl_value`` globals).  The URL queue is then seeded from
    ``gl_value.URL_LIST_FILE``, ``gl_value.THREAD_COUNT`` ``SpiderThread``
    workers are started, and the function blocks until all of them finish.

    Returns early (after logging an error) when the configuration cannot be
    read or the URL list file cannot be opened.
    """
    log.init_log('./log/spider')

    conf = opt_parser.opt_parser(sys.argv)

    # Initialise the global variables from the configuration.
    try:
        conf_parser.conf_parser(conf)
    except UnboundLocalError as msg:
        logging.error("Read conf fail. Message: %s", msg)
        return

    # State shared by every worker thread.
    lock = threading.Lock()
    url_queue = Queue.Queue()
    crawed_urls = set()

    # Seed the queue from the URL list file.
    try:
        fp = open(gl_value.URL_LIST_FILE)
    except IOError as msg:
        logging.error("Open url file %s fail. Message: %s",
                      gl_value.URL_LIST_FILE, msg)
        return
    # Close the file deterministically once the seeds are read.
    with fp:
        for start_point in fp:
            if not start_point.startswith('http'):
                # NOTE(review): the first non-http line (including a blank
                # one) ends seeding and drops every following URL; confirm
                # whether skipping the line (`continue`) was intended.
                break
            # Drop the line terminator and any trailing/leading slashes.
            url_queue.put(url_info.Url(start_point.strip('/\n\r')))

    # Start the workers, staggering each launch by one second.
    threads = []
    for _ in xrange(gl_value.THREAD_COUNT):
        thread = SpiderThread(url_queue, lock, crawed_urls)
        threads.append(thread)
        time.sleep(1)
        thread.start()
        logging.info("Staring spider thread...")

    # Wait for every worker to finish.
    for thread in threads:
        thread.join()
    logging.info("Spider work is done!")
    print("Spider work is done!")
    def test_parser(self):
        """Check that parsing ``self.conf_file`` sets the expected ``gl_value`` globals.

        The test fails explicitly when ``conf_parser.conf_parser`` raises
        ``ValueError``, instead of printing the error and passing without
        having checked anything.
        """
        # Only the parser call is guarded, so assertion failures below are
        # never mistaken for a configuration read error.
        try:
            conf_parser.conf_parser(self.conf_file)
        except ValueError as msg:
            self.fail("Try to read conf fail. Message: %s" % msg)

        self.assertEqual("./urls", gl_value.URL_LIST_FILE)
        self.assertEqual("../output", gl_value.OUTPUT_DIRECTORY)
        self.assertEqual(8.0, gl_value.MAX_DEPTH)
        self.assertEqual(0.1, gl_value.CRAWL_INTERVAL)
        self.assertEqual(1.0, gl_value.CRAWL_TIMEOUT)
        self.assertEqual(12, gl_value.THREAD_COUNT)
        self.assertEqual(".*.(gif|png|jpg|bmp)$", gl_value.TARGET_URL)
Example #3
0
    def test_parser(self):
        """Check that parsing ``self.conf_file`` sets the expected ``gl_value`` globals.

        The test fails explicitly when ``conf_parser.conf_parser`` raises
        ``ValueError``, instead of printing the error and passing without
        having checked anything.
        """
        # Only the parser call is guarded, so assertion failures below are
        # never mistaken for a configuration read error.
        try:
            conf_parser.conf_parser(self.conf_file)
        except ValueError as msg:
            self.fail("Try to read conf fail. Message: %s" % msg)

        self.assertEqual("./urls", gl_value.URL_LIST_FILE)
        self.assertEqual("../output", gl_value.OUTPUT_DIRECTORY)
        self.assertEqual(8.0, gl_value.MAX_DEPTH)
        self.assertEqual(0.1, gl_value.CRAWL_INTERVAL)
        self.assertEqual(1.0, gl_value.CRAWL_TIMEOUT)
        self.assertEqual(12, gl_value.THREAD_COUNT)
        self.assertEqual(".*.(gif|png|jpg|bmp)$", gl_value.TARGET_URL)