class Spider(object):

    def __init__(self):
        monkey.patch_all()
        self.queue = queue.Queue()
        self.pool = pool.Pool(
            int(CoreConfigure().get_config_section_map("spider")['concurrency']))
        self.url_table = UrlTable()
        self.timer = Timer(
            int(CoreConfigure().get_config_section_map("spider")['timeout']),
            self.stop)
        self._stop = event.Event()
        self.greenlet_finished = event.Event()
        self.root = None  # root UrlObj, set by set_root()
        self.initialize_db()

    def initialize_db(self):
        # read MongoDB connection settings from the configuration file
        host = CoreConfigure().get_config_section_map('db')['host']
        port = CoreConfigure().get_config_section_map('db')['port']
        db_name = CoreConfigure().get_config_section_map('db')['database']
        collect_name = CoreConfigure().get_config_section_map('db')['collection']
        self.db_cli = MongoClient(host, int(port))
        # mongodb collection used to store crawled pages
        self.collection = self.db_cli[db_name][collect_name]

    def set_root(self, url):
        if isinstance(url, basestring):
            url = UrlObj(url, type=0)
        self.root = url
        self.push_task(self.root)
        self.url_table.insert(self.root)

    def push_task(self, url):
        # url is a UrlObj; only enqueue urls that have not been seen yet
        if url not in self.url_table:
            self.queue.put(url)

    def run(self, url=None):
        begin = time.time()
        if url is None:
            # fall back to the root_url defined in the configuration file
            url = CoreConfigure().get_config_section_map('content')['root_url']
        self.set_root(url)
        self.timer.start()
        logger.info("spider begins crawling")
        while not self.stopped() and self.timer.isAlive():
            # reap dead greenlets so the pool does not fill up with them
            for greenlet in list(self.pool):
                if greenlet.dead:
                    self.pool.discard(greenlet)
            try:
                url = self.queue.get_nowait()
            except queue.Empty:
                if self.pool.free_count() != self.pool.size:
                    # workers are still running: wait until one finishes
                    # and flushes new urls into the queue
                    self.greenlet_finished.wait()
                    self.greenlet_finished.clear()
                    continue
                else:
                    # queue is empty and every worker is idle: we are done
                    self.stop()
                    break
            greenlet = Handler(url, self)
            self.pool.start(greenlet)
        logger.info("total time elapsed %0.2f" % (time.time() - begin))

    def stopped(self):
        return self._stop.is_set()

    def stop(self):
        logger.info("spider finished, caught %d urls in total" %
                    len(self.url_table))
        self.timer.cancel()
        self._stop.set()
        self.pool.join()
        self.queue.put(StopIteration)
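

# Minimal entry-point sketch (an illustrative assumption, not part of the
# original class): it relies on the project-internal helpers used above
# (CoreConfigure, UrlTable, UrlObj, Handler, logger) being importable and on
# the configuration file providing the 'spider', 'db', and 'content' sections.
if __name__ == '__main__':
    spider = Spider()
    # Crawl from the configured root_url; a url string could be passed
    # instead, e.g. spider.run("http://example.com") (hypothetical value),
    # to override the configuration.
    spider.run()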