class Spider(object):
    """Crawl pages starting from a seed URL, scheduling downloads on a ThreadPool.

    The URL bookkeeping lists are plain attributes:

    * ``all_url_list``      -- URLs known to the crawler (starts as ``[seed]``).
    * ``finished_url_list`` -- URLs whose page body was read successfully.
    * ``failure_url_list``  -- URLs whose download or link extraction raised.
    """

    def __init__(self, seed, depth, pool_size=10):
        """Store the crawl parameters and create the worker pool.

        Args:
            seed: first URL to download.
            depth: number of extra passes ``crawl`` makes after the first one.
            pool_size: value handed to ``ThreadPool``.
        """
        self.seed = seed
        self.depth = depth
        self.all_url_list = [seed]
        self.finished_url_list = []
        self.failure_url_list = []
        self.pool = ThreadPool(pool_size)

    def crawl(self):
        """Submit every not-yet-finished URL to the pool, once per depth level.

        Makes ``depth + 1`` passes over ``all_url_list``. The configured
        ``self.depth`` is left untouched; a local counter tracks the passes.
        """
        base_deep_size = 0
        remaining_depth = self.depth
        while base_deep_size <= remaining_depth:
            for url in self.all_url_list:
                if url not in self.finished_url_list:
                    self.pool.add_task(self.download, url)
            # NOTE(review): the pool is closed after every pass; whether
            # ThreadPool accepts new tasks after close() is not visible
            # here -- confirm against the ThreadPool implementation.
            # NOTE(review): nothing visible in this class adds the links
            # returned by download() to all_url_list, so later passes see
            # the same URLs -- confirm whether the pool feeds them back.
            self.pool.close()
            remaining_depth -= 1

    def download(self, url):
        """Fetch *url* and extract its outgoing links.

        Returns:
            ``(page, links)`` on success, ``None`` when anything raised.
            On failure the URL is recorded once in ``failure_url_list``.
        """
        try:
            response = urllib2.urlopen(url)
            try:
                page = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
            self.finished_url_list.append(url)
            links = self.get_urls(page)
            return page, links
        except Exception as e:
            # Best-effort boundary: this runs as a pool task, so report the
            # problem and keep the crawl going instead of propagating.
            print('open url:%s raise exception(%s)' % (url, e))
            # Unfinished URLs are resubmitted on every pass; record each
            # failing URL only once.
            if url not in self.failure_url_list:
                self.failure_url_list.append(url)
            return None

    def get_urls(self, page):
        """Return the ``http://`` hrefs in *page* that are not finished yet.

        Prints the page title (when present) and the collected link list.
        """
        soup = BeautifulSoup(page, fromEncoding="gb18030")
        if soup.title:
            print(soup.title.string)
        links = []
        for item in soup.findAll('a'):
            link = item.get('href')
            if (link and link.startswith('http://')
                    and link not in self.finished_url_list):
                links.append(link)
        print(links)
        return links

    def get_next_url(self):
        """Placeholder; not implemented."""
        pass
def worker_routine(context, idx):
    """Serve dispatcher requests on a REP socket until an EXIT command arrives.

    A request is expected to look like ``"<client name> <arguments>"``.
    Every entry of ``CONF.clients.clients_list`` whose configured ``name``
    equals the client name is scheduled on a thread pool; the command that
    is run is the entry's ``exe_path`` followed by the arguments. The reply
    is ``OK`` when at least one client was scheduled and ``FAILED`` when the
    request is empty or no configured client matches.

    A SUB socket connected to ``url_worker_cmd`` carries control commands:
    ``EXIT`` ends the loop, ``CONFIG_CHANGED`` calls ``reload_config()``,
    anything else is ignored.

    Args:
        context: object whose ``socket(type)`` creates the sockets
            (presumably a ``zmq.Context`` -- confirm with the caller).
        idx: value used to build the ``Worker-<idx>`` socket identity.
    """
    from threadpool import ThreadPool, WorkRequest

    worker_id = 'Worker-%s' % idx

    def thread_work(client_exe_path, msg):
        """Run one client command to completion and return the command object."""
        # create a process to execute the client_exe
        # TODO: add execute timeout
        # NOTE(review): msg comes straight from the request socket and is
        # interpolated into the command string. Whether WrapCommand passes
        # it through a shell is not visible here -- confirm quoting.
        _cmd = '%s %s' % (client_exe_path, msg)
        cmd = WrapCommand(_cmd)
        cmd.start()
        logger.debug('Start command start %s' % _cmd)
        cmd.join()
        logger.debug('Start command end %s' % _cmd)
        return cmd

    def thread_work_cb(request, ret):
        """Log the outcome of a finished command (ret is thread_work's result)."""
        if ret.returncode == 0:
            # run cmd process in thread success
            logger.debug("execute %s success %s"
                         % (request.args[0], ret.returncode))
        else:
            logger.error(
                "execute %s failed %s \n%s\n"
                % (request.args[0], ret.returncode, ' '.join(ret.results)))

    def handle_exception(request, exc_info):
        """Log an exception raised by a work request."""
        if not isinstance(exc_info, tuple):
            # Something is seriously wrong...
            logger.debug(request)
            logger.debug(exc_info)
            raise SystemExit
        logger.debug("**** Exception occured in request #%s: %s"
                     % (request.requestID, exc_info))

    def register_task(client_exe_path, msg):
        """Queue one command execution on the thread pool."""
        request = WorkRequest(thread_work, (client_exe_path, msg), {},
                              callback=thread_work_cb,
                              exc_callback=handle_exception)
        thread_pool.putRequest(request)

    def get_client_exe_paths(client_name):
        """Return the exe_path of every configured client named client_name.

        Always returns a list; it is empty when nothing matches.
        """
        client_exe_paths = []
        for client_id in CONF.clients.clients_list:
            client_conf = getattr(CONF, client_id)
            if client_conf.name == client_name:
                client_exe_paths.append(client_conf.exe_path)
        return client_exe_paths

    def handle_request(raw):
        """Schedule the clients named in raw and return the reply to send."""
        # split(None, 1) tolerates leading whitespace and an empty request,
        # where indexing split()[0] and slicing by length did not.
        parts = raw.split(None, 1)
        if not parts:
            logger.error("Received an empty request, nothing to execute")
            return 'FAILED'
        client_name = parts[0]
        msg = parts[1].strip() if len(parts) > 1 else ''
        exe_paths = get_client_exe_paths(client_name)
        logger.debug("Client name \"%s\" message: %s" % (client_name, msg))
        logger.debug("Client exe path %s" % exe_paths)
        # get_client_exe_paths never returns None, so test for emptiness.
        if not exe_paths:
            logger.debug(
                "Can not execute client, because there have none clients in config file(%s), please check"
                % str(CONF.clients.clients_list))
            return 'FAILED'
        for exe_path in exe_paths:
            register_task(exe_path, msg)
        return "OK"

    # Socket to talk to dispatcher
    socket0 = context.socket(zmq.REP)
    socket_exit = context.socket(zmq.SUB)
    try:
        # The identity only applies to connections made after it is set,
        # so it has to be configured before connect().
        socket0.setsockopt(zmq.IDENTITY, worker_id)
        socket0.connect(url_worker)

        socket_exit.connect(url_worker_cmd)
        socket_exit.setsockopt(zmq.SUBSCRIBE, '')

        poller = zmq.Poller()
        poller.register(socket0, zmq.POLLIN)
        poller.register(socket_exit, zmq.POLLIN)

        thread_pool = ThreadPool(thread_num_of_worker)

        while True:
            # Wake up at least every 2000 ms even when nothing arrives.
            events = dict(poller.poll(2000))
            if events.get(socket0) == zmq.POLLIN:
                # Deal with message
                _msg = socket0.recv()
                print("Received request: [%s]\n" % (_msg))
                # send reply back to client; exactly one per received request
                socket0.send(handle_request(_msg))

            if events.get(socket_exit) == zmq.POLLIN:
                cmd = socket_exit.recv()
                logger.debug('%s CMD %s' % (worker_id, cmd))
                sys.stdout.flush()
                if cmd == 'EXIT':
                    break
                elif cmd == 'CONFIG_CHANGED':
                    reload_config()

        thread_pool.close()
        thread_pool.join()
    finally:
        # Close the sockets even when the loop ends with an exception.
        socket0.close()
        socket_exit.close()
def worker_routine(context, idx):
    """Worker routine: execute configured clients on request.

    Polls two sockets created from *context*:

    * a REP socket connected to ``url_worker``. Each request is
      ``"<client name> <arguments>"``; every client in
      ``CONF.clients.clients_list`` whose ``name`` matches is queued on a
      thread pool as ``"<exe_path> <arguments>"``. The reply is ``OK`` if at
      least one client was queued, otherwise ``FAILED`` (including for an
      empty request).
    * a SUB socket connected to ``url_worker_cmd``. ``EXIT`` leaves the
      loop, ``CONFIG_CHANGED`` calls ``reload_config()``, other commands
      are ignored.

    On normal exit the thread pool is closed and joined; the sockets are
    closed on every exit path.

    Args:
        context: factory for the sockets via ``context.socket(type)``
            (presumably a ``zmq.Context`` -- confirm with the caller).
        idx: worker index, used for the ``Worker-<idx>`` identity.
    """
    from threadpool import ThreadPool, WorkRequest

    worker_id = 'Worker-%s' % idx

    def thread_work(client_exe_path, msg):
        """Start the client command, wait for it, and return the command object."""
        # create a process to execute the client_exe
        # TODO: add execute timeout
        # NOTE(review): msg is network input placed verbatim into the
        # command string; how WrapCommand executes it is not visible here,
        # so confirm that arguments cannot inject extra commands.
        _cmd = '%s %s' % (client_exe_path, msg)
        cmd = WrapCommand(_cmd)
        cmd.start()
        logger.debug('Start command start %s', _cmd)
        cmd.join()
        logger.debug('Start command end %s', _cmd)
        return cmd

    def thread_work_cb(request, ret):
        """Report success or failure of a finished command."""
        if ret.returncode == 0:
            # run cmd process in thread success
            logger.debug("execute %s success %s",
                         request.args[0], ret.returncode)
        else:
            logger.error("execute %s failed %s \n%s\n",
                         request.args[0], ret.returncode,
                         ' '.join(ret.results))

    def handle_exception(request, exc_info):
        """Report an exception raised while processing a work request."""
        if not isinstance(exc_info, tuple):
            # Something is seriously wrong...
            logger.debug(request)
            logger.debug(exc_info)
            raise SystemExit
        logger.debug("**** Exception occured in request #%s: %s",
                     request.requestID, exc_info)

    def register_task(client_exe_path, msg):
        """Put one command execution request on the thread pool."""
        request = WorkRequest(thread_work, (client_exe_path, msg), {},
                              callback=thread_work_cb,
                              exc_callback=handle_exception)
        thread_pool.putRequest(request)

    def get_client_exe_paths(client_name):
        """Collect exe_path for all configured clients called client_name.

        The result is always a list (possibly empty), never None.
        """
        client_exe_paths = []
        for client_id in CONF.clients.clients_list:
            client_conf = getattr(CONF, client_id)
            if client_conf.name == client_name:
                client_exe_paths.append(client_conf.exe_path)
        return client_exe_paths

    # Socket to talk to dispatcher
    socket0 = context.socket(zmq.REP)
    socket_exit = context.socket(zmq.SUB)
    try:
        # Set the identity first: it is only applied to connections that
        # are established after the option has been set.
        socket0.setsockopt(zmq.IDENTITY, worker_id)
        socket0.connect(url_worker)

        socket_exit.connect(url_worker_cmd)
        socket_exit.setsockopt(zmq.SUBSCRIBE, '')

        poller = zmq.Poller()
        poller.register(socket0, zmq.POLLIN)
        poller.register(socket_exit, zmq.POLLIN)

        thread_pool = ThreadPool(thread_num_of_worker)

        while True:
            events = dict(poller.poll(2000))

            if events.get(socket0) == zmq.POLLIN:
                # Deal with message
                _msg = socket0.recv()
                print("Received request: [%s]\n" % (_msg))

                # First whitespace-delimited token is the client name, the
                # remainder its arguments. An empty request yields no name
                # and therefore no executables.
                fields = _msg.split(None, 1)
                client_name = fields[0] if fields else ''
                msg = fields[1].strip() if len(fields) > 1 else ''
                if client_name:
                    exe_paths = get_client_exe_paths(client_name)
                else:
                    exe_paths = []
                logger.debug("Client name \"%s\" message: %s",
                             client_name, msg)
                logger.debug("Client exe path %s", exe_paths)

                # An empty list means no configured client matched; the
                # lookup never returns None, so test truthiness.
                if exe_paths:
                    for exe_path in exe_paths:
                        register_task(exe_path, msg)
                    # send reply back to client
                    socket0.send("OK")
                else:
                    logger.debug(
                        "Can not execute client, because there have none clients in config file(%s), please check",
                        str(CONF.clients.clients_list))
                    socket0.send('FAILED')

            if events.get(socket_exit) == zmq.POLLIN:
                cmd = socket_exit.recv()
                logger.debug('%s CMD %s', worker_id, cmd)
                sys.stdout.flush()
                if cmd == 'EXIT':
                    break
                elif cmd == 'CONFIG_CHANGED':
                    reload_config()

        thread_pool.close()
        thread_pool.join()
    finally:
        # Always release the sockets, including when the loop raises.
        socket0.close()
        socket_exit.close()