def __init__(self, job, nodes, rpc_server, context=None, copies=2):
    """Initialise the loader state and publish its control methods.

    :param job: job object; ``job.context`` is used when ``context`` is falsy.
    :param nodes: list of node addresses handed to the message-queue client.
    :param rpc_server: object whose ``register_function`` receives the
        control methods of this loader.
    :param context: optional context taking precedence over ``job.context``.
    :param copies: passed through to ``MessageQueueClient`` as ``copies``.
    """
    self.job = job
    self.ctx = context if context else job.context
    self.nodes = nodes
    self.mq_client = MessageQueueClient(self.nodes, copies=copies)

    # Work on a copy so removing entries never touches ``self.nodes``.
    self.not_registered = self.nodes[:]
    self.is_ready = False
    self.stopped = False

    # destination size: a positive value switches the size limit on
    self.size = self.ctx.job.size
    self.limit_size = self.size > 0
    self.finishes = 0

    # speed limits: a positive value switches the speed limit on
    self.limits = self.ctx.job.limits
    self.limit_speed = self.limits > 0
    self.in_minute = 0

    # register rpc server: every method is exposed under its own name
    exposed = ('ready', 'complete', 'get_nodes', 'require',
               'stop', 'add_node', 'remove_node')
    for rpc_name in exposed:
        rpc_server.register_function(getattr(self, rpc_name), rpc_name)

    # register signal: both signals are routed to the same handler
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self.signal_handler)
def __init__(self, job, data_dir, nodes, client=None, context=None,
             copies=1, force=False):
    """Initialise the master loader: base classes, queue, locks, logging, RPC.

    :param job: job object; ``job.context`` is used when ``context`` is falsy.
    :param data_dir: forwarded to ``JobLoader.__init__``.
    :param nodes: list of worker addresses.
    :param client: optional client address; when given, a log handler for
        it is attached to the master logger.
    :param context: optional context taking precedence over ``job.context``.
    :param copies: forwarded to ``JobLoader.__init__`` and to
        ``MessageQueueClient``.
    :param force: forwarded to ``JobLoader.__init__``.
    """
    ctx = context if context else job.context
    port = ctx.job.master_port
    local = '%s:%s' % (get_ip(), port)

    JobLoader.__init__(self, job, data_dir, local,
                       context=ctx, copies=copies, force=force)
    LimitionJobLoader.__init__(self, job, context=ctx)

    # check
    self.check()

    self.nodes = nodes
    # Independent copies: one tracks registration, the other completion.
    self.not_registered = self.nodes[:]
    self.not_finished = self.nodes[:]

    # mq
    self.mq_client = MessageQueueClient(self.nodes, copies=copies)

    # lock: both locks start out held and are released by other methods
    self.ready_lock = threading.Lock()
    self.ready_lock.acquire()
    self.finish_lock = threading.Lock()
    self.finish_lock.acquire()

    # logger
    logger_name = 'cola_master_%s' % self.job.real_name
    log_path = os.path.join(self.root, 'job.log')
    self.logger = get_logger(name=logger_name, filename=log_path,
                             is_master=True)
    self.client = client
    self.client_handler = (
        add_log_client(self.logger, self.client)
        if self.client is not None else None
    )

    self.init_rpc_server()
    self.init_rate_clear()
    self.init_logger_server(self.logger)

    # register rpc server: every method is exposed under its own name
    exposed = ('client_stop', 'ready', 'worker_finish', 'complete',
               'error', 'get_nodes', 'apply', 'require', 'stop',
               'add_node', 'remove_node')
    for rpc_name in exposed:
        self.rpc_server.register_function(getattr(self, rpc_name), rpc_name)

    # register signal
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self.signal_handler)
def put_starts(master=None):
    """Push the lines of the ``keywords_f`` file into the message queue.

    Lines are sent in batches of ``PUTSIZE``; a final, smaller batch is sent
    for whatever is left over.

    :param master: address of the master to ask for the node list via the
        ``get_nodes`` RPC. When ``None``, a single local node built from
        ``get_ip()`` and ``user_config.job.port`` is used instead.
    """
    if master is None:
        nodes = ['%s:%s' % (get_ip(), user_config.job.port)]
    else:
        nodes = client_call(master, 'get_nodes')

    mq_client = MessageQueueClient(nodes)
    with open(keywords_f) as f:
        keys = []
        # Iterating the file object reads lazily on Python 2 and Python 3;
        # ``xreadlines`` exists on Python 2 only.
        # NOTE(review): lines are queued with their trailing newline, exactly
        # as before -- confirm the consumers expect that.
        for keyword in f:
            keys.append(keyword)
            if len(keys) >= PUTSIZE:
                mq_client.put(keys)
                keys = []
        if keys:
            mq_client.put(keys)
def setUp(self):
    """Start one RPC server and one message queue per port, plus a client.

    Each queue gets two fresh temporary directories for its store, and each
    server runs ``serve_forever`` in a daemon thread.
    """
    ports = (11111, 11211, 11311)
    self.nodes = ['localhost:%s' % port for port in ports]
    # Two temporary directories per node.
    self.dirs = [tempfile.mkdtemp() for _ in range(2 * len(ports))]
    self.size = len(ports)

    for idx, port in enumerate(ports):
        server = ColaRPCServer(('localhost', port))
        setattr(self, 'rpc_server%s' % idx, server)

        queue = MessageQueue(self.nodes[:], self.nodes[idx], server)
        setattr(self, 'mq%s' % idx, queue)
        queue.init_store(self.dirs[2 * idx], self.dirs[2 * idx + 1])

        serving = threading.Thread(target=server.serve_forever)
        serving.setDaemon(True)
        serving.start()

    self.client = MessageQueueClient(self.nodes)
def setUp(self):
    """Bring up three local queue nodes and a client connected to them.

    For every port an RPC server and a message queue are created and stored
    on the test case as ``rpc_server<i>`` / ``mq<i>``; the server is served
    from a daemon thread.
    """
    ports = (11111, 11211, 11311)
    self.nodes = ['localhost:%s' % port for port in ports]
    self.dirs = [tempfile.mkdtemp() for _ in range(2 * len(ports))]
    self.size = len(ports)

    for i, port in enumerate(ports):
        rpc_server = ColaRPCServer(('localhost', port))
        setattr(self, 'rpc_server%s' % i, rpc_server)

        mq = MessageQueue(self.nodes[:], self.nodes[i], rpc_server)
        setattr(self, 'mq%s' % i, mq)
        # Directories 2*i and 2*i+1 belong to node i.
        first_dir, second_dir = self.dirs[2 * i], self.dirs[2 * i + 1]
        mq.init_store(first_dir, second_dir)

        runner = threading.Thread(target=rpc_server.serve_forever)
        runner.setDaemon(True)
        runner.start()

    self.client = MessageQueueClient(self.nodes)
def put_starts(master=None):
    """Push the ``starts`` urls into the queue of the "douban movie" job.

    The master is asked for its runnable jobs and its workers; the urls are
    then sent to the workers' message queue in batches of ``PUTSIZE``.

    :param master: ``'host:port'`` address of the master. When ``None`` it is
        built from ``get_ip()`` and ``user_config.master.port``.
    :raises Exception: when no runnable job is named "douban movie".
    """
    if master is None:
        # Must be a plain 'host:port' string: it is split on ':' further
        # down, which a list (as built previously) does not support.
        master = '%s:%s' % (get_ip(), user_config.master.port)
    print('master:%s' % master)

    jobs = client_call(master, 'runnable_jobs')
    app_name = ''
    for candidate, job_name in jobs.items():
        if job_name == "douban movie":
            app_name = candidate
            break
    if not app_name:
        raise Exception('douban movie job has not upload')

    nodes = client_call(master, 'list_workers')
    addrs = []
    default_addr = master.split(':')[0]
    for worker_addr, _status in nodes:
        host, port = worker_addr.split(':')
        if host.lower() == 'localhost':
            # Workers reporting 'localhost' are addressed via the master's
            # host instead.
            addrs.append('%s:%s' % (default_addr, port))
        else:
            addrs.append(worker_addr)

    mq_client = MessageQueueClient(addrs, app_name)
    # NOTE(review): this calls ``get()`` on the queue client before anything
    # is put -- kept as-is, confirm it is intended and not leftover debugging.
    print('get:%s' % mq_client.get())

    urls = []
    for url in starts:
        urls.append(url)
        if len(urls) >= PUTSIZE:
            mq_client.put(urls)
            urls = []
    if urls:
        mq_client.put(urls)
class Test(unittest.TestCase):
    """Exercises ``MessageQueue`` / ``MessageQueueClient`` on three local nodes."""

    def setUp(self):
        """Start one RPC server and one message queue per port, plus a client."""
        ports = (11111, 11211, 11311)
        self.nodes = ['localhost:%s' % port for port in ports]
        self.dirs = [tempfile.mkdtemp() for _ in range(2 * len(ports))]
        self.size = len(ports)

        for idx, port in enumerate(ports):
            server = ColaRPCServer(('localhost', port))
            setattr(self, 'rpc_server%s' % idx, server)

            queue = MessageQueue(self.nodes[:], self.nodes[idx], server)
            setattr(self, 'mq%s' % idx, queue)
            queue.init_store(self.dirs[2 * idx], self.dirs[2 * idx + 1])

            serving = threading.Thread(target=server.serve_forever)
            serving.setDaemon(True)
            serving.start()

        self.client = MessageQueueClient(self.nodes)

    def tearDown(self):
        """Shut down every server and queue; always delete the temp dirs."""
        try:
            for idx in range(self.size):
                getattr(self, 'rpc_server%s' % idx).shutdown()
                getattr(self, 'mq%s' % idx).shutdown()
        finally:
            for directory in self.dirs:
                shutil.rmtree(directory)

    def _drain(self, mq):
        """Return everything ``mq.get()`` yields until it returns ``None``."""
        collected = []
        item = mq.get()
        while item is not None:
            collected.append(item)
            item = mq.get()
        return collected

    def testMQ(self):
        """Everything put on a queue comes back; the client round-trips too."""
        mq = self.mq0
        data = [str(random.randint(10000, 50000)) for _ in range(20)]

        mq.put(data)
        gets = self._drain(mq)
        self.assertEqual(sorted(data), sorted(gets))

        # test mq client
        data = str(random.randint(10000, 50000))
        self.client.put(data)
        get = self.client.get()
        self.assertEqual(data, get)

    def testAddOrRemoveNode(self):
        """Data stays retrievable from node 0 after node 2 is removed."""
        mq = self.mq0
        data = [str(number) for number in range(100)]

        mq.put(data)
        self.mq2.shutdown()

        # The test's own node list must not be changed by remove_node.
        self.assertEqual(len(self.nodes), 3)
        self.mq0.remove_node(self.nodes[2])
        self.assertEqual(len(self.nodes), 3)
        self.mq1.remove_node(self.nodes[2])

        gets = self._drain(mq)
        self.assertEqual(sorted(data), sorted(gets))
class MasterJobLoader(LimitionJobLoader, JobLoader):
    """Master side of a job.

    Waits until every worker node has reported ready, seeds the message
    queue with ``job.starts``, tells the nodes to run and then waits until
    every node has reported that it finished.
    """

    def __init__(self, job, data_dir, nodes, local_ip=None, client=None,
                 context=None, copies=1, force=False):
        """Initialise base classes, queue client, locks, logging and RPC.

        :param job: job object; ``job.context`` is used when ``context`` is
            falsy.
        :param data_dir: forwarded to ``JobLoader.__init__``.
        :param nodes: list of worker addresses.
        :param local_ip: IP to bind to; defaults to ``get_ip()``.
        :param client: optional client address that receives log records.
        :param context: optional context taking precedence over
            ``job.context``.
        :param copies: forwarded to ``JobLoader.__init__`` and
            ``MessageQueueClient``.
        :param force: forwarded to ``JobLoader.__init__``.
        :raises ValueError: if ``local_ip`` is given but not in ``get_ips()``.
        """
        ctx = context or job.context
        master_port = ctx.job.master_port
        if local_ip is None:
            local_ip = get_ip()
        else:
            choices_ips = get_ips()
            if local_ip not in choices_ips:
                raise ValueError('IP address must be one of (%s)'
                                 % ','.join(choices_ips))
        local = '%s:%s' % (local_ip, master_port)

        JobLoader.__init__(self, job, data_dir, local,
                           context=ctx, copies=copies, force=force)
        LimitionJobLoader.__init__(self, job, context=ctx)

        # check
        self.check()

        self.nodes = nodes
        self.not_registered = self.nodes[:]
        self.not_finished = self.nodes[:]

        # mq
        self.mq_client = MessageQueueClient(self.nodes, copies=copies)

        # lock: both start out held; ``ready`` / ``worker_finish`` release them
        self.ready_lock = threading.Lock()
        self.ready_lock.acquire()
        self.finish_lock = threading.Lock()
        self.finish_lock.acquire()

        # logger
        self.logger = get_logger(
            name='cola_master_%s' % self.job.real_name,
            filename=os.path.join(self.root, 'job.log'),
            is_master=True)
        self.client = client
        self.client_handler = None
        if self.client is not None:
            self.client_handler = add_log_client(self.logger, self.client)

        self.init_rpc_server()
        self.init_rate_clear()
        self.init_logger_server(self.logger)

        # register rpc server
        self.rpc_server.register_function(self.client_stop, 'client_stop')
        self.rpc_server.register_function(self.ready, 'ready')
        self.rpc_server.register_function(self.worker_finish, 'worker_finish')
        self.rpc_server.register_function(self.complete, 'complete')
        self.rpc_server.register_function(self.error, 'error')
        self.rpc_server.register_function(self.get_nodes, 'get_nodes')
        self.rpc_server.register_function(self.apply, 'apply')
        self.rpc_server.register_function(self.require, 'require')
        self.rpc_server.register_function(self.stop, 'stop')
        self.rpc_server.register_function(self.add_node, 'add_node')
        self.rpc_server.register_function(self.remove_node, 'remove_node')

        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def init_logger_server(self, logger):
        """Start a ``LogRecordSocketReceiver`` for ``logger`` in a thread."""
        self.log_server = LogRecordSocketReceiver(host=get_ip(), logger=logger)
        threading.Thread(target=self.log_server.serve_forever).start()

    def stop_logger_server(self):
        """Shut down and stop the log server if one was started."""
        if hasattr(self, 'log_server'):
            self.log_server.shutdown()
            self.log_server.stop()

    def client_stop(self):
        """Detach the client log handler from the logger, if there is one."""
        if self.client_handler is not None:
            self.logger.removeHandler(self.client_handler)

    def check(self):
        """Raise ``JobMasterRunning`` when ``check_env`` reports a conflict."""
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobMasterRunning('There has been a running job master.')

    def release_lock(self, lock):
        """Release ``lock``; releasing an already unlocked lock is ignored."""
        try:
            lock.release()
        except (RuntimeError, threading.ThreadError):
            # Raised when the lock is not held; nothing to undo then.
            pass

    def finish(self):
        """Release the locks, finish the base loaders and close logging."""
        self.release_lock(self.ready_lock)
        self.release_lock(self.finish_lock)

        LimitionJobLoader.finish(self)
        JobLoader.finish(self)
        self.stop_logger_server()

        # Best effort: a handler that fails to close must not keep the
        # remaining handlers from being closed.
        for handler in list(self.logger.handlers):
            try:
                handler.close()
            except Exception:
                pass

        if self.client is not None:
            rpc_client = '%s:%s' % (
                self.client.split(':')[0],
                main_conf.client.port
            )
            client_call(rpc_client, 'stop', ignore=True)

        self.stopped = True

    def stop(self):
        """Ask every node to stop (socket errors ignored), then finish."""
        for node in self.nodes:
            try:
                client_call(node, 'stop')
            except socket.error:
                pass
        self.finish()

    def signal_handler(self, signum, frame):
        """Signal handler: stop the job."""
        self.stop()

    def get_nodes(self):
        """Return the list of worker nodes."""
        return self.nodes

    def ready(self, node):
        """Mark ``node`` as registered; release ``ready_lock`` once none remain."""
        if node in self.not_registered:
            self.not_registered.remove(node)

        if len(self.not_registered) == 0:
            self.ready_lock.release()

    def worker_finish(self, node):
        """Mark ``node`` as finished; release ``finish_lock`` once none remain."""
        if node in self.not_finished:
            self.not_finished.remove(node)

        if len(self.not_finished) == 0:
            self.finish_lock.release()

    def add_node(self, node):
        """Announce ``node`` to every known node, record it and start it.

        :param node: address of the node to add.
        """
        # A separate loop name keeps the ``node`` argument intact; reusing
        # it made every node receive its own address instead of the new one.
        for member in self.nodes:
            client_call(member, 'add_node', node)
        self.nodes.append(node)
        client_call(node, 'run')

    def remove_node(self, node):
        """Tell every known node to drop ``node`` and forget it locally.

        :param node: address of the node to remove.
        """
        for member in self.nodes:
            client_call(member, 'remove_node', node)
        if node in self.nodes:
            self.nodes.remove(node)

    def run(self):
        """Run the job: wait for workers, seed the queue, wait for the end."""
        # Blocks until ``ready`` (or ``finish``) releases the lock.
        self.ready_lock.acquire()

        if not self.stopped and len(self.not_registered) == 0:
            self.mq_client.put(self.job.starts)
            for node in self.nodes:
                client_call(node, 'run')

        # Blocks until ``worker_finish`` (or ``finish``) releases the lock.
        self.finish_lock.acquire()

        try:
            master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
            client_call(master_watcher, 'finish_job', self.job.real_name)
        except socket.error:
            pass

    def __enter__(self):
        return self

    def __exit__(self, type_, value, traceback):
        # Leaving the ``with`` block always finishes the job.
        self.finish()
def __init__(self, job, data_dir, nodes, local_ip=None, client=None,
             context=None, copies=1, force=False):
    """Initialise the master loader: base classes, queue, locks, logging, RPC.

    :param job: job object; ``job.context`` is used when ``context`` is falsy.
    :param data_dir: forwarded to ``JobLoader.__init__``.
    :param nodes: list of worker addresses.
    :param local_ip: IP used for the local address; ``get_ip()`` when ``None``.
    :param client: optional client address; when given, a log handler for
        it is attached to the master logger.
    :param context: optional context taking precedence over ``job.context``.
    :param copies: forwarded to ``JobLoader.__init__`` and to
        ``MessageQueueClient``.
    :param force: forwarded to ``JobLoader.__init__``.
    :raises ValueError: if ``local_ip`` is given but not listed by ``get_ips()``.
    """
    ctx = context if context else job.context
    port = ctx.job.master_port

    if local_ip is not None:
        # An explicit address has to be one of this machine's addresses.
        allowed = get_ips()
        if local_ip not in allowed:
            raise ValueError('IP address must be one of (%s)' % ','.join(allowed))
    else:
        local_ip = get_ip()
    local = '%s:%s' % (local_ip, port)

    JobLoader.__init__(self, job, data_dir, local,
                       context=ctx, copies=copies, force=force)
    LimitionJobLoader.__init__(self, job, context=ctx)

    # check
    self.check()

    self.nodes = nodes
    # Independent copies: one tracks registration, the other completion.
    self.not_registered = self.nodes[:]
    self.not_finished = self.nodes[:]

    # mq
    self.mq_client = MessageQueueClient(self.nodes, copies=copies)

    # lock: both locks start out held and are released by other methods
    self.ready_lock = threading.Lock()
    self.ready_lock.acquire()
    self.finish_lock = threading.Lock()
    self.finish_lock.acquire()

    # logger
    logger_name = 'cola_master_%s' % self.job.real_name
    log_path = os.path.join(self.root, 'job.log')
    self.logger = get_logger(name=logger_name, filename=log_path,
                             is_master=True)
    self.client = client
    self.client_handler = (
        add_log_client(self.logger, self.client)
        if self.client is not None else None
    )

    self.init_rpc_server()
    self.init_rate_clear()
    self.init_logger_server(self.logger)

    # register rpc server: every method is exposed under its own name
    exposed = ('client_stop', 'ready', 'worker_finish', 'complete',
               'error', 'get_nodes', 'apply', 'require', 'stop',
               'add_node', 'remove_node')
    for rpc_name in exposed:
        self.rpc_server.register_function(getattr(self, rpc_name), rpc_name)

    # register signal
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self.signal_handler)
class MasterJobLoader(LimitionJobLoader, JobLoader):
    """Master side of a job.

    Waits until every worker node has reported ready, seeds the message
    queue with ``job.starts``, tells the nodes to run and then waits until
    every node has reported that it finished.
    """

    def __init__(self, job, data_dir, nodes, local_ip=None, client=None,
                 context=None, copies=1, force=False):
        """Initialise base classes, queue client, locks, logging and RPC.

        :param job: job object; ``job.context`` is used when ``context`` is
            falsy.
        :param data_dir: forwarded to ``JobLoader.__init__``.
        :param nodes: list of worker addresses.
        :param local_ip: IP to bind to; defaults to ``get_ip()``.
        :param client: optional client address that receives log records.
        :param context: optional context taking precedence over
            ``job.context``.
        :param copies: forwarded to ``JobLoader.__init__`` and
            ``MessageQueueClient``.
        :param force: forwarded to ``JobLoader.__init__``.
        :raises ValueError: if ``local_ip`` is given but not in ``get_ips()``.
        """
        ctx = context or job.context
        master_port = ctx.job.master_port
        if local_ip is None:
            local_ip = get_ip()
        else:
            choices_ips = get_ips()
            if local_ip not in choices_ips:
                raise ValueError('IP address must be one of (%s)'
                                 % ','.join(choices_ips))
        local = '%s:%s' % (local_ip, master_port)

        JobLoader.__init__(self, job, data_dir, local,
                           context=ctx, copies=copies, force=force)
        LimitionJobLoader.__init__(self, job, context=ctx)

        # check
        self.check()

        self.nodes = nodes
        self.not_registered = self.nodes[:]
        self.not_finished = self.nodes[:]

        # mq
        self.mq_client = MessageQueueClient(self.nodes, copies=copies)

        # lock: both start out held; ``ready`` / ``worker_finish`` release them
        self.ready_lock = threading.Lock()
        self.ready_lock.acquire()
        self.finish_lock = threading.Lock()
        self.finish_lock.acquire()

        # logger
        self.logger = get_logger(name='cola_master_%s' % self.job.real_name,
                                 filename=os.path.join(self.root, 'job.log'),
                                 is_master=True)
        self.client = client
        self.client_handler = None
        if self.client is not None:
            self.client_handler = add_log_client(self.logger, self.client)

        self.init_rpc_server()
        self.init_rate_clear()
        self.init_logger_server(self.logger)

        # register rpc server
        self.rpc_server.register_function(self.client_stop, 'client_stop')
        self.rpc_server.register_function(self.ready, 'ready')
        self.rpc_server.register_function(self.worker_finish, 'worker_finish')
        self.rpc_server.register_function(self.complete, 'complete')
        self.rpc_server.register_function(self.error, 'error')
        self.rpc_server.register_function(self.get_nodes, 'get_nodes')
        self.rpc_server.register_function(self.apply, 'apply')
        self.rpc_server.register_function(self.require, 'require')
        self.rpc_server.register_function(self.stop, 'stop')
        self.rpc_server.register_function(self.add_node, 'add_node')
        self.rpc_server.register_function(self.remove_node, 'remove_node')

        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def init_logger_server(self, logger):
        """Start a ``LogRecordSocketReceiver`` for ``logger`` in a thread."""
        self.log_server = LogRecordSocketReceiver(host=get_ip(), logger=logger)
        threading.Thread(target=self.log_server.serve_forever).start()

    def stop_logger_server(self):
        """Shut down the log server if one was started."""
        if hasattr(self, 'log_server'):
            self.log_server.shutdown()

    def client_stop(self):
        """Detach the client log handler from the logger, if there is one."""
        if self.client_handler is not None:
            self.logger.removeHandler(self.client_handler)

    def check(self):
        """Raise ``JobMasterRunning`` when ``check_env`` reports a conflict."""
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobMasterRunning('There has been a running job master.')

    def release_lock(self, lock):
        """Release ``lock``; releasing an already unlocked lock is ignored."""
        try:
            lock.release()
        except (RuntimeError, threading.ThreadError):
            # Raised when the lock is not held; nothing to undo then.
            pass

    def finish(self):
        """Release the locks, finish the base loaders and close logging."""
        self.release_lock(self.ready_lock)
        self.release_lock(self.finish_lock)

        LimitionJobLoader.finish(self)
        JobLoader.finish(self)
        self.stop_logger_server()

        # Best effort: a handler that fails to close must not keep the
        # remaining handlers from being closed.
        for handler in list(self.logger.handlers):
            try:
                handler.close()
            except Exception:
                pass

        if self.client is not None:
            rpc_client = '%s:%s' % (self.client.split(':')[0],
                                    main_conf.client.port)
            client_call(rpc_client, 'stop', ignore=True)

        self.stopped = True

    def stop(self):
        """Ask every node to stop, then finish the job."""
        for node in self.nodes:
            client_call(node, 'stop', ignore=True)
        self.finish()

    def signal_handler(self, signum, frame):
        """Signal handler: stop the job."""
        self.stop()

    def get_nodes(self):
        """Return the list of worker nodes."""
        return self.nodes

    def ready(self, node):
        """Mark ``node`` as registered; release ``ready_lock`` once none remain."""
        if node in self.not_registered:
            self.not_registered.remove(node)

        if len(self.not_registered) == 0:
            self.ready_lock.release()

    def worker_finish(self, node):
        """Mark ``node`` as finished; release ``finish_lock`` once none remain."""
        if node in self.not_finished:
            self.not_finished.remove(node)

        if len(self.not_finished) == 0:
            self.finish_lock.release()

    def add_node(self, node):
        """Announce ``node`` to every known node, record it and start it.

        :param node: address of the node to add.
        """
        # A separate loop name keeps the ``node`` argument intact; reusing
        # it made every node receive its own address instead of the new one.
        for member in self.nodes:
            client_call(member, 'add_node', node, ignore=True)
        self.nodes.append(node)
        client_call(node, 'run', ignore=True)

    def remove_node(self, node):
        """Tell every known node to drop ``node`` and forget it locally.

        :param node: address of the node to remove.
        """
        for member in self.nodes:
            client_call(member, 'remove_node', node, ignore=True)
        if node in self.nodes:
            self.nodes.remove(node)

    def run(self):
        """Run the job: wait for workers, seed the queue, wait for the end."""
        # Blocks until ``ready`` (or ``finish``) releases the lock.
        self.ready_lock.acquire()

        if not self.stopped and len(self.not_registered) == 0:
            self.mq_client.put(self.job.starts)
            for node in self.nodes:
                client_call(node, 'run')

        # Blocks until ``worker_finish`` (or ``finish``) releases the lock.
        self.finish_lock.acquire()

        master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
        client_call(master_watcher, 'finish_job', self.job.real_name,
                    ignore=True)

    def __enter__(self):
        return self

    def __exit__(self, type_, value, traceback):
        # Leaving the ``with`` block always finishes the job.
        self.finish()
class Test(unittest.TestCase):
    """Exercises ``MessageQueue`` / ``MessageQueueClient`` on three local nodes."""

    def setUp(self):
        """Start one RPC server and one message queue per port, plus a client."""
        ports = (11111, 11211, 11311)
        self.nodes = ['localhost:%s' % port for port in ports]
        self.dirs = [tempfile.mkdtemp() for _ in range(2 * len(ports))]
        self.size = len(ports)

        for i, port in enumerate(ports):
            rpc_server = ColaRPCServer(('localhost', port))
            setattr(self, 'rpc_server%s' % i, rpc_server)

            mq = MessageQueue(self.nodes[:], self.nodes[i], rpc_server)
            setattr(self, 'mq%s' % i, mq)
            # Directories 2*i and 2*i+1 belong to node i.
            mq.init_store(self.dirs[2 * i], self.dirs[2 * i + 1])

            runner = threading.Thread(target=rpc_server.serve_forever)
            runner.setDaemon(True)
            runner.start()

        self.client = MessageQueueClient(self.nodes)

    def tearDown(self):
        """Shut down every server and queue; always delete the temp dirs."""
        try:
            for i in range(self.size):
                getattr(self, 'rpc_server%s' % i).shutdown()
                getattr(self, 'mq%s' % i).shutdown()
        finally:
            for path in self.dirs:
                shutil.rmtree(path)

    def _read_all(self, mq):
        """Collect the results of ``mq.get()`` until it returns ``None``."""
        received = []
        current = mq.get()
        while current is not None:
            received.append(current)
            current = mq.get()
        return received

    def testMQ(self):
        """Everything put on a queue comes back; the client round-trips too."""
        mq = self.mq0
        data = [str(random.randint(10000, 50000)) for _ in range(20)]

        mq.put(data)
        gets = self._read_all(mq)
        self.assertEqual(sorted(data), sorted(gets))

        # test mq client
        data = str(random.randint(10000, 50000))
        self.client.put(data)
        get = self.client.get()
        self.assertEqual(data, get)

    def testAddOrRemoveNode(self):
        """Data stays retrievable from node 0 after node 2 is removed."""
        mq = self.mq0
        data = [str(value) for value in range(100)]

        mq.put(data)
        self.mq2.shutdown()

        # The test's own node list must not be changed by remove_node.
        self.assertEqual(len(self.nodes), 3)
        self.mq0.remove_node(self.nodes[2])
        self.assertEqual(len(self.nodes), 3)
        self.mq1.remove_node(self.nodes[2])

        gets = self._read_all(mq)
        self.assertEqual(sorted(data), sorted(gets))
class JobLoader(object):
    """Coordinates a job across worker nodes.

    Tracks which nodes have registered, hands out per-minute work quotas,
    counts completed items against an optional size limit and relays
    start/stop/add/remove requests to the nodes.
    """

    def __init__(self, job, nodes, rpc_server, context=None, copies=2):
        """Initialise the loader state and publish its control methods.

        :param job: job object; ``job.context`` is used when ``context`` is
            falsy.
        :param nodes: list of node addresses.
        :param rpc_server: object whose ``register_function`` receives the
            control methods of this loader.
        :param context: optional context taking precedence over
            ``job.context``.
        :param copies: passed through to ``MessageQueueClient``.
        """
        self.job = job
        self.ctx = context or job.context

        self.nodes = nodes
        self.mq_client = MessageQueueClient(self.nodes, copies=copies)

        self.not_registered = self.nodes[:]
        self.is_ready = False
        self.stopped = False

        # destination size: a positive value switches the size limit on
        self.size = self.ctx.job.size
        self.limit_size = self.size > 0
        self.finishes = 0

        # speed limits: a positive value switches the speed limit on
        self.limits = self.ctx.job.limits
        self.limit_speed = self.limits > 0
        self.in_minute = 0

        # register rpc server
        rpc_server.register_function(self.ready, 'ready')
        rpc_server.register_function(self.complete, 'complete')
        rpc_server.register_function(self.get_nodes, 'get_nodes')
        rpc_server.register_function(self.require, 'require')
        rpc_server.register_function(self.stop, 'stop')
        rpc_server.register_function(self.add_node, 'add_node')
        rpc_server.register_function(self.remove_node, 'remove_node')

        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def ready(self, node):
        """Mark ``node`` as registered; flag readiness once none remain."""
        if node in self.not_registered:
            self.not_registered.remove(node)

        if not self.not_registered:
            self.is_ready = True

    def get_nodes(self):
        """Return the list of worker nodes."""
        return self.nodes

    def require(self, count):
        """Grant a worker permission to process up to ``count`` items.

        :param count: number of items the worker asks for.
        :returns: the granted number. With a speed limit this is at most
            what is left of ``self.limits`` for the current minute; without
            one it is ``count``, or 0 once the job is stopped.
        """
        if self.limit_speed:
            # The budget is ``self.limits`` (``limit_size`` is only the
            # boolean size-limit switch), and a grant can never exceed
            # what is left of it.
            remaining = self.limits - self.in_minute
            if remaining <= 0:
                return 0
            granted = min(count, remaining)
            self.in_minute += granted
            return granted
        return 0 if self.stopped else count

    def complete(self, obj):
        """Record one finished item.

        :param obj: the finished item (not inspected here).
        :returns: ``True`` when the job is over. With a size limit that is
            once ``self.size`` items were counted (this also sets
            ``stopped``); otherwise it is the current ``stopped`` flag.
        """
        if self.limit_size:
            self.finishes += 1
            completed = self.finishes >= self.size
            if completed:
                self.stopped = True
            return completed
        return self.stopped

    def _in_minute_clear(self):
        """Reset the per-minute counter every 60 seconds in a daemon thread."""
        def _clear():
            # A loop instead of self-recursion: one extra stack frame per
            # minute would eventually exceed the recursion limit.
            while True:
                self.in_minute = 0
                time.sleep(60)
                if self.stopped:
                    break

        thd = threading.Thread(target=_clear)
        thd.setDaemon(True)
        thd.start()

    def signal_handler(self, signum, frame):
        """Signal handler: stop the job."""
        self.stop()

    def stop(self):
        """Ask every node to stop (socket errors ignored) and flag stopped."""
        for node in self.nodes:
            try:
                client_call(node, 'stop')
            except socket.error:
                pass
        self.stopped = True

    def run(self):
        """Wait for the workers, seed the queue, start the nodes and block
        until the job is stopped."""
        # wait until all the workers initialized; sleeping between polls
        # avoids spinning at full CPU while the RPC threads do their work
        while not self.is_ready:
            time.sleep(TIME_SLEEP)

        if self.limit_speed:
            self._in_minute_clear()

        self.mq_client.put(self.job.starts)
        for node in self.nodes:
            client_call(node, 'run')

        def _run():
            while not self.stopped:
                time.sleep(TIME_SLEEP)

        main_thread = threading.Thread(target=_run)
        main_thread.start()
        main_thread.join()

    def add_node(self, node):
        """Announce ``node`` to every known node, record it and start it.

        :param node: address of the node to add.
        """
        # A separate loop name keeps the ``node`` argument intact; reusing
        # it made every node receive its own address instead of the new one.
        for member in self.nodes:
            client_call(member, 'add_node', node)
        self.nodes.append(node)
        client_call(node, 'run')

    def remove_node(self, node):
        """Tell every known node to drop ``node`` and forget it locally.

        :param node: address of the node to remove.
        """
        for member in self.nodes:
            client_call(member, 'remove_node', node)
        if node in self.nodes:
            self.nodes.remove(node)