def setUp(self):
    # Build a job plus master/worker loaders rooted in a throw-away temp dir.
    self.job = Job('test job', UrlPatterns(), BuiltinOpener, [])
    self.root = tempfile.mkdtemp()
    master_root = os.path.join(self.root, 'master')
    worker_root = os.path.join(self.root, 'worker')
    os.makedirs(master_root)
    os.makedirs(worker_root)
    # Node and master addresses are `ip:port` strings built from this host's IP.
    node = '%s:%s' % (get_ip(), self.job.context.job.port)
    nodes = [node]
    master = '%s:%s' % (get_ip(), self.job.context.job.master_port)
    self.master_loader = MasterJobLoader(self.job, master_root, nodes)
    self.worker_loader = WorkerJobLoader(self.job, worker_root, master)
def add_arguments(self, parser):
    """Register the `master` sub-command and its options on *parser*.

    *parser* is an argparse sub-parsers object; self.run is installed
    as the handler via set_defaults.
    """
    ip = get_ip()
    self.master_parser = parser.add_parser("master", help="master commands")
    self.master_parser.add_argument("-w", "--working", metavar="working dir",
                                    nargs="?", help="master working dir")
    # const=ip: a bare flag (no value) defaults to this host's IP.
    self.master_parser.add_argument(
        "-s", "--start", metavar="master address", nargs="?", const=ip,
        help="master address(in the former of `ip:port` or `ip`)",
    )
    self.master_parser.add_argument(
        "-k", "--kill", metavar="master address", nargs="?", const=ip,
        help="master to kill(in the former of `ip:port` or `ip`)",
    )
    self.master_parser.add_argument(
        "-l", "--list", metavar="master address", nargs="?", const=ip,
        help="list workers(in the former of `ip:port` or `ip`)",
    )
    self.master_parser.set_defaults(func=self.run)
def __init__(self, port, nodes_ip_addresses, worker_port, popen=None):
    """Record the master address, worker node addresses, and process handle.

    port -- RPC port of the job master on this host.
    nodes_ip_addresses -- iterable of worker IPs.
    worker_port -- port every worker listens on.
    popen -- optional subprocess handle for the started process.
    """
    self.job_master = '%s:%s' % (get_ip(), port)
    self.nodes = [
        '%s:%s' % (node_ip, worker_port) for node_ip in nodes_ip_addresses
    ]
    self.worker_port = worker_port
    # Bug fix: the original wrote `self.popen = None`, silently discarding
    # the `popen` argument passed by callers.
    self.popen = popen
def __init__(self, master, root, zip_dir, job_dir, data_path=None, force=False):
    # Worker watcher: owns an RPC server exposing job control plus a
    # file receiver for uploaded job zips.
    self.master = master
    self.host = get_ip()
    self.port = main_conf.worker.port
    self.node = '%s:%s' % (self.host, self.port)  # this worker's address
    self.root = root
    self.zip_dir = zip_dir    # where uploaded job zips land
    self.job_dir = job_dir    # where jobs are unpacked/run from
    self.data_path = data_path
    self.force = force
    self.stopped = False
    self.running_jobs = {}
    self.check(force=force)   # may refuse to start unless forced
    self.init_rpc_server()
    self.rpc_server.register_function(self.stop, 'stop')
    self.rpc_server.register_function(self.kill, 'kill')
    self.rpc_server.register_function(self.start_job, 'start_job')
    self.rpc_server.register_function(self.clear_job, 'clear_job')
    self.set_file_receiver(self.zip_dir)
def __init__(self, root, zip_dir, job_dir, ip_address=None, data_path=None, force=False):
    # Master watcher: tracks worker heartbeats and running jobs over RPC.
    self.root = root
    self.zip_dir = zip_dir
    self.job_dir = job_dir
    self.data_path = data_path
    self.force = force
    self.nodes_watchers = {}
    self.running_jobs = {}
    self.black_list = []
    if ip_address is None:
        ip_address = get_ip()
    else:
        # An explicitly supplied IP must belong to this machine.
        choices_ips = get_ips()
        if ip_address not in choices_ips:
            raise ValueError("IP address must be one of (%s)" % ",".join(choices_ips))
    self.ip_address = ip_address
    self.port = main_conf.master.port
    self.stopped = False
    self.check(force=force)   # may refuse to start unless forced
    self.init_rpc_server()
    self.rpc_server.register_function(self.register_watcher_heartbeat, "register_heartbeat")
    self.rpc_server.register_function(self.stop, "stop")
    self.rpc_server.register_function(self.list_jobs, "list_jobs")
    self.rpc_server.register_function(self.start_job, "start_job")
    self.rpc_server.register_function(self.stop_job, "stop_job")
    self.rpc_server.register_function(self.finish_job, "finish_job")
    self.rpc_server.register_function(self.clear_job, "clear_job")
    self.rpc_server.register_function(self.list_job_dirs, "list_job_dirs")
    self.rpc_server.register_function(self.list_workers, "list_workers")
    self.set_receiver(zip_dir)
def __init__(self, port, nodes_ip_addresses, worker_port, popen=None):
    """Record the master address, worker node addresses, and process handle.

    port -- RPC port of the job master on this host.
    nodes_ip_addresses -- iterable of worker IPs.
    worker_port -- port every worker listens on.
    popen -- optional subprocess handle for the started process.
    """
    self.job_master = '%s:%s' % (get_ip(), port)
    self.nodes = [
        '%s:%s' % (node_ip, worker_port) for node_ip in nodes_ip_addresses
    ]
    self.worker_port = worker_port
    # Bug fix: the original wrote `self.popen = None`, silently discarding
    # the `popen` argument passed by callers.
    self.popen = popen
def load_job(path, nodes, context=None):
    """Load a job definition and run it as the master-side loader.

    A lock file guards against two concurrent masters for the same job;
    the master watcher is notified when the job finishes. Raises
    ValueError if *path* does not exist, JobMasterRunning if another
    master already holds the lock.
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')
    job = import_job(path)
    job_name = job.name.replace(' ', '_')
    if job.debug:
        job_name += '_debug'
    holder = os.path.join(root_dir(), 'data', 'master', 'jobs', job_name)
    if not os.path.exists(holder):
        os.makedirs(holder)
    lock_f = os.path.join(holder, 'lock')
    if os.path.exists(lock_f):
        raise JobMasterRunning('There has been a running job master')
    open(lock_f, 'w').close()  # create (touch) the lock file
    rpc_server = create_rpc_server(job)
    try:
        loader = JobLoader(job, nodes, rpc_server, context=context)
        loader.run()
        # notify master watcher of completion
        master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
        client_call(master_watcher, 'finish_job', job.real_name)
    finally:
        # always release the lock and stop serving, even on error
        os.remove(lock_f)
        rpc_server.shutdown()
def start_master(): path = os.path.join(root_dir(), 'cola', 'master', 'watcher.py') print 'Start master at %s:%s' % (get_ip(), main_conf.master.port) print 'Master will run in background.' subprocess.Popen(['python', path])
def create_rpc_server(job, context=None):
    """Start a ColaRPCServer on this host at the job's master port.

    Serves forever on a daemon thread and returns the server instance.
    """
    job_ctx = context if context is not None else job.context
    server = ColaRPCServer((get_ip(), job_ctx.job.master_port))
    serving = threading.Thread(target=server.serve_forever)
    serving.setDaemon(True)
    serving.start()
    return server
def add_arguments(self, parser):
    """Register the `worker` sub-command and its options on *parser*.

    *parser* is an argparse sub-parsers object; self.run handles the command.
    """
    ip = get_ip()
    self.worker_parser = parser.add_parser('worker', help='worker commands')
    self.worker_parser.add_argument(
        '-m', '--master', metavar='master address', nargs='?', default=ip,
        help='master connected to(in the former of `ip:port` or `ip`)')
    self.worker_parser.add_argument(
        '-s', '--start', metavar='worker address', nargs='?', const=ip,
        # fixed: the help text was missing its closing parenthesis
        help='local worker connected to(in the former of `ip:port` or `ip`)')
    self.worker_parser.add_argument('-w', '--working', metavar='working dir',
                                    nargs='?', help='worker working dir')
    self.worker_parser.set_defaults(func=self.run)
def __init__(self, root, zip_dir, job_dir, data_path=None, force=False):
    # Master watcher: tracks worker heartbeats and running jobs over RPC.
    # Always binds to this host's detected IP (no override parameter).
    self.root = root
    self.zip_dir = zip_dir
    self.job_dir = job_dir
    self.data_path = data_path
    self.force = force
    self.nodes_watchers = {}
    self.running_jobs = {}
    self.black_list = []
    self.ip_address = get_ip()
    self.port = main_conf.master.port
    self.stopped = False
    self.check(force=force)   # may refuse to start unless forced
    self.init_rpc_server()
    self.rpc_server.register_function(self.register_watcher_heartbeat, 'register_heartbeat')
    self.rpc_server.register_function(self.stop, 'stop')
    self.rpc_server.register_function(self.list_jobs, 'list_jobs')
    self.rpc_server.register_function(self.start_job, 'start_job')
    self.rpc_server.register_function(self.stop_job, 'stop_job')
    self.rpc_server.register_function(self.finish_job, 'finish_job')
    self.rpc_server.register_function(self.clear_job, 'clear_job')
    self.rpc_server.register_function(self.list_job_dirs, 'list_job_dirs')
    self.rpc_server.register_function(self.list_workers, 'list_workers')
    self.set_receiver(zip_dir)
def create_rpc_server(job, context=None):
    """Start a ColaRPCServer on this host at the job's worker port.

    Serves forever on a daemon thread and returns the server instance.
    """
    job_ctx = context if context is not None else job.context
    server = ColaRPCServer((get_ip(), job_ctx.job.port))
    serving = threading.Thread(target=server.serve_forever)
    serving.setDaemon(True)
    serving.start()
    return server
def add_arguments(self, parser):
    """Register the `master` sub-command (-s/-k/-l) on *parser*."""
    ip = get_ip()
    self.master_parser = parser.add_parser('master', help='master commands')
    # const=ip: a bare flag (no value) defaults to this host's IP.
    self.master_parser.add_argument(
        '-s', '--start', metavar='start master', nargs='?', const=ip,
        help='master address(in the former of `ip:port` or `ip`)')
    self.master_parser.add_argument(
        '-k', '--kill', metavar='kill master', nargs='?', const=ip,
        help='master to kill(in the former of `ip:port` or `ip`)')
    self.master_parser.add_argument(
        '-l', '--list', metavar='list workers', nargs='?', const=ip,
        help='list workers(in the former of `ip:port` or `ip`)')
    self.master_parser.set_defaults(func=self.run)
def start_log_server():
    """Start the module-level log-record receiver; no-op if already running.

    NOTE(review): the serving thread is not daemonized, so it can keep
    the process alive until the receiver is shut down — confirm intended.
    """
    global log_server
    global log_server_port
    if log_server is not None:
        return
    log_server = LogRecordSocketReceiver(logger=logger, host=get_ip(),
                                         port=log_server_port)
    threading.Thread(target=log_server.serve_forever).start()
def add_arguments(self, parser):
    """Register the `worker` sub-command and its options on *parser*.

    *parser* is an argparse sub-parsers object; self.run handles the command.
    """
    ip = get_ip()
    self.worker_parser = parser.add_parser('worker', help='worker commands')
    self.worker_parser.add_argument(
        '-m', '--master', metavar='master address', nargs='?', default=ip,
        help='master connected to(in the former of `ip:port` or `ip`)')
    self.worker_parser.add_argument(
        '-s', '--start', metavar='worker address', nargs='?', const=ip,
        # fixed: the help text was missing its closing parenthesis
        help='local worker connected to(in the former of `ip:port` or `ip`)')
    self.worker_parser.set_defaults(func=self.run)
def __init__(self, job, data_dir, nodes, client=None, context=None, copies=1, force=False):
    # Master-side job loader: coordinates worker nodes over RPC.
    ctx = context or job.context
    master_port = ctx.job.master_port
    local = '%s:%s' % (get_ip(), master_port)  # this master's address
    JobLoader.__init__(self, job, data_dir, local, context=ctx,
                       copies=copies, force=force)
    LimitionJobLoader.__init__(self, job, context=ctx)
    # check
    self.check()
    self.nodes = nodes
    self.not_registered = self.nodes[:]   # workers yet to call `ready`
    self.not_finished = self.nodes[:]     # workers yet to call `worker_finish`
    # mq
    self.mq_client = MessageQueueClient(self.nodes, copies=copies)
    # locks start acquired; released elsewhere once all nodes register / finish
    self.ready_lock = threading.Lock()
    self.ready_lock.acquire()
    self.finish_lock = threading.Lock()
    self.finish_lock.acquire()
    # logger
    self.logger = get_logger(
        name='cola_master_%s'%self.job.real_name,
        filename=os.path.join(self.root, 'job.log'),
        is_master=True)
    self.client = client
    self.client_handler = None
    if self.client is not None:
        # stream log records back to the controlling client
        self.client_handler = add_log_client(self.logger, self.client)
    self.init_rpc_server()
    self.init_rate_clear()
    self.init_logger_server(self.logger)
    # register rpc server
    self.rpc_server.register_function(self.client_stop, 'client_stop')
    self.rpc_server.register_function(self.ready, 'ready')
    self.rpc_server.register_function(self.worker_finish, 'worker_finish')
    self.rpc_server.register_function(self.complete, 'complete')
    self.rpc_server.register_function(self.error, 'error')
    self.rpc_server.register_function(self.get_nodes, 'get_nodes')
    self.rpc_server.register_function(self.apply, 'apply')
    self.rpc_server.register_function(self.require, 'require')
    self.rpc_server.register_function(self.stop, 'stop')
    self.rpc_server.register_function(self.add_node, 'add_node')
    self.rpc_server.register_function(self.remove_node, 'remove_node')
    # register signal
    signal.signal(signal.SIGINT, self.signal_handler)
    signal.signal(signal.SIGTERM, self.signal_handler)
def __init__(self, job, data_dir, context=None, logger=None, local=None, nodes=None, copies=1, force=False):
    # Worker-side base loader: runs job instances and serves control RPCs.
    self.job = job
    ctx = context or self.job.context
    self.local = local
    if self.local is None:
        host, port = get_ip(), ctx.job.port
        self.local = '%s:%s' % (host, port)
    else:
        host, port = tuple(self.local.split(':', 1))
    self.nodes = nodes
    if self.nodes is None:
        self.nodes = [self.local]  # standalone: this node only
    self.logger = logger
    self.info_logger = get_logger(name='cola_worker_info_%s' % self.job.real_name)
    super(BasicWorkerJobLoader, self).__init__(self.job, data_dir, self.local,
                                               context=ctx, copies=copies, force=force)
    # instances count that run at the same time (clamped to [1, MAX_THREADS_SIZE])
    self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
    # currently-executing units
    self.executings = []
    # count of exceptions thrown continuously
    self.error_times = 0
    # budget
    self.budget = 0
    self.check()
    # init rpc server
    self.init_rpc_server()
    # init message queue
    self.init_mq()
    # register signal
    signal.signal(signal.SIGINT, self.signal_handler)
    signal.signal(signal.SIGTERM, self.signal_handler)
    self.rpc_server.register_function(self.stop, name='stop')
    self.rpc_server.register_function(self.add_node, name='add_node')
    self.rpc_server.register_function(self.remove_node, name='remove_node')
    self.rpc_server.register_function(self.run, name='run')
def __init__(self, job, data_dir, context=None, logger=None, local=None, nodes=None, copies=1, force=False):
    # Worker-side loader variant with a page counter and a stop lock.
    self.job = job
    ctx = context or self.job.context
    self.local = local
    if self.local is None:
        host, port = get_ip(), ctx.job.port
        self.local = '%s:%s' % (host, port)
    else:
        host, port = tuple(self.local.split(':', 1))
    self.nodes = nodes
    if self.nodes is None:
        self.nodes = [self.local]  # standalone: this node only
    self.logger = logger
    self.info_logger = get_logger(
        name='cola_worker_info_%s'%self.job.real_name)
    super(BasicWorkerJobLoader, self).__init__(
        self.job, data_dir, self.local,
        context=ctx, copies=copies, force=force)
    # instances count that run at the same time (clamped to [1, MAX_THREADS_SIZE])
    self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
    # currently-executing units
    self.executings = []
    # count of exceptions thrown continuously
    self.error_times = 0
    # budget
    self.budget = 0
    # counter of processed pages (exposed via the `pages` RPC)
    self.pages_size = 0
    # lock held while not stopped; released on stop
    self.stop_lock = threading.Lock()
    self.stop_lock.acquire()
    self.check()
    # init rpc server
    self.init_rpc_server()
    # init message queue
    self.init_mq()
    # register signal
    signal.signal(signal.SIGINT, self.signal_handler)
    signal.signal(signal.SIGTERM, self.signal_handler)
    self.rpc_server.register_function(self.stop, name='stop')
    self.rpc_server.register_function(self.add_node, name='add_node')
    self.rpc_server.register_function(self.remove_node, name='remove_node')
    self.rpc_server.register_function(self.run, name='run')
    self.rpc_server.register_function(self.pages, name='pages')
def add_arguments(self, parser):
    """Register the `master` sub-command (-s/-k/-l) on *parser*."""
    ip = get_ip()
    self.master_parser = parser.add_parser('master', help='master commands')
    # const=ip: a bare flag (no value) defaults to this host's IP.
    self.master_parser.add_argument('-s', '--start', metavar='start master',
                                    nargs='?', const=ip,
                                    help='master address(in the former of `ip:port` or `ip`)')
    self.master_parser.add_argument('-k', '--kill', metavar='kill master',
                                    nargs='?', const=ip,
                                    help='master to kill(in the former of `ip:port` or `ip`)')
    self.master_parser.add_argument('-l', '--list', metavar='list workers',
                                    nargs='?', const=ip,
                                    help='list workers(in the former of `ip:port` or `ip`)')
    self.master_parser.set_defaults(func=self.run)
def run(self):
    """Wait for all workers to register, seed the queue, then wait for finish.

    ready_lock is released elsewhere once every node has registered;
    finish_lock is released once every node reports completion.
    """
    self.ready_lock.acquire()
    if not self.stopped and len(self.not_registered) == 0:
        # seed the distributed message queue with the job's start URLs
        self.mq_client.put(self.job.starts)
        for node in self.nodes:
            client_call(node, 'run')
    self.finish_lock.acquire()
    master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
    # best-effort notification; network errors are ignored
    client_call(master_watcher, 'finish_job', self.job.real_name,
                ignore=True)
def __init__(self, local_mode=False, is_master=False, master_addr=None,
             is_client=False, working_dir=None, mkdirs=False, ip=None, ips=None):
    # Execution context shared by master/worker/client processes.
    self.is_local_mode = local_mode
    self.is_master = is_master
    self.is_client = is_client
    self.master_addr = master_addr
    self.master_ip = self.master_addr
    if not self.is_local_mode:
        # Distributed mode requires a master address; default the port
        # when only an IP was given.
        if self.master_addr is None:
            raise ValueError('Master address must be supplied when local_mode is False')
        if ':' not in self.master_addr:
            self.master_addr = '%s:%s' % (self.master_addr, main_conf.master.port)
        else:
            self.master_ip = self.master_addr.split(':', 1)[0]
    self.working_dir = working_dir
    if self.working_dir is None:
        tmp = tempfile.gettempdir()
        self.working_dir = os.path.join(tmp, 'cola')
    if mkdirs and not os.path.exists(self.working_dir):
        os.makedirs(self.working_dir)
    self.ip = ip
    if self.ip is None:
        if self.is_master:
            self.ip = self.master_ip
        else:
            self.ip = get_ip()
        if self.is_local_mode and not self.ip:
            self.ip = '127.0.0.1'  # offline fallback in local mode
    if self.master_addr is None:
        self.master_addr = '%s:%s' % (self.ip, main_conf.master.port)
    self.worker_addr = '%s:%s' % (self.ip, main_conf.worker.port)
    self.ips = ips if ips is not None else []
    if not self.ips:
        self.ips.append(self.ip)
    self.addrs = [self.fix_addr(_ip) for _ip in self.ips]
    # Cross-process shared environment via a manager dict.
    self.manager = ContextManager()
    self.manager.start(manager_init)
    self.env = self.manager.dict({'ip': self.ip,
                                  'root': self.working_dir,
                                  'is_local': self.is_local_mode,
                                  'master_ip': self.master_ip,
                                  'job_desc': {}})
    self.logger = get_logger('cola_context')
    self.master_rpc_server = None
    self.worker_rpc_server = None
def start_worker(master, data_path=None, force=False):
    """Launch the worker watcher script as a background process.

    master -- `ip:port` of the master to connect to (passed as -m).
    data_path -- optional data directory (passed as -d).
    force -- when True, pass -f to force a start.
    """
    path = os.path.join(root_dir(), 'cola', 'worker', 'watcher.py')
    print 'Start worker at %s:%s' % (get_ip(), main_conf.worker.port)
    print 'Worker will run in background. Please do not shut down the terminal.'
    cmds = ['python', path, '-m', master]
    if data_path is not None:
        cmds.extend(['-d', data_path])
    if force is True:
        cmds.append('-f')
    subprocess.Popen(cmds)
def start_master(data_path=None, force=False):
    """Launch the master watcher script as a background process.

    data_path -- optional data directory (passed as -d).
    force -- when True, pass -f to force a start.
    """
    path = os.path.join(root_dir(), "cola", "master", "watcher.py")
    print "Start master at %s:%s" % (get_ip(), main_conf.master.port)
    print "Master will run in background. Please do not shut down the terminal."
    cmds = ["python", path]
    if data_path is not None:
        cmds.extend(["-d", data_path])
    if force is True:
        cmds.append("-f")
    subprocess.Popen(cmds)
def __init__(self, rpc_server, master, zip_dir, job_dir):
    # Worker watcher bound to an externally-created RPC server.
    self.rpc_server = rpc_server
    self.master = master
    self.node = '%s:%s' % (get_ip(), main_conf.worker.port)  # this worker's address
    self.zip_dir = zip_dir    # where uploaded job zips land
    self.job_dir = job_dir    # where jobs are unpacked/run from
    self.stopped = False
    self.rpc_server.register_function(self.stop, 'stop')
    self.rpc_server.register_function(self.start_job, 'start_job')
    self.rpc_server.register_function(self.clear_job, 'clear_job')
    self.set_file_receiver(self.zip_dir)
def load_job(path, master=None):
    """Load a job and run it as a worker, standalone or attached to *master*."""
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')
    job = import_job(path)
    holder = os.path.join(root_dir(), 'data', 'worker', 'jobs', job.real_name)
    mq_holder = os.path.join(holder, 'mq')
    if not os.path.exists(mq_holder):
        os.makedirs(mq_holder)
    # Logger
    logger = get_logger(os.path.join(holder, 'job.log'))
    local_node = '%s:%s' % (get_ip(), job.context.job.port)
    nodes = [local_node]
    if master is not None:
        nodes = client_call(master, 'get_nodes')
    # Bloom filter hook (de-duplicates already-seen items)
    bloom_filter_file = os.path.join(holder, 'bloomfilter')
    bloom_filter_hook = create_bloom_filter_hook(bloom_filter_file, job)
    rpc_server = create_rpc_server(job)
    loader = JobLoader(job, rpc_server, logger=logger, master=master)
    # copies=2 keeps a backup copy of each message when running distributed
    loader.init_mq(nodes, local_node, mq_holder,
                   verify_exists_hook=bloom_filter_hook,
                   copies=2 if master else 1)
    if master is None:
        try:
            loader.mq.put(job.starts)
            loader.run()
        finally:
            rpc_server.shutdown()
    else:
        try:
            client_call(master, 'ready', local_node)
            def _start():
                # NOTE(review): reconstructed from collapsed source —
                # confirm whether loader.run() belongs inside the loop.
                while not loader.stopped:
                    time.sleep(TIME_SLEEP)
                loader.run()
            thread = threading.Thread(target=_start)
            thread.start()
            thread.join()
        finally:
            rpc_server.shutdown()
def __init__(self, container_id, working_dir, job_path, job_name, env, mq,
             counter_server, budget_server, speed_server, stopped, nonsuspend,
             idle_statuses, n_tasks=1, is_local=False, master_ip=None,
             logger=None, task_start_id=0):
    # Task container: owns n_tasks task threads and their per-task clients.
    self.container_id = container_id
    self.working_dir = working_dir
    self.mq = mq
    self.env = env
    self.job_name = job_name
    # Reuse the job description cached in env when present, else import it.
    self.job_desc = env['job_desc'].get(job_name) or \
        import_job_desc(job_path)
    self.counter_server = counter_server
    self.budget_server = budget_server
    self.speed_server = speed_server
    self.stopped = stopped
    self.nonsuspend = nonsuspend
    self.idle_statuses = idle_statuses
    self.n_tasks = n_tasks
    self.is_local = is_local
    self.master_ip = master_ip
    self.logger = logger
    self.task_start_id = task_start_id
    self.ip = self.env.get('ip', None) or get_ip()
    # One client slot per task; filled later during init.
    self.counter_clients = [None for _ in range(self.n_tasks)]
    self.budget_clients = [None for _ in range(self.n_tasks)]
    self.speed_clients = [None for _ in range(self.n_tasks)]
    self.task_threads = []
    self.inited = False
    self.lock = multiprocessing.Lock()
def load_job(path, master=None):
    """Load a job and run it as a worker, standalone or attached to *master*."""
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')
    job = import_job(path)
    holder = os.path.join(
        root_dir(), 'data', 'worker', 'jobs', job.real_name)
    mq_holder = os.path.join(holder, 'mq')
    if not os.path.exists(mq_holder):
        os.makedirs(mq_holder)
    # Logger
    logger = get_logger(os.path.join(holder, 'job.log'))
    local_node = '%s:%s' % (get_ip(), job.context.job.port)
    nodes = [local_node]
    if master is not None:
        nodes = client_call(master, 'get_nodes')
    # Bloom filter hook (de-duplicates already-seen items)
    bloom_filter_file = os.path.join(holder, 'bloomfilter')
    bloom_filter_hook = create_bloom_filter_hook(bloom_filter_file, job)
    rpc_server = create_rpc_server(job)
    loader = JobLoader(job, rpc_server, logger=logger, master=master)
    # copies=2 keeps a backup copy of each message when running distributed
    loader.init_mq(nodes, local_node, mq_holder,
                   verify_exists_hook=bloom_filter_hook,
                   copies=2 if master else 1)
    if master is None:
        try:
            loader.mq.put(job.starts)
            loader.run()
        finally:
            rpc_server.shutdown()
    else:
        try:
            client_call(master, 'ready', local_node)
            def _start():
                # NOTE(review): reconstructed from collapsed source —
                # confirm whether loader.run() belongs inside the loop.
                while not loader.stopped:
                    time.sleep(TIME_SLEEP)
                loader.run()
            thread = threading.Thread(target=_start)
            thread.start()
            thread.join()
        finally:
            rpc_server.shutdown()
def start_master(ip=None, data_path=None, force=False):
    """Launch the master watcher script as a background process.

    ip -- optional bind IP (passed as -i); defaults to the detected host IP.
    data_path -- optional data directory (passed as -d).
    force -- when True, pass -f to force a start.
    """
    path = os.path.join(root_dir(), 'cola', 'master', 'watcher.py')
    ip_str = ip if ip is not None else get_ip()
    print 'Start master at %s:%s' % (ip_str, main_conf.master.port)
    print 'Master will run in background. Please do not shut down the terminal.'
    cmds = ['python', path]
    if ip is not None:
        cmds.extend(['-i', ip])
    if data_path is not None:
        cmds.extend(['-d', data_path])
    if force is True:
        cmds.append('-f')
    subprocess.Popen(cmds)
def add_arguments(self, parser):
    """Register the `job` sub-command and its options on *parser*."""
    ip = get_ip()
    self.job_parser = parser.add_parser('job', help='job commands')
    self.job_parser.add_argument(
        '-m', '--master', metavar='master address', nargs='?', default=ip,
        help='master connected to(in the former of `ip:port` or `ip`)')
    self.job_parser.add_argument(
        '-l', '--list', action='store_true',
        help='list all jobs including <id> <name> and <status>')
    self.job_parser.add_argument('-k', '--kill', metavar='job name', nargs='?',
                                 help='kill job by job name')
    self.job_parser.add_argument(
        '-u', '--upload', metavar='job directory', nargs='?',
        help='upload a job directory to the cluster')
    # const='U': bare `-r` means "run the job just uploaded with -u"
    self.job_parser.add_argument(
        '-r', '--run', metavar='job name', nargs='?', const='U',
        help='run a job by the job id or with the `upload` command')
    self.job_parser.add_argument(
        '-t', '--status', metavar='job name', nargs='?',
        help='show the status of a job, and the counters if it\'s running')
    self.job_parser.add_argument(
        '-p', '--package', metavar='job_name', nargs='?',
        help=
        'package the running info of a job including log and errors infos')
    self.job_parser.set_defaults(func=self.run)
def start_rpc_server():
    """Start (once) the client-side RPC server on a daemon thread.

    Returns the serving thread; repeated calls return the existing one.
    """
    global rpc_server
    global rpc_server_thread
    if rpc_server is not None and \
            rpc_server_thread is not None:
        return rpc_server_thread
    rpc_server = ColaRPCServer((get_ip(), main_conf.client.port))
    rpc_server.register_function(stop)
    thd = threading.Thread(target=rpc_server.serve_forever)
    thd.setDaemon(True)
    thd.start()
    rpc_server_thread = thd
    return rpc_server_thread
def add_arguments(self, parser):
    """Register the `job` sub-command and its options on *parser*."""
    ip = get_ip()
    self.job_parser = parser.add_parser('job', help='job commands')
    self.job_parser.add_argument('-m', '--master', metavar='master address',
                                 nargs='?', default=ip,
                                 help='master connected to(in the former of `ip:port` or `ip`)')
    self.job_parser.add_argument('-l', '--list', action='store_true',
                                 help='list all jobs including <id> <name> and <status>')
    self.job_parser.add_argument('-k', '--kill', metavar='kill some job', nargs='?',
                                 help='kill job by job name')
    self.job_parser.add_argument('-u', '--upload', metavar='upload a job', nargs='?',
                                 help='upload a job directory to the cluster')
    # const='U': bare `-r` means "run the job just uploaded with -u"
    self.job_parser.add_argument('-r', '--run', metavar='run a job', nargs='?',
                                 const='U',
                                 help='run a job by the job id or with the `upload` command')
    self.job_parser.add_argument('-t', '--status', metavar='get the status of a job',
                                 nargs='?',
                                 help='show the status of a job, and the counters if it\'s running')
    self.job_parser.add_argument('-p', '--package', metavar='package a job', nargs='?',
                                 help='package the running info of a job including log and errors infos')
    self.job_parser.set_defaults(func=self.run)
def put_starts(master=None):
    """Feed keywords from keywords_f into the message queue in PUTSIZE batches.

    master -- optional `ip:port`; when None, the local node (from
    user_config) is the only queue node.
    """
    if master is None:
        nodes = ['%s:%s' % (get_ip(), getattr(user_config.job, 'port'))]
    else:
        nodes = client_call(master, 'get_nodes')
    mq_client = MessageQueueClient(nodes)
    with open(keywords_f) as f:
        keys = []
        size = 0
        for keyword in f.xreadlines():
            keys.append(keyword)
            size += 1
            if size >= PUTSIZE:
                # flush a full batch
                mq_client.put(keys)
                size = 0
                keys = []
        if len(keys) > 0:
            # flush the trailing partial batch
            mq_client.put(keys)
def __init__(self, root, zip_dir, job_dir, ip_address=None, data_path=None, force=False):
    # Master watcher: tracks worker heartbeats and running jobs over RPC.
    self.root = root
    self.zip_dir = zip_dir
    self.job_dir = job_dir
    self.data_path = data_path
    self.force = force
    self.nodes_watchers = {}
    self.running_jobs = {}
    self.black_list = []
    if ip_address is None:
        ip_address = get_ip()
    else:
        # An explicitly supplied IP must belong to this machine.
        choices_ips = get_ips()
        if ip_address not in choices_ips:
            raise ValueError('IP address must be one of (%s)' % ','.join(choices_ips))
    self.ip_address = ip_address
    self.port = main_conf.master.port
    self.stopped = False
    self.check(force=force)   # may refuse to start unless forced
    self.init_rpc_server()
    self.rpc_server.register_function(self.register_watcher_heartbeat, 'register_heartbeat')
    self.rpc_server.register_function(self.stop, 'stop')
    self.rpc_server.register_function(self.list_jobs, 'list_jobs')
    self.rpc_server.register_function(self.start_job, 'start_job')
    self.rpc_server.register_function(self.stop_job, 'stop_job')
    self.rpc_server.register_function(self.finish_job, 'finish_job')
    self.rpc_server.register_function(self.clear_job, 'clear_job')
    self.rpc_server.register_function(self.list_job_dirs, 'list_job_dirs')
    self.rpc_server.register_function(self.list_workers, 'list_workers')
    self.set_receiver(zip_dir)
def load_job(job_path, data_path=None, master=None, force=False):
    """Run a job standalone, or as a worker attached to *master*."""
    if not os.path.exists(job_path):
        raise ValueError("Job definition does not exist.")
    job = import_job(job_path)
    if data_path is None:
        data_path = os.path.join(root_dir(), "data")
    root = os.path.join(data_path, "worker", "jobs", job.real_name)
    if not os.path.exists(root):
        os.makedirs(root)
    if master is None:
        with StandaloneWorkerJobLoader(job, root, force=force) as job_loader:
            job_loader.run()
    else:
        nodes = client_call(master, "get_nodes")
        local = "%s:%s" % (get_ip(), job.context.job.port)
        # NOTE(review): 'ready' is sent twice (here and inside the with
        # block) — confirm the master treats re-registration as idempotent.
        client_call(master, "ready", local)
        with WorkerJobLoader(job, root, master, local=local,
                             nodes=nodes, force=force) as job_loader:
            client_call(master, "ready", local)
            job_loader.ready_for_run()
def load_job(job_path, data_path=None, master=None, force=False):
    """Run a job standalone, or as a worker attached to *master*."""
    if not os.path.exists(job_path):
        raise ValueError('Job definition does not exist.')
    job = import_job(job_path)
    if data_path is None:
        data_path = os.path.join(root_dir(), 'data')
    root = os.path.join(data_path, 'worker', 'jobs', job.real_name)
    if not os.path.exists(root):
        os.makedirs(root)
    if master is None:
        with StandaloneWorkerJobLoader(job, root, force=force) as job_loader:
            job_loader.run()
    else:
        nodes = client_call(master, 'get_nodes')
        local = '%s:%s' % (get_ip(), job.context.job.port)
        # NOTE(review): 'ready' is sent twice (here and inside the with
        # block) — confirm the master treats re-registration as idempotent.
        client_call(master, 'ready', local)
        with WorkerJobLoader(job, root, master, local=local, nodes=nodes, force=force) \
                as job_loader:
            client_call(master, 'ready', local)
            job_loader.ready_for_run()
def load_job(job_path, data_path=None, master=None, force=False):
    """Run a job standalone, or as a worker attached to *master*."""
    if not os.path.exists(job_path):
        raise ValueError('Job definition does not exist.')
    job = import_job(job_path)
    if data_path is None:
        data_path = os.path.join(root_dir(), 'data')
    root = os.path.join(
        data_path, 'worker', 'jobs', job.real_name)
    if not os.path.exists(root):
        os.makedirs(root)
    if master is None:
        with StandaloneWorkerJobLoader(job, root, force=force) as job_loader:
            job_loader.run()
    else:
        nodes = client_call(master, 'get_nodes')
        local = '%s:%s' % (get_ip(), job.context.job.port)
        # NOTE(review): 'ready' is sent twice (here and inside the with
        # block) — confirm the master treats re-registration as idempotent.
        client_call(master, 'ready', local)
        with WorkerJobLoader(job, root, master, local=local, nodes=nodes, force=force) \
                as job_loader:
            client_call(master, 'ready', local)
            job_loader.ready_for_run()
def __init__(self, rpc_server, zip_dir, job_dir):
    # Master watcher bound to an externally-created RPC server.
    self.rpc_server = rpc_server
    self.zip_dir = zip_dir    # where uploaded job zips land
    self.job_dir = job_dir    # where jobs are unpacked/run from
    self.nodes_watchers = {}
    self.running_jobs = {}
    self.black_list = []
    self.ip_address = get_ip()
    self.stopped = False
    self.rpc_server.register_function(self.register_watcher_heartbeat, 'register_heartbeat')
    self.rpc_server.register_function(self.stop, 'stop')
    self.rpc_server.register_function(self.list_jobs, 'list_jobs')
    self.rpc_server.register_function(self.start_job, 'start_job')
    self.rpc_server.register_function(self.stop_job, 'stop_job')
    self.rpc_server.register_function(self.finish_job, 'finish_job')
    self.rpc_server.register_function(self.clear_job, 'clear_job')
    self.rpc_server.register_function(self.list_job_dirs, 'list_job_dirs')
    self.rpc_server.register_function(self.list_workers, 'list_workers')
    self.set_receiver(zip_dir)
def put_starts(master=None):
    """Seed the "douban movie" job's message queue with the start URLs.

    master -- `ip:port` string of the master; defaults to this host with
    the configured master port.
    """
    if master is None:
        # Bug fix: this used to build a one-element LIST, which breaks
        # client_call(master, ...) and master.split(':') below — the
        # default must be an `ip:port` string.
        master = '%s:%s' % (get_ip(), getattr(user_config.master, 'port'))
    print('master:%s' % master)
    jobs = client_call(master, 'runnable_jobs')
    # Find the app whose job name is "douban movie".
    app_name = ''
    for a, j in jobs.items():
        if j == "douban movie":
            app_name = a
            break
    if not app_name:
        raise Exception('douban movie job has not upload')
    nodes = client_call(master, 'list_workers')
    # Rewrite `localhost` worker addresses to the master's host.
    addrs = []
    default_addr = master.split(':')[0]
    for ap, s in nodes:
        a, p = ap.split(':')
        if a.lower() == 'localhost':
            addrs.append('%s:%s' % (default_addr, p))
        else:
            addrs.append(ap)
    mq_client = MessageQueueClient(addrs, app_name)
    print('get:%s' % mq_client.get())
    # Push the start URLs in PUTSIZE batches.
    urls = []
    size = 0
    for url in starts:
        urls.append(url)
        size += 1
        if size >= PUTSIZE:
            mq_client.put(urls)
            size = 0
            urls = []
    if len(urls) > 0:
        mq_client.put(urls)
def create_rpc_server():
    """Start a ColaRPCServer on this host's configured master port.

    Serves forever on a daemon thread and returns the server instance.
    """
    server = ColaRPCServer((get_ip(), main_conf.master.port))
    serving = threading.Thread(target=server.serve_forever)
    serving.setDaemon(True)
    serving.start()
    return server
def register(func):
    """Expose *func* as a CLI flag named after it and record it for dispatch.

    The flag name is derived from the function name (underscores become
    dashes); the function's docstring becomes the help text.
    """
    func_name = func.__name__
    name = '-%s' % func_name.replace('_', '-').strip('-')
    help_ = func.__doc__.strip()
    registered_func[func_name] = func
    parser.add_argument(name, nargs='*', dest=func_name,
                        default=argparse.SUPPRESS, help=help_)
    def inner(master, *args, **kwargs):
        return func(master, *args, **kwargs)
    return inner

# Module-level log-server state.
log_server = None
log_server_port = 9120
client = '%s:%s' % (get_ip(), log_server_port)

def start_log_server():
    """Start the global log-record receiver; no-op if already running."""
    global log_server
    global log_server_port
    if log_server is not None:
        return
    log_server = LogRecordSocketReceiver(logger=logger, host=get_ip(),
                                         port=log_server_port)
    threading.Thread(target=log_server.serve_forever).start()

def stop_log_server():
    # NOTE(review): this body appears truncated in the source — only the
    # early-return guard is visible; confirm shutdown logic upstream.
    global log_server
    if log_server is None:
        return
default=None, const=None, help='root directory to put data') parser.add_argument('-f', '--force', metavar='force start', nargs='?', default=False, const=True, type=bool) args = parser.parse_args() master = args.master if master is None: connect_to_localhost = raw_input("Connect to localhost? (yes or no) ") conn = connect_to_localhost.lower().strip() if conn == 'yes' or conn == 'y': master = '%s:%s' % (get_ip(), main_conf.master.port) elif conn == 'no' or conn == 'n': master = raw_input( "Please input the master(form: \"ip:port\" or \"ip\") ") if ':' not in master: master += ':%s' % main_conf.master.port else: print 'Input illegal!' else: if ':' not in master: master += ':%s' % main_conf.master.port if master is not None: start_worker(master, data_path=args.data, force=args.force)
'''
Created on 2013-6-27

@author: Chine
'''
# Script: stop a single running worker via RPC; offer local recovery
# if the worker cannot be reached.

import socket
import os

from cola.core.rpc import client_call
from cola.core.utils import get_ip
from cola.core.logs import get_logger
from cola.worker.recover import recover
from conf import user_config

logger = get_logger(name='weibosearch_stop')

if __name__ == '__main__':
    ip, port = get_ip(), getattr(user_config.job, 'port')
    logger.info('Trying to stop single running worker')
    try:
        client_call('%s:%s' % (ip, port), 'stop')
    except socket.error:
        # Worker unreachable; optionally force-recover its local state.
        stop = raw_input("Force to stop? (y or n) ").strip()
        if stop == 'y' or stop == 'yes':
            job_path = os.path.split(os.path.abspath(__file__))[0]
            recover()
        else:
            print 'ignore'
    logger.info('Successfully stopped single running worker')
def __init__(self, job, data_dir, nodes, local_ip=None, client=None,
             context=None, copies=1, force=False):
    # Master-side job loader with an optional explicit local IP,
    # validated against this machine's addresses.
    ctx = context or job.context
    master_port = ctx.job.master_port
    if local_ip is None:
        local_ip = get_ip()
    else:
        choices_ips = get_ips()
        if local_ip not in choices_ips:
            raise ValueError('IP address must be one of (%s)' % ','.join(choices_ips))
    local = '%s:%s' % (local_ip, master_port)  # this master's address
    JobLoader.__init__(self, job, data_dir, local, context=ctx,
                       copies=copies, force=force)
    LimitionJobLoader.__init__(self, job, context=ctx)
    # check
    self.check()
    self.nodes = nodes
    self.not_registered = self.nodes[:]   # workers yet to call `ready`
    self.not_finished = self.nodes[:]     # workers yet to call `worker_finish`
    # mq
    self.mq_client = MessageQueueClient(self.nodes, copies=copies)
    # locks start acquired; released as workers register / finish
    self.ready_lock = threading.Lock()
    self.ready_lock.acquire()
    self.finish_lock = threading.Lock()
    self.finish_lock.acquire()
    # logger
    self.logger = get_logger(name='cola_master_%s' % self.job.real_name,
                             filename=os.path.join(self.root, 'job.log'),
                             is_master=True)
    self.client = client
    self.client_handler = None
    if self.client is not None:
        # stream log records back to the controlling client
        self.client_handler = add_log_client(self.logger, self.client)
    self.init_rpc_server()
    self.init_rate_clear()
    self.init_logger_server(self.logger)
    # register rpc server
    self.rpc_server.register_function(self.client_stop, 'client_stop')
    self.rpc_server.register_function(self.ready, 'ready')
    self.rpc_server.register_function(self.worker_finish, 'worker_finish')
    self.rpc_server.register_function(self.complete, 'complete')
    self.rpc_server.register_function(self.error, 'error')
    self.rpc_server.register_function(self.get_nodes, 'get_nodes')
    self.rpc_server.register_function(self.apply, 'apply')
    self.rpc_server.register_function(self.require, 'require')
    self.rpc_server.register_function(self.stop, 'stop')
    self.rpc_server.register_function(self.add_node, 'add_node')
    self.rpc_server.register_function(self.remove_node, 'remove_node')
    # register signal
    signal.signal(signal.SIGINT, self.signal_handler)
    signal.signal(signal.SIGTERM,
                  self.signal_handler)
def __init__(self, local_mode=False, is_master=False, master_addr=None,
             is_client=False, working_dir=None, mkdirs=False, ip=None, ips=None):
    # Execution context shared by master/worker/client processes.
    self.is_local_mode = local_mode
    self.is_master = is_master
    self.is_client = is_client
    self.master_addr = master_addr
    self.master_ip = self.master_addr
    if not self.is_local_mode:
        # Distributed mode requires a master address; default the port
        # when only an IP was given.
        if self.master_addr is None:
            raise ValueError(
                'Master address must be supplied when local_mode is False')
        if ':' not in self.master_addr:
            self.master_addr = '%s:%s' % (self.master_addr, main_conf.master.port)
        else:
            self.master_ip = self.master_addr.split(':', 1)[0]
    self.working_dir = working_dir
    if self.working_dir is None:
        tmp = tempfile.gettempdir()
        self.working_dir = os.path.join(tmp, 'cola')
    if mkdirs and not os.path.exists(self.working_dir):
        os.makedirs(self.working_dir)
    self.ip = ip
    if self.ip is None:
        if self.is_master:
            self.ip = self.master_ip
        else:
            self.ip = get_ip()
        if self.is_local_mode and not self.ip:
            self.ip = '127.0.0.1'  # offline fallback in local mode
    if self.master_addr is None:
        self.master_addr = '%s:%s' % (self.ip, main_conf.master.port)
    self.worker_addr = '%s:%s' % (self.ip, main_conf.worker.port)
    self.ips = ips if ips is not None else []
    if not self.ips:
        self.ips.append(self.ip)
    self.addrs = [self.fix_addr(_ip) for _ip in self.ips]
    # Cross-process shared environment via a manager dict.
    self.manager = ContextManager()
    self.manager.start(manager_init)
    self.env = self.manager.dict({
        'ip': self.ip,
        'root': self.working_dir,
        'is_local': self.is_local_mode,
        'master_ip': self.master_ip,
        'job_desc': {}
    })
    self.logger = get_logger('cola_context')
    self.master_rpc_server = None
    self.worker_rpc_server = None
def init_logger_server(self, logger):
    """Start a socket receiver that feeds remote log records to *logger*."""
    receiver = LogRecordSocketReceiver(host=get_ip(), logger=logger)
    self.log_server = receiver
    threading.Thread(target=receiver.serve_forever).start()