def test_001_stop_etcd_daemon(self):
    """
    Stop etcd daemons one at a time and check the remaining cluster members

    Sends SIGINT to the etcd1 and then the etcd2 daemon process stored on the
    test class, waits for each process to exit, pauses, and asserts the member
    mapping returned by the proxy created for http://localhost:2005.
    :return: None
    """
    test_cls = type(self)
    proxy = EtcdProxy("http://localhost:2005", get_logger("Test", None, None))

    # (daemon attribute on the test class, members expected once it is gone)
    shutdown_plan = (
        ("etcd1", {"etcd2": "http://localhost:2002",
                   "etcd3": "http://localhost:2004"}),
        ("etcd2", {"etcd3": "http://localhost:2004"}),
    )
    for attr, remaining in shutdown_plan:
        daemon = getattr(test_cls, attr)
        os.kill(daemon.pid, signal.SIGINT)
        daemon.join()
        # Fixed pause before querying the member list again
        time.sleep(10)
        self.assertEqual(proxy.add_and_get_members(None, None), remaining)
def test_000_run_etcd_daemon(self):
    """
    Start four etcd daemons and check the resulting cluster

    Each daemon runs ``run`` in its own daemonic process, stored on the test
    class. Afterwards the member mapping returned by the proxy created for
    http://localhost:2001 is asserted and a key is written and read back.
    :return: None
    """
    test_cls = type(self)

    # (class attribute, module name, config path, pause after start in seconds)
    launch_plan = (
        ("etcd1", "Etcd1", "etcd1/conf.json", 2),
        ("etcd4", "Etcd4", "etcd4/conf.json", 2),
        ("etcd2", "Etcd2", "etcd2/conf.json", 0),
        ("etcd3", "Etcd3", "etcd3/conf.json", 20),
    )
    for attr, name, conf_path, pause in launch_plan:
        daemon = multiprocessing.Process(target=run, args=(name, conf_path))
        setattr(test_cls, attr, daemon)
        daemon.daemon = True
        daemon.start()
        # A pause of 0 means the next daemon is started immediately
        if pause:
            time.sleep(pause)

    # Only etcd1, etcd2 and etcd3 are expected in the member mapping
    proxy = EtcdProxy("http://localhost:2001", get_logger("Test", None, None))
    expected_members = {
        "etcd1": "http://localhost:2000",
        "etcd2": "http://localhost:2002",
        "etcd3": "http://localhost:2004",
    }
    self.assertEqual(proxy.add_and_get_members(None, None), expected_members)

    # Round-trip a key through the proxy
    proxy.set("key", "value")
    self.assertEqual(proxy.get("key"), "value")
def test_000_run_mongodb_daemon(self):
    """
    Start three mongodb daemons and check the resulting replica set

    Each daemon runs ``run`` in its own daemonic process, stored on the test
    class. The hosts in the replica set config must equal the values stored
    under "mongodb/register/" on etcd, and a document inserted through one
    replica set client must be readable through the other two.
    :return: None
    """
    test_cls = type(self)
    time.sleep(10)

    # Start all daemons back to back
    launch_plan = (
        ("mongodb1", "Mongodb1", "mongodb1/conf.json"),
        ("mongodb2", "Mongodb2", "mongodb2/conf.json"),
        ("mongodb3", "Mongodb3", "mongodb3/conf.json"),
    )
    for attr, name, conf_path in launch_plan:
        daemon = multiprocessing.Process(
            target=run, args=(name, "etcd/conf.json", conf_path))
        setattr(test_cls, attr, daemon)
        daemon.daemon = True
        daemon.start()
    time.sleep(20)

    # Compare the replica set members with the registrations on etcd
    test_cls.etcd = EtcdProxy(
        "http://localhost:2001", get_logger("Test", None, None))
    registered_member = list(test_cls.etcd.get("mongodb/register/").values())
    test_cls.mongo1 = pymongo.MongoClient("localhost", 3000)
    test_cls.mongo1_rs = pymongo.MongoClient("localhost", 3000, replicaSet="rs")
    members = test_cls.mongo1.admin.command(
        "replSetGetConfig", 1)["config"]["members"]
    self.assertEqual(
        {member["host"] for member in members}, set(registered_member))

    test_cls.mongo2 = pymongo.MongoClient("localhost", 3001)
    test_cls.mongo2_rs = pymongo.MongoClient("localhost", 3001, replicaSet="rs")
    test_cls.mongo3 = pymongo.MongoClient("localhost", 3002)
    test_cls.mongo3_rs = pymongo.MongoClient("localhost", 3002, replicaSet="rs")

    # NOTE(review): ``.client`` on a MongoClient looks like attribute-style
    # database access, so this presumably targets a database named "client"
    # rather than "test_db" - confirm this is intended.
    writer = test_cls.mongo2_rs.client["test_db"]["test_collection"]
    test_cls.res = writer.insert_one({"value": 0})

    # The inserted document must be visible through the other two clients
    for reader in (test_cls.mongo1_rs, test_cls.mongo3_rs):
        stored = reader.client["test_db"]["test_collection"].find_one(
            {"_id": test_cls.res.inserted_id})
        self.assertEqual(stored["value"], 0)
def __init__(self, module_name="Gateway", etcd_conf_path="config/etcd.json",
             uwsgi_conf_path="config/uwsgi.json"):
    """
    Set up the server object from its configuration files

    Reads the "server" section of the uwsgi config, initializes the parent
    class with the configured template folder, configures the logger, the
    jinja delimiters and the maximum content length, builds the local etcd
    proxy and finally loads the response functions.
    :param module_name: Name of the caller module, used in the start-up log
    :param etcd_conf_path: Path to etcd config file
    :param uwsgi_conf_path: Path to uwsgi config file
    """
    # Load configuration
    with open(uwsgi_conf_path, "r") as conf_file:
        self.conf = json.load(conf_file)["server"]
    self.module_name = module_name
    super().__init__(__name__, template_folder=self.conf["template"])

    # Configure the logger; without a "log" section both paths are None
    if "log" in self.conf:
        log_paths = self.conf["log"]
    else:
        log_paths = {"info": None, "error": None}
    get_logger(self.logger, log_paths["info"], log_paths["error"])
    self.logger.info("%s server program started." % self.module_name)

    # Set flask configuration: "[[ ]]" replaces the default jinja delimiters
    self.jinja_env.variable_start_string = "[["
    self.jinja_env.variable_end_string = "]]"
    self.config["MAX_CONTENT_LENGTH"] = TASK_DICTIONARY_MAX_SIZE

    # Generate proxy for etcd
    with open(etcd_conf_path, "r") as conf_file:
        etcd_conf = json.load(conf_file)["etcd"]
        self.local_etcd = generate_local_etcd_proxy(etcd_conf, self.logger)

    self.load_response_function()
def setUpClass(cls):
    """
    Prepare the environment shared by all tests of the class

    Creates the data directories, starts a single etcd instance as a
    subprocess, and stores the mongodb configurations, etcd key paths,
    empty process slots, a logger and an etcd proxy on the class.
    :return: None
    """
    print("Initializing environment.\n")

    # Generate temp data dir
    for data_dir in ("etcd", "mongodb1", "mongodb2", "mongodb3"):
        os.mkdir(data_dir)

    # Generate etcd, initializing a new cluster by itself
    etcd_conf = {
        "exe": "etcd",
        "name": "etcd",
        "data_dir": "etcd",
        "listen": {
            "address": "0.0.0.0",
            "peer_port": "2000",
            "client_port": "2001",
        },
        "advertise": {
            "address": "localhost",
            "peer_port": "2000",
            "client_port": "2001",
        },
        "cluster": {"type": "init"},
    }
    cls.etcd_proc = subprocess.Popen(
        etcd_generate_run_command(etcd_conf),
        stdout=sys.stdout,
        stderr=subprocess.STDOUT,
    )

    # Generate mongodb configuration; the members differ only in name and port
    for name, port in (("mongodb1", "3000"),
                       ("mongodb2", "3001"),
                       ("mongodb3", "3002")):
        member_conf = {
            "exe": "mongod",
            "name": name,
            "data_dir": name,
            "listen": {"address": "0.0.0.0", "port": port},
            "advertise": {"address": "localhost", "port": port},
            "replica_set": "rs",
        }
        setattr(cls, name + "_conf", member_conf)

    cls.init_key = "mongodb/init"
    cls.reg_path = "mongodb/register/"

    # No mongodb process has been started yet
    for name in ("mongodb1", "mongodb2", "mongodb3"):
        setattr(cls, name + "_proc", None)

    # Generate a logger and etcd proxy
    cls.logger = get_logger("Test", None, None)
    cls.etcd = EtcdProxy("http://localhost:2001", cls.logger)
def run(service_list, module_name, module_description,
        conf_path="config/templates/boot.json"):
    """
    Parse args and start services with routine checks

    Every entry of service_list is a dict. This function reads its
    "args_parser", optional "args" / "kwargs", "config_template", "config"
    and "name" keys and stores the callable returned by "args_parser" under
    "generator". Each generator is later called with
    (args, config_sub, client, services, start_order); it is presumably
    responsible for filling services (entries with "pid_file", "process" and
    "command") and start_order - verify against the generators.
    :param service_list: List of all services in the starting order
    :param module_name: The name of the running module
    :param module_description: The description of the module
    :param conf_path: Path to boot config
    :return: None
    """
    # Get a parser and add common args
    parser = argparse.ArgumentParser(description=module_description)
    parser.add_argument("--docker-sock", dest="docker_sock", default=None,
                        help="Path to mapped docker sock file")
    parser.add_argument("--retry-times", type=int, dest="retry_times",
                        default=None,
                        help="Total retry time of key operations")
    parser.add_argument("--retry-interval", type=int, dest="retry_interval",
                        default=None,
                        help="Interval between retries of key operations")
    parser.add_argument("--boot-check-interval", type=int,
                        dest="boot_check_interval", default=None,
                        help="Interval between services check in boot module")
    parser.add_argument("--boot-print-log", dest="boot_print_log",
                        action="store_true",
                        help="Print the log of boot module to stdout")

    # Add args for services
    for s in service_list:
        s["generator"] = s["args_parser"](
            parser, *s.get("args", ()), **s.get("kwargs", {}))

    # Parse args
    args = parser.parse_args()

    # Load configuration; command line values override the file
    with open(conf_path, "r") as f:
        config = json_comment.load(f)
    if args.boot_check_interval is not None:
        config["check_interval"] = args.boot_check_interval
    if args.boot_print_log:
        # Without a "log" section get_logger is called with no file paths
        config.pop("log", None)

    # Generate logger
    if "log" in config:
        logger = get_logger("boot", config["log"]["info"],
                            config["log"]["error"])
    else:
        logger = get_logger("boot", None, None)
    logger.info("%s boot program started." % module_name)

    # If docker-sock is given, generate a docker client for it
    client = None
    if args.docker_sock:
        client = docker.APIClient(base_url=args.docker_sock)

    # Dictionary for services
    services = {}
    # Start order of services
    start_order = []

    # Load and modify config for all services by calling config generator
    for s in service_list:
        # Load config template
        with open(s["config_template"], "r") as f:
            config_sub = json_comment.load(f)
        # Modify services config
        s["generator"](args, config_sub, client, services, start_order)
        # Write config
        with open(s["config"], "w") as f:
            f.write(json.dumps(config_sub, indent=4))
        logger.info("Service %s configuration loaded." % s["name"])

    # Check whether service daemons are running regularly
    try:
        while True:
            for s in start_order:
                service = services[s]
                logger.info("Checking status of service %s." % s)
                try:
                    # Lock the pid file; timeout=0 fails immediately when
                    # somebody else holds the lock
                    with filelock.FileLock(service["pid_file"] + ".lock",
                                           timeout=0):
                        logger.debug(
                            "Locked pid file %s." % service["pid_file"])
                        # If the process is None or the process has exited,
                        # (re)start it and rewrite the pid file
                        if (not service["process"]
                                or service["process"].poll() is not None):
                            logger.warning("Service %s is down." % s)
                            with open(service["pid_file"], "w") as f:
                                logger.info(
                                    "Starting the service %s and writing "
                                    "pid file." % s)
                                service["process"] = subprocess.Popen(
                                    service["command"])
                                f.write(str(service["process"].pid))
                        else:
                            logger.info("Service %s is running." % s)
                    logger.debug(
                        "Unlocked pid file %s." % service["pid_file"])
                except filelock.Timeout:
                    logger.warning(
                        "Failed to obtain lock for service %s. "
                        "Skipping check." % s, exc_info=True)
                except Exception:
                    # One failing service must not stop the checks of the
                    # others. KeyboardInterrupt and SystemExit are not
                    # Exception subclasses and propagate to the outer handler.
                    logger.error(
                        "Failed to check status for service %s." % s,
                        exc_info=True)
            time.sleep(config["check_interval"])
    except KeyboardInterrupt:
        logger.info("Received SIGINT. Stopping service check.", exc_info=True)

    # Clean all services in reverse start order
    for s in reversed(start_order):
        process = services[s]["process"]
        if process:
            logger.info("Killing service %s." % s)
            # send_signal does nothing when the process has already
            # completed, so a recycled pid is never signalled
            process.send_signal(signal.SIGINT)
            process.wait()
            logger.info("Killed Service %s." % s)

    logger.info("%s boot program exiting." % module_name)
def run(module_name="Judicator", etcd_conf_path="config/etcd.json",
        mongodb_conf_path="config/mongodb.json"):
    """
    Load config and run mongodb

    Starts mongodb as a subprocess, starts the register thread, and forwards
    the raw mongodb output to the mongodb logger until EOF or SIGINT. On
    SIGINT the register thread is stopped, mongodb is shut down and the
    registration on etcd is cancelled.
    :param module_name: Name of the caller module
    :param etcd_conf_path: Path to etcd config file
    :param mongodb_conf_path: Path to mongodb config file
    :return: None
    """
    global working

    # Load configuration
    with open(mongodb_conf_path, "r") as f:
        config = json.load(f)
    daemon_conf = config["daemon"]
    mongodb_conf = config["mongodb"]
    retry_times = daemon_conf["retry"]["times"]
    retry_interval = daemon_conf["retry"]["interval"]

    # Generate logger for daemon
    if "log_daemon" in daemon_conf:
        daemon_logger = get_logger("mongodb_daemon",
                                   daemon_conf["log_daemon"]["info"],
                                   daemon_conf["log_daemon"]["error"])
    else:
        daemon_logger = get_logger("mongodb_daemon", None, None)
    daemon_logger.info("%s mongodb_daemon program started." % module_name)

    # Generate logger for mongodb forwarding raw log output to designated place
    if "log_mongodb" in daemon_conf:
        mongodb_logger = get_logger("mongodb",
                                    daemon_conf["log_mongodb"]["info"],
                                    daemon_conf["log_mongodb"]["error"],
                                    True)
    else:
        mongodb_logger = get_logger("mongodb", None, None, True)

    # Get a etcd proxy for replica set operations
    with open(etcd_conf_path, "r") as f:
        local_etcd = generate_local_etcd_proxy(
            json.load(f)["etcd"], daemon_logger)

    # Get a local mongodb proxy
    local_mongodb = generate_local_mongodb_proxy(
        mongodb_conf, local_etcd, daemon_logger)

    # Check whether the data dir of mongodb is empty
    # If not, delete it and create a new one
    if not check_empty_dir(mongodb_conf["data_dir"]):
        shutil.rmtree(mongodb_conf["data_dir"])
        os.mkdir(mongodb_conf["data_dir"])
        daemon_logger.info(
            "Previous data directory deleted with a new one created.")

    # Check whether the init data dir of mongodb is empty
    # If not, copy it to the data dir
    # A config without "data_init_dir" simply skips this step
    if ("data_init_dir" in mongodb_conf
            and not check_empty_dir(mongodb_conf["data_init_dir"])):
        # copytree needs the destination to be absent
        shutil.rmtree(mongodb_conf["data_dir"])
        shutil.copytree(mongodb_conf["data_init_dir"],
                        mongodb_conf["data_dir"])
        daemon_logger.info("Found existing data initialize directory.")

    # Generate command and run mongodb instance as a subprocess
    command = mongodb_generate_run_command(mongodb_conf)
    # Each element of the command is logged on its own line
    for c in command:
        daemon_logger.info("Starting mongodb with command: " + c)
    mongodb_proc = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    # Create and start register thread
    register_thread = threading.Thread(
        target=register,
        args=(config, local_etcd, local_mongodb, mongodb_proc, daemon_logger),
        daemon=True,
    )
    register_thread.start()

    # Log the output of mongodb instance until EOF or terminated
    try:
        log_output(mongodb_logger, mongodb_proc.stdout,
                   daemon_conf["raw_log_symbol_pos"])
        daemon_logger.info("Received EOF from mongodb.")
    except KeyboardInterrupt:
        daemon_logger.info("Received SIGINT. Cleaning up and exiting.",
                           exc_info=True)
        # Stop the register thread
        working = False
        register_thread.join()
        # Ask mongodb to shut down; terminate the process if that fails
        if not local_mongodb.shutdown_and_close():
            daemon_logger.error("Killing the process.")
            mongodb_proc.terminate()
        mongodb_proc.wait()
        daemon_logger.info("Mongodb process exited.")
        # Register this mongodb as exited one
        try_with_times(retry_times, retry_interval, False, daemon_logger,
                       "cancel registration of mongodb",
                       local_mongodb.cancel_registration,
                       daemon_conf["etcd_path"]["register"])
        daemon_logger.info("Removed local mongodb registration on etcd.")
    except BaseException:
        # Deliberately catch-all: whatever interrupts the log forwarding,
        # the mongodb subprocess must not be left running
        daemon_logger.error(
            "Accidentally terminated. Killing mongodb process.",
            exc_info=True)
        mongodb_proc.terminate()

    # Wait until mongodb process exit
    mongodb_proc.wait()
    working = False
    daemon_logger.info("%s mongodb_daemon program exiting." % module_name)
def run(module_name, etcd_conf_path="config/etcd.json"):
    """
    Load config and run etcd

    Depending on the "cluster" section of the config the local etcd either
    initializes a cluster, joins a given client, or looks up running tasks
    of a docker service and joins the earliest created one. Etcd is then
    started as a subprocess and its raw output is forwarded to the etcd
    logger until EOF or SIGINT.
    :param module_name: Name of the caller module
    :param etcd_conf_path: Path to etcd config file
    :return: None
    :raise SystemExit: With code 1 when member detection or joining fails
    """
    # Load config file
    with open(etcd_conf_path, "r") as f:
        config = json.load(f)
    daemon_conf = config["daemon"]
    etcd_conf = config["etcd"]
    retry_times = daemon_conf["retry"]["times"]
    retry_interval = daemon_conf["retry"]["interval"]

    # Generate daemon logger from config file
    # If logger arguments exist in config file, write the log to the
    # designated file. Else, forward the log to standard output
    if "log_daemon" in daemon_conf:
        daemon_logger = get_logger(
            "etcd_daemon",
            daemon_conf["log_daemon"]["info"],
            daemon_conf["log_daemon"]["error"]
        )
    else:
        daemon_logger = get_logger("etcd_daemon", None, None)
    daemon_logger.info("%s etcd_daemon program started." % module_name)

    # Generate etcd logger forwarding etcd log to designated location
    if "log_etcd" in daemon_conf:
        etcd_logger = get_logger(
            "etcd",
            daemon_conf["log_etcd"]["info"],
            daemon_conf["log_etcd"]["error"],
            True
        )
    else:
        etcd_logger = get_logger("etcd", None, None, True)

    # Generate a etcd proxy for local etcd
    local_etcd = generate_local_etcd_proxy(etcd_conf, daemon_logger)

    # Check whether the data dir of etcd is empty
    # If not, delete it and create a new one
    if not check_empty_dir(etcd_conf["data_dir"]):
        shutil.rmtree(etcd_conf["data_dir"])
        os.mkdir(etcd_conf["data_dir"])
        daemon_logger.info(
            "Previous data directory deleted with a new one created.")

    # Check whether the init data dir of etcd is configured and not empty
    # If so, copy it to data dir, and the cluster initialization information
    # should be skipped
    if ("data_init_dir" in etcd_conf
            and not check_empty_dir(etcd_conf["data_init_dir"])):
        # pop instead of del: a config without cluster section is fine here
        etcd_conf.pop("cluster", None)
        # copytree needs the destination to be absent
        shutil.rmtree(etcd_conf["data_dir"])
        shutil.copytree(etcd_conf["data_init_dir"], etcd_conf["data_dir"])
        daemon_logger.info(
            "Found existing data initialize directory. "
            "Skipped cluster parameters.")

    # If cluster config exists, this node is going to either explicitly join
    # a cluster or initialize one
    if "cluster" in etcd_conf:
        # If service name of etcd member detection is specified, use it and
        # find a member client
        if "service" in etcd_conf["cluster"]:
            service_name = etcd_conf["cluster"]["service"]
            daemon_logger.info(
                "Searching etcd member using docker service and dns.")

            # Get all running tasks of the specified service, oldest first
            services = []
            try:
                client = docker.APIClient(base_url=daemon_conf["docker_sock"])
                services = [
                    "http://" + service_name + "." + str(task["Slot"])
                    + "." + task["ID"] + ":"
                    + etcd_conf["cluster"]["client_port"]
                    for task in sorted(
                        client.tasks({"service": service_name}),
                        key=lambda task: task["CreatedAt"]
                    )
                    if task["DesiredState"] == "running"
                ]
            except Exception:
                daemon_logger.error(
                    "Failed to connect to docker engine.", exc_info=True)
                daemon_logger.error(
                    "%s etcd_daemon program exiting." % module_name)
                raise SystemExit(1)

            daemon_logger.debug(
                "Found following members with service name %s:" % service_name)
            for x in services:
                daemon_logger.debug("%s" % x)

            # When local etcd is not in proxy mode, try to delete current task
            # from service list, if it is found
            # Else, exit as it should never happen
            if "proxy" not in etcd_conf:
                try:
                    services.remove(
                        "http://" + etcd_conf["advertise"]["address"]
                        + ":" + etcd_conf["advertise"]["client_port"]
                    )
                except Exception:
                    daemon_logger.error(
                        "Failed to find current docker task in list.",
                        exc_info=True)
                    daemon_logger.error(
                        "%s etcd_daemon program exiting." % module_name)
                    raise SystemExit(1)
            else:
                daemon_logger.info(
                    "Detected proxy mode. Skipped removing current docker "
                    "task from service list.")

            # If one or more docker service tasks are running, join the first
            # created one
            # Else, initialize the cluster by itself if it is not in proxy
            # mode, or exit
            if services:
                daemon_logger.info("Found following available members:")
                for x in services:
                    daemon_logger.info("%s" % x)
                etcd_conf["cluster"] = {"type": "join", "client": services[0]}
                daemon_logger.info(
                    "Selected %s as member client."
                    % etcd_conf["cluster"]["client"])
            else:
                daemon_logger.warning("No available member detected.")
                if "proxy" not in etcd_conf:
                    etcd_conf["cluster"] = {"type": "init"}
                    daemon_logger.info("Initializing cluster by local etcd.")
                else:
                    daemon_logger.info(
                        "%s etcd_daemon program exiting." % module_name)
                    raise SystemExit(1)

        # If the node is going join a cluster without knowing existing members
        # Add itself to the cluster if not in proxy mode
        # And then (in or not in proxy mode), get all member information for
        # etcd start up command
        if (etcd_conf["cluster"]["type"] == "join"
                and "member" not in etcd_conf["cluster"]):
            # Generate a etcd proxy for the remote etcd to be joined
            remote_etcd = EtcdProxy(
                etcd_conf["cluster"]["client"], daemon_logger)
            # Join the cluster by adding self information
            success, res = try_with_times(
                retry_times,
                retry_interval,
                False,
                daemon_logger,
                "get (and add) member to etcd cluster status",
                remote_etcd.add_and_get_members,
                etcd_conf["name"],
                "http://" + etcd_conf["advertise"]["address"]
                + ":" + etcd_conf["advertise"]["peer_port"],
                "proxy" in etcd_conf
            )
            if not success:
                daemon_logger.error(
                    "Failed to get (and add) member information to remote "
                    "client. Exiting.")
                raise SystemExit(1)
            # Generate member argument for the joining command
            etcd_conf["cluster"]["member"] = ",".join(
                k + "=" + v for k, v in res.items())
            daemon_logger.info("Existing members of cluster received.")
            daemon_logger.info(
                "Etcd will be started with member arguments: %s."
                % etcd_conf["cluster"]["member"])

    # Generate running command
    command = etcd_generate_run_command(etcd_conf)
    # Each element of the command is logged on its own line
    for c in command:
        daemon_logger.info("Starting etcd with command: " + c)

    # Run etcd in a subprocess.
    etcd_proc = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
    )

    # Create a check thread to check if etcd has been started up
    check_thread = threading.Thread(
        target=check,
        args=(retry_times, retry_interval, etcd_proc, local_etcd,
              daemon_logger),
        daemon=True,
    )
    check_thread.start()

    # Log the raw output of etcd until it exit or terminated
    try:
        log_output(etcd_logger, etcd_proc.stdout,
                   daemon_conf["raw_log_symbol_pos"])
        daemon_logger.info("Received EOF from etcd.")
    except KeyboardInterrupt:
        daemon_logger.info("Received SIGINT. Cleaning up and exiting.",
                           exc_info=True)
        if "proxy" not in etcd_conf:
            try_with_times(
                retry_times,
                retry_interval,
                False,
                daemon_logger,
                "remove etcd from cluster",
                local_etcd.remove_member,
                etcd_conf["name"],
                "http://" + etcd_conf["advertise"]["address"]
                + ":" + etcd_conf["advertise"]["peer_port"]
            )
        else:
            daemon_logger.info(
                "Detected proxy mode. Skipped removing local etcd from "
                "cluster.")
        # send_signal does nothing when etcd has already exited
        etcd_proc.send_signal(signal.SIGINT)
    except BaseException:
        # Deliberately catch-all: whatever interrupts the log forwarding,
        # the etcd subprocess must not be left running
        daemon_logger.error("Accidentally terminated. Killing etcd process.",
                            exc_info=True)
        etcd_proc.terminate()

    # Wait for the subprocess
    etcd_proc.wait()
    daemon_logger.info("%s etcd_daemon program exiting." % module_name)
def run(module_name="Judicator", etcd_conf_path="config/etcd.json",
        mongodb_conf_path="config/mongodb.json",
        main_conf_path="config/main.json"):
    """
    Load config and run judicator main program

    Starts the register and lead threads and then serves the rpc interface
    until the server stops or the process is interrupted.
    :param module_name: Name of the caller module
    :param etcd_conf_path: Path to etcd config file
    :param mongodb_conf_path: Path to mongodb config file
    :param main_conf_path: Path to main config file
    :return: None
    """
    global working

    # Load configuration
    with open(main_conf_path, "r") as f:
        config = json.load(f)

    # Generate logger
    if "log" in config:
        logger = get_logger("main", config["log"]["info"],
                            config["log"]["error"])
    else:
        logger = get_logger("main", None, None)
    logger.info("%s main program started." % module_name)

    # Generate proxy for etcd and mongodb
    with open(etcd_conf_path, "r") as f:
        local_etcd = generate_local_etcd_proxy(json.load(f)["etcd"], logger)
    with open(mongodb_conf_path, "r") as f:
        local_mongodb = generate_local_mongodb_proxy(
            json.load(f)["mongodb"], local_etcd, logger)

    # Get a connection to both task and executor collection in mongodb
    mongodb_task = local_mongodb.client[
        config["task"]["database"]][config["task"]["collection"]]
    mongodb_executor = local_mongodb.client[
        config["executor"]["database"]][config["executor"]["collection"]]

    # Create and start the register thread
    register_thread = threading.Thread(
        target=register,
        args=(config, local_etcd, local_mongodb, logger),
        daemon=True,
    )
    register_thread.start()

    # Create and start the lead thread
    lead_thread = threading.Thread(
        target=lead,
        args=(config, local_etcd, local_mongodb, mongodb_task,
              mongodb_executor, logger),
        daemon=True,
    )
    lead_thread.start()

    # Start the rpc server and serve until terminated
    server = TServer.TThreadedServer(
        Judicator.Processor(
            RPCService(logger, mongodb_task, mongodb_executor)),
        TSocket.TServerSocket(
            config["listen"]["address"], int(config["listen"]["port"])),
        TTransport.TBufferedTransportFactory(),
        TBinaryProtocol.TBinaryProtocolFactory())
    try:
        logger.info("Starting rpc server.")
        server.serve()
    except KeyboardInterrupt:
        logger.info(
            "Received SIGINT. Cancelling judicator registration on etcd.",
            exc_info=True)
        # Wait for the register thread to delete registration and then stop
        working = False
        register_thread.join()
    except BaseException:
        # Deliberately catch-all at the top level: log whatever stopped the
        # server and fall through to the normal exit path
        logger.error("Accidentally terminated.", exc_info=True)

    working = False
    logger.info("%s main program exiting." % module_name)
def run(module_name="Executor", etcd_conf_path="config/etcd.json",
        main_conf_path="config/main.json"):
    """
    Load config and run executor main program

    Every config["report_interval"] seconds the current task states are
    collected and reported to the judicator; the response carries a list of
    tasks to cancel and a list of tasks to assign, which are then applied
    to the module level tasks dictionary under the module level lock.
    On SIGINT all task subprocesses are killed.
    :param module_name: Name of the caller module
    :param etcd_conf_path: Path to etcd config file
    :param main_conf_path: Path to main config file
    :return: None
    """
    global tasks, lock

    # Load configuration
    with open(main_conf_path, "r") as f:
        config = json.load(f)
    retry_times = config["retry"]["times"]
    retry_interval = config["retry"]["interval"]

    # Generate logger
    if "log" in config:
        logger = get_logger("main", config["log"]["info"],
                            config["log"]["error"])
    else:
        logger = get_logger("main", None, None)
    logger.info("%s main program started." % module_name)

    # Generate proxy for local etcd
    with open(etcd_conf_path, "r") as f:
        local_etcd = generate_local_etcd_proxy(json.load(f)["etcd"], logger)

    # Check whether the data dir of main is empty
    # If not, delete it and create a new one
    if not check_empty_dir(config["data_dir"]):
        shutil.rmtree(config["data_dir"])
        os.mkdir(config["data_dir"])
        logger.info("Previous data directory deleted with a new one created.")
    os.chmod(config["data_dir"], 0o700)
    logger.info("Data directory privilege changed to 0700.")

    # If task user id and group id is not specified, use the current user
    # and group
    if "user" not in config["task"]:
        config["task"]["user"] = {"uid": os.getuid(), "gid": os.getgid()}
    logger.info("Task execution uid: %d, gid: %d."
                % (config["task"]["user"]["uid"],
                   config["task"]["user"]["gid"]))

    # Report tasks execution status regularly
    logger.info("Starting executor routines.")
    try:
        while True:
            time.sleep(config["report_interval"])

            # Collect things to report
            logger.info("Collecting report content.")
            complete, executing = [], []
            vacant = config["task"]["vacant"]
            # Acquire lock first before reading the global variable
            with lock:
                try:
                    for t in tasks:
                        # Cancelled tasks are not reported any more
                        if tasks[t]["cancel"]:
                            continue
                        thread = tasks[t]["thread"]
                        if thread and not thread.is_alive():
                            complete.append(
                                generate(tasks[t], False, False, False))
                            logger.info(
                                "Task %s added to complete list." % t)
                        else:
                            # Every task still executing occupies one slot
                            executing.append(generate(tasks[t], True))
                            vacant -= 1
                            logger.info(
                                "Task %s added to executing list." % t)
                except Exception:
                    # Best effort: report whatever has been collected so far.
                    # KeyboardInterrupt is not an Exception and propagates.
                    logger.error("Failed to collect report content.",
                                 exc_info=True)

            # Try to report to judicator and get response
            logger.info("Executor current vacancy: %d." % vacant)
            logger.info("Reporting to judicator.")
            success, res = try_with_times(
                retry_times, retry_interval, False, logger,
                "report to judicator", report, complete, executing, vacant,
                local_etcd, config, logger)
            if not success:
                logger.error(
                    "Failed to report to judicator. Skipping tasks update.")
                continue
            cancel = [extract(x, brief=True) for x in res[0]]
            assign = [extract(x) for x in res[1]]
            logger.info("Reported to judicator with response:")
            logger.info("Cancel list: %s." % str([t["id"] for t in cancel]))
            logger.info("Assign list: %s." % str([t["id"] for t in assign]))

            # Update tasks information
            logger.info("Updating tasks information.")
            # Acquire lock first before modifying global variable
            with lock:
                try:
                    # Cancel tasks
                    logger.info("Checking tasks to be cancelled.")
                    for t in cancel:
                        task_id = t["id"]
                        logger.info("Cancelling task %s." % task_id)
                        if task_id not in tasks:
                            continue
                        tasks[task_id]["cancel"] = True
                        # If the subprocess is still running, kill it
                        process = tasks[task_id]["process"]
                        if process and process.poll() is None:
                            logger.info(
                                "Killing subprocess of task %s." % task_id)
                            process.kill()
                        else:
                            logger.info(
                                "No subprocess to kill for task %s."
                                % task_id)

                    # Clean all tasks
                    # A list of task ids must be built beforehand, as the
                    # tasks dictionary is going to change
                    tasks_list = tuple(tasks)
                    logger.info("Checking tasks to be deleted.")
                    for t in tasks_list:
                        thread = tasks[t]["thread"]
                        # A task can only be considered as all done (thus can
                        # be deleted) when thread.is_alive() is False
                        # (indicating the daemon thread has finished) and
                        # cancel (indicating the judicator has received the
                        # result) is True
                        if (thread and not thread.is_alive()
                                and tasks[t]["cancel"]):
                            del tasks[t]
                            logger.info("Deleted task %s." % t)

                    # Handle newly assigned
                    logger.info("Checking tasks to be assigned.")
                    for t in assign:
                        task_id = t["id"]
                        logger.info("Assigned task %s." % task_id)
                        tasks[task_id] = t
                        tasks[task_id]["process"] = None
                        tasks[task_id]["cancel"] = False
                        # Generate a thread and start it
                        tasks[task_id]["thread"] = threading.Thread(
                            target=execute,
                            args=(task_id, config, logger),
                            daemon=True,
                        )
                        tasks[task_id]["thread"].start()
                except Exception:
                    # Keep the routine alive; KeyboardInterrupt is not an
                    # Exception and propagates to the outer handler
                    logger.error("Failed to update tasks.", exc_info=True)

            logger.info("Finished executor routine work.")
    except KeyboardInterrupt:
        logger.info("Received SIGINT. Cleaning up all subprocess.",
                    exc_info=True)

    # Clean up and kill all subprocess
    # Acquire the lock until exit to ensure that no more subprocess are
    # generated; it is intentionally never released
    lock.acquire()
    for t in tasks:
        process = tasks[t]["process"]
        if process:
            process.kill()
            process.wait()
            logger.info("Killed subprocess of task %s." % t)
        else:
            logger.info("Task %s has no subprocess running." % t)

    logger.info("%s main program exiting." % module_name)