def store_results(
    self, measure: "Measure", player_name: Optional[str], results: Dict
) -> None:
    key = measure.store_as

    # initialize the store_as object if we need to
    if key not in self.stored_data:
        self.stored_data[key] = {}

    if player_name:  # i.e. not store_singleton
        if (
            player_name in self.stored_data[key]
            and results != self.stored_data[key][player_name]
        ):
            msg = (
                f"Overwriting stored data key {key}[{player_name}]. Previous"
                f" value:\n{self.stored_data[key][player_name]}"
            )
            logger.info(gudlog(msg, self))

        self.stored_data[key][player_name] = results
        msg = f"Stored {key}[{player_name}]:\n{results}"
        logger.debug(gudlog(msg, self))
    else:
        if self.stored_data[key] and results != self.stored_data[key]:
            msg = (
                f"Overwriting stored data key {key}. Previous"
                f" value:\n{self.stored_data[key]}"
            )
            logger.info(gudlog(msg, self))

        self.stored_data[key] = results
        msg = f"Stored {key}:\n{results}"
        logger.debug(gudlog(msg, self))
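# Illustrative sketch (not part of the original source): the shape of
# self.stored_data after store_results() runs. The measure/player/host names
# below are hypothetical example values.
#
#   self.stored_data = {
#       "recon_output": {            # measure.store_as
#           "attacker": {            # player_name (top level for store_singleton)
#               "host-1": "...",     # per-host results; see Measure.store_results()
#           },
#       },
#   }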
async def play(self) -> None:
    # add local measures
    for md in conductor_local_measure_dicts:
        if md["name"] not in [
            x.name for x in self.measures
        ]:  # make sure it's not already there
            local_measure: "Measure" = MeasureSchema().load(md)
            local_measure.local_measure = True
            self.measures.insert(0, local_measure)

    # add local measure dependencies
    local_measure_names = [x["name"] for x in conductor_local_measure_dicts]
    for m in self.measures:
        if m.name in local_measure_names:
            continue
        if "tune_orchestra" not in m.depends_on and m.name != "tune_orchestra":
            msg = "adding 'tune_orchestra' dependency"
            logger.debug(gudlog(msg, self, None, m.name))
            m.depends_on.insert(0, "tune_orchestra")

    # add conductor allhosts player
    if config.CONDUCTOR_ALLHOSTS_PLAYER_NAME not in self.players.keys():
        local_player: "Player" = PlayerSchema().load(
            {"name": config.CONDUCTOR_ALLHOSTS_PLAYER_NAME}
        )
        self.players[config.CONDUCTOR_ALLHOSTS_PLAYER_NAME] = local_player

    # conductor player will operate over all hosts, including those not created yet
    all_hosts, err_msgs = self.get_all_hosts()
    for err_msg in err_msgs:
        logger.warning(gudlog(err_msg, self))
    self.players[config.CONDUCTOR_ALLHOSTS_PLAYER_NAME].hostnames = [
        h.name for h in all_hosts
    ]

    # make johann tarball in prep for tuning
    create_johann_tarball()

    self.state = TaskState.STARTED
    self.started_at = datetime.utcnow()
    # self.started_at = datetime.now(tz=pytz.utc)

    unqueued_measures = copy.copy(self.measures)
    while True:
        self.evaluate_state()
        if self.finished:
            break
        self.smart_queue(unqueued_measures)
        await asyncio.sleep(1)
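# Minimal usage sketch (an assumption, not original code): play() is a
# coroutine, so a caller would drive it from an event loop, for example:
#
#   import asyncio
#   asyncio.run(score.play())   # `score` is a hypothetical loaded Score instance
#
# The loop above then re-evaluates state and queues ready measures once per
# second until the score finishes.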
def copy_from(
    self, player: "Player", score: "Score"
) -> Tuple[Optional[bool], Optional[str]]:
    """Update this player's fields from another Player object.

    Args:
        player: the Player to copy hostnames, scale, and image from.
        score: the Score being played, used for log context.

    Returns:
        (None, None) if nothing changed.
        (True, None) if one or more fields were updated.
        (False, error message) if `player` is not a Player object.
    """
    if not isinstance(player, Player):
        return False, "not a Player object"

    if player.name != self.name:
        logger.warning(
            "copy_from() called with a player with a different name...was this"
            " intentional?"
        )
        self.name = player.name

    changed = False

    if player.hostnames != self.hostnames:
        changed = True
        msg = "updating hostnames from {} to {}".format(
            self.hostnames, player.hostnames
        )
        logger.debug(gudlog(msg, score, self))
        self.hostnames = player.hostnames

    if player.scale != self.scale:
        changed = True
        msg = "updating scale from {} to {}".format(self.scale, player.scale)
        logger.debug(gudlog(msg, score, self))
        self.scale = player.scale

    if player.image != self.image:
        changed = True
        msg = "updating image from {} to {}".format(self.image, player.image)
        logger.debug(gudlog(msg, score, self))
        self.image = player.image

    if not changed:
        return None, None
    else:
        return True, None
def evaluate_state(self, score: "Score") -> None:
    task_status = self.get_task_status(short=False)

    prior_state = self.state
    success = 0
    finished = 0
    for player_name, tstat in task_status.items():
        if tstat["finished"]:
            finished += 1
            if tstat["state"] == TaskState.SUCCESS:
                success += 1
        if task_state_priority(tstat["state"]) > task_state_priority(self.state):
            self.state = tstat["state"]
        if tstat["status"]:
            self.status[player_name] = tstat["status"]

    if self.state == TaskState.FAILURE and finished == len(task_status):
        self.finished = True

    if success == len(self.player_names):
        self.state = TaskState.SUCCESS
    if finished == len(self.player_names):
        self.finished = True

    if self.state == TaskState.FAILURE and self.state != prior_state:
        msg = f"measure '{self.name}' failed:\n{json.dumps(self.status, indent=2)}"
        logger.warning(gudlog(msg, score))

    self.store_results(score, task_status)
def enqueue(
    self, score: "Score", measure: "Measure", func: str, delay: int, *args: Any
) -> Tuple[bool, Optional[str], Optional["GroupResult"]]:
    signatures = []
    for hostname in self.hostnames:
        if hostname not in hosts:
            msg = "{} not found in dictionary of hosts".format(hostname)
            logger.warning(gudlog(msg, score, self, measure.name))
            return False, msg, None
        host = hosts[hostname]

        if measure.local_measure:
            sig = Player.get_local_task_signature(
                score.name, measure.name, host, func, delay, *args
            )
        else:
            if host.pending_create:
                msg = "{} still pending creation".format(host.name)
                logger.warning(gudlog(msg, score, self, measure.name))
                return False, msg, None

            sig = host.get_task_signature(
                score.name, self.name, measure.name, func, delay, *args
            )

        if sig is None:
            msg = "task signature creation failed for hostname {}".format(host.name)
            logger.warning(gudlog(msg, score, self, measure.name))
            return False, msg, None

        if not measure.local_measure:
            host.clear_finished_celery_task_ids()
            host.celery_task_ids.append(sig.id)

        score.task_map[sig.id] = {
            "measure_name": measure.name,
            "player_name": self.name,
            "host_name": host.name,
        }
        signatures.append(sig)

    task_group = group(signatures)
    group_result = task_group.apply_async()
    return True, None, group_result
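# Illustrative sketch (an assumption, not original code): what enqueue() hands
# back. `group` and `apply_async` are standard Celery primitives; each
# signature targets one host's queue, and the GroupResult tracks all of them:
#
#   ok, err, group_result = player.enqueue(score, measure, "my_pkg.tasks.do_work", 0)
#   if ok:
#       group_result.ready()   # True once every host's task has finished
#
# "my_pkg.tasks.do_work" is a hypothetical Celery task name, not one defined here.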
async def play_the_player(self, measure: "Measure", player: "Player") -> bool:
    delay = measure.start_delay

    # handle special arguments like random numbers and stored values
    try:
        if isinstance(delay, str):
            delay = transform_arg(self, measure, player, delay)
            assert isinstance(delay, int)
        new_args = transform_args(self, measure, player, measure.args)
    except (AssertionError, KeyError, ValueError):
        msg = gudexc(
            "Failed to queue measure -- bad special argument(s)",
            self,
            player,
            measure,
        )
        logger.error(msg)
        measure.state = TaskState.FAILURE
        # we can't set measure.finished yet; other players for this measure
        # may be running
        measure.status["all"]["all"] = f"{msg}; see logs for details"
        return False

    msg = f"queueing measure {measure.name} with a delay of {delay} seconds"
    logger.info(gudlog(msg, self, player))
    msg = f"(transformed) args:\n{pprint.pformat(new_args, indent=4)}"
    logger.debug(gudlog(msg, self, player, measure))

    success, err_msg, group_task = player.enqueue(
        self, measure, measure.task_name, delay, *new_args
    )
    if success:
        measure.celery_group_tasks[player.name] = group_task
        return True
    else:
        msg = f"failed to play measure {measure.name}: {err_msg}"
        logger.warning(gudlog(msg, self, player))
        measure.state = TaskState.FAILURE
        measure.status[player.name]["all"] = msg
        return False
def get_task_signature(
    self,
    score_name: str,
    player_name: str,
    measure_name: str,
    func: str,
    delay: int,
    *task_args,
    **task_kwargs,
) -> "Signature":
    if self.tuning:
        msg = (
            "host is tuning or pending tuning; strongly advise against running new"
            " tasks on it or they may be interrupted without warning or recovery"
        )
        logger.warning(gudlog(msg, score_name, player_name, None, self.name))

    description = f"{score_name}.{player_name}.{self.name}.{measure_name}"
    task_id = str(uuid4())  # we need to know the task_id a priori for score.task_map
    sig_opts = {
        "queue": self.name,
        "shadow": description,
        "task_id": task_id,
        "countdown": delay,
    }
    sig = signature(func, args=task_args, kwargs=task_kwargs, options=sig_opts)

    msg = (
        f"task signature created for {func}{task_args} with"
        f" kwargs:\n{task_kwargs}\nand options:\n{sig_opts}"
    )
    logger.log(5, gudlog(msg, score_name, player_name, measure_name, self.name))

    return sig
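# Illustrative sketch (an assumption, not original code): the Celery call flow
# for a signature built above. Applying the signature honors the options in
# sig_opts, so the task lands on this host's dedicated queue after `delay`
# seconds:
#
#   sig = host.get_task_signature("score1", "player1", "measure1",
#                                 "my_pkg.tasks.do_work", 30, "arg1")
#   async_result = sig.apply_async()   # standard Celery Signature API
#
# The score/player/measure names and "my_pkg.tasks.do_work" are hypothetical
# example values.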
def smart_queue(self, unqueued_measures: List["Measure"]) -> Optional["Measure"]:
    to_queue = None  # queue one at a time to ensure we catch any dependency failures
    dependency_failed = []
    for m in unqueued_measures:
        # no dependencies
        if not m.depends_on:
            to_queue = m
            break
        else:
            # make sure dependencies finished before queueing
            # also if any dependencies failed, dependent measures fail too
            ready_to_queue = True  # until/unless False below
            for dep_name in m.depends_on:
                depm = self.get_measure(dep_name)
                if not depm.finished:
                    ready_to_queue = False
                    m.state = TaskState.DEFERRED
                if depm.state == TaskState.FAILURE and not m.dependency_proof:
                    ready_to_queue = False
                    m.state = TaskState.FAILURE
                    m.finished = True
                    m.status["all"]["all"] = f"dependency failed ({dep_name})"
                    dependency_failed.append(m)
            if ready_to_queue:
                to_queue = m
                break

    if to_queue is not None:
        self.queue_measure(to_queue)
        if to_queue in unqueued_measures:
            unqueued_measures.remove(to_queue)

    for to_remove in dependency_failed:
        msg = (
            f"measure '{to_remove.name}' failed because one or more of its"
            " dependencies failed"
        )
        logger.warning(gudlog(msg, self))
        if to_remove in unqueued_measures:
            unqueued_measures.remove(to_remove)

    return to_queue
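# Worked example (illustrative; hypothetical measure names): given measures
# A, B (depends_on=["A"]), and C (depends_on=["A"]):
#   - first call: A has no dependencies, so it is queued; B and C are DEFERRED
#   - if A finishes with FAILURE: B and C (unless dependency_proof) are marked
#     FAILURE/finished and removed from the unqueued list
#   - if A succeeds: the next call queues B, and the call after that queues C
# Measures are queued one per call so a dependency failure is caught before
# any dependents are dispatched.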
def store_results(
    self, score: "Score", task_status: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    if not self.store_as:
        return None

    results = {}
    for player_name, tstat in task_status.items():
        results_p = {}
        for t_id, t in tstat["tasks"].items():
            t_result = None
            if "result" in t:
                t_result = t["result"]
            elif (
                self.store_interim_results
                and "meta" in t
                and t["meta"]
                and "interim_result" in t["meta"]
            ):
                t_result = t["meta"]["interim_result"]

            if t_result is not None:
                if self.store_singleton:
                    score.store_results(self, None, t_result)

                if t_id not in score.task_map:
                    msg = f"task {t_id} not found in task_map. This is probably bad."
                    logger.info(gudlog(msg, score, player_name, self))
                    results_p[t_id] = t_result
                else:
                    results_p[score.task_map[t_id]["host_name"]] = t_result

        results[player_name] = results_p
        if results_p and not self.store_singleton:
            score.store_results(self, player_name, results_p)

    return results
def evaluate_state(self) -> None:
    # make sure we are current on measure state
    for m in self.measures:
        if not m.finished:
            m.evaluate_state(self)

    not_success = 0
    unfinished = 0
    for m in self.measures:
        if not m.finished:
            unfinished += 1
        if m.state is not TaskState.SUCCESS:
            not_success += 1
        if task_state_priority(m.state) > task_state_priority(self.state):
            self.state = m.state
        if m.status:
            # replace task id with host name if known
            status_copy = copy.copy(m.status)
            for player_name, player_status in m.status.items():
                new_player_status = {}
                for task_id, status in player_status.items():
                    if task_id in self.task_map:
                        tmi = self.task_map[task_id]
                        new_player_status[tmi["host_name"]] = status
                    else:
                        new_player_status[task_id] = status
                status_copy[player_name] = new_player_status
            self.status[m.name] = status_copy

    if not_success == 0:
        self.state = TaskState.SUCCESS
    if unfinished == 0:
        self.finished = True
        self.finished_at = datetime.utcnow()
        msg = f"finished with state {self.state}"
        logger.info(gudlog(msg, self))
def validate_create_host_mappings(self) -> Tuple[bool, List[str]]:
    success = True
    err_msgs = []

    # map missing (to be created) hostnames to players
    if self.create_hosts:
        self.map_missing_hosts(get_host_names())

    for p in self.players.values():
        # skip local player
        if p.name == config.CONDUCTOR_ALLHOSTS_PLAYER_NAME:
            continue

        # validate hostnames length
        if p.hostnames == [] and not self.create_hosts:
            msg = f"{p.name}: no hosts mapped"
            if p.scale == 0 and config.ALLOW_EMPTY_PLAYER_HOSTS:
                logger.debug(gudlog(msg, self))
            else:
                success = False
                logger.warning(gudlog(msg, self))
                err_msgs.append(msg)
            continue
        elif len(p.hostnames) != p.scale and not self.create_hosts:
            success = False
            msg = f"{p.name}: length of hosts does not match scale ({p.scale})"
            logger.warning(gudlog(msg, self))
            err_msgs.append(msg)
            continue

        for host_name in p.hostnames:
            if host_name not in hosts:
                # create host obj
                host_dict = {
                    "hostname": host_name,
                    "image": p.image,
                }
                try:
                    msg = (
                        f"Temporarily creating Host object for {host_name} with"
                        f" image {p.image}"
                    )
                    logger.debug(gudlog(msg, self, p))
                    host_obj = HostSchema().load(host_dict)
                except MarshmallowValidationError:
                    success = False
                    msg = f"{host_name}: error creating Host object"
                    logger.warning(gudlog(msg, self, p))
                    err_msgs.append(msg)
                    continue
            elif hosts[host_name].get_image() not in [None, p.image]:
                success = False
                msg = f"{host_name}: Host object already exists with conflicting image"
                logger.warning(gudlog(msg, self, p))
                err_msgs.append(msg)
                continue
            elif hosts[host_name].tuning:
                success = False
                msg = f"host '{host_name}' is tuning or pending tuning; try again soon"
                logger.warning(gudlog(msg, self, p))
                err_msgs.append(msg)
                continue
            else:
                host_obj = hosts[host_name]

            # make sure we have externally-accessible Redis if needed
            if host_obj.control_method not in config.HOST_CONTROL_EXTERNAL_REDIS:
                success = False
                msg = (
                    f"{host_name}: control_method '{host_obj.control_method}'"
                    " is not properly registered in"
                    " config.HOST_CONTROL_EXTERNAL_REDIS"
                )
                logger.warning(gudlog(msg, self, p))
                err_msgs.append(msg)
                continue
            elif (
                not config.REDIS_HOST_EXTERNAL
                and config.HOST_CONTROL_EXTERNAL_REDIS[host_obj.control_method]
            ):
                success = False
                msg = (
                    f"host ('{host_name}') requires externally-accessible Redis,"
                    " but environment variable REDIS_HOST_EXTERNAL is not"
                    " specified -- it is usually easiest to include this in the"
                    " conductor's"
                    f" '{config.SRC_ROOT.joinpath(config.ENV_FILE)}'"
                )
                logger.warning(gudlog(msg, self, p))
                err_msgs.append(msg)
                continue

            # get HostControl object for this Host
            if host_obj.control_method.upper() == "DOCKER":
                host_control_class = DockerHostControl
            else:
                host_control_class, msg = get_host_control_class(
                    host_obj.control_method
                )
                if not host_control_class:
                    success = False
                    msg = f"{host_name}: {msg}"
                    logger.warning(gudlog(msg, self, p))
                    err_msgs.append(msg)
                    continue

            # check if the host was recently confirmed to be turned on
            host_recently_confirmed_on = False
            if host_obj.last_confirmed_on:
                check_age = int(
                    (datetime.utcnow() - host_obj.last_confirmed_on).total_seconds()
                )
                if check_age < config.HOST_CONFIRMED_ON_VALID_SECS:
                    host_recently_confirmed_on = True
                    msg = (
                        f"Host '{host_name}' with control_name"
                        f" '{host_obj.control_name}' confirmed to be on via"
                        f" {host_obj.control_method} {check_age}s ago; not checking"
                        " again"
                    )
                    logger.debug(msg)

            host_confirmed_on = False
            if not host_recently_confirmed_on:
                control_name = host_obj.control_name or host_obj.name
                host_confirmed_on = host_control_class.host_exists(control_name)
                if host_confirmed_on:
                    host_obj.last_confirmed_on = datetime.utcnow()
                    msg = (
                        f"Host '{host_name}' with control_name"
                        f" '{host_obj.control_name}' appears to exist via"
                        f" {host_obj.control_method}"
                    )
                    logger.debug(msg)

            if host_recently_confirmed_on or host_confirmed_on:
                if (
                    host_name not in hosts
                ):  # we may have created host_obj above and not yet in config.hosts
                    hosts[host_name] = host_obj
                    msg = f"Added new Host object for {host_name} with image {p.image}"
                    logger.info(gudlog(msg, self, p))
            elif not self.create_hosts:
                success = False
                msg = (
                    f"host '{host_name}' with control_name"
                    f" '{host_obj.control_name}' not found via (possibly default)"
                    f" control method '{host_obj.control_method}'"
                )
                logger.warning(gudlog(msg, self, p))
                err_msgs.append(msg)
                continue
            else:
                # mark hosts that need to be created
                hosts[host_name].pending_create = True
                msg = f"host {host_name} marked for creation"
                logger.debug(gudlog(msg, self, p))

    return success, err_msgs