def _list_and_watch_ensembles(tr : fdb.Transaction, dir, changes):
    """
    Read every key under ``dir`` and set a watch on ``changes``.

    Each key in ``dir.range()`` is unpacked with ``dir.unpack``; the unpacked
    tuple must contain exactly one element, which is collected in key order.

    Returns a 2-tuple ``(ensembles, watch)`` where ``ensembles`` is the list of
    unpacked elements and ``watch`` is whatever ``tr.watch(changes.key())``
    returns.
    """
    # The inner single-item loop destructures the 1-tuple strictly, so a key
    # that unpacks to any other length still raises ValueError.
    found = [
        ensemble
        for key, _value in tr[dir.range()]
        for (ensemble,) in (dir.unpack(key),)
    ]
    watch = tr.watch(changes.key())
    return found, watch
def should_run_ensemble(tr: fdb.Transaction, ensemble_id: str) -> bool:
    """
    Return True if this agent should start a run for this ensemble.

    The policy tries not to overshoot max_runs by too much, but also accounts
    for the possibility that agents might die.

    Decision steps:
      * If the ensemble's "max_runs" property is 0 or missing (run forever),
        or fewer than "max_runs" runs have been "started", return True.
      * Otherwise look at the seed/heartbeat pairs for the ensemble and find
        the one with the oldest heartbeat. If there are none, return True.
      * If the oldest heartbeat is more than 10 old (seconds, as measured by
        ``time.time()`` -- presumably heartbeats are epoch timestamps; confirm
        against the writer), the entries for that seed are cleared in this
        transaction so the caller can take the run over, and True is returned.
      * Otherwise return False.
    """
    props = _get_ensemble_properties(tr, ensemble_id)
    started = props.get("started", 0)
    max_runs = props.get("max_runs", 0)

    # max_runs == 0 means run forever; below the limit there is nothing to check.
    if not (max_runs > 0 and started >= max_runs):
        return True

    now = time.time()
    stalest_seed = None
    stalest_age = None
    for seed, heartbeat in _get_seeds_and_heartbeats(ensemble_id, tr):
        assert type(seed) is int
        age = now - heartbeat
        # Strict comparison: on ties the first seed seen is kept.
        if stalest_seed is None or age > stalest_age:
            stalest_seed, stalest_age = seed, age

    if stalest_age is None:
        # No other agents are running a test for this ensemble (is this possible?)
        return True

    if stalest_age > 10:
        print(
            "Agent {} presumed dead. Attempting to steal its work.".format(
                _get_hostname(ensemble_id, stalest_seed, tr)))
        incomplete = dir_ensemble_incomplete[ensemble_id]
        heartbeat_key = incomplete["heartbeat"][stalest_seed]
        # Only one agent should win the takeover of the dead agent's run/seed.
        # Under snapshot isolation any number of agents could steal it, so
        # register an explicit read conflict on the heartbeat key.
        tr.add_read_conflict_key(heartbeat_key)
        # Drop the seed's own key, everything beneath it, and its heartbeat.
        del tr[incomplete[stalest_seed]]
        del tr[incomplete[stalest_seed].range()]
        del tr[heartbeat_key]
        return True

    return False
def _add(tr : fdb.Transaction, ensemble_id : str, counter : str, value : int) -> None:
    """
    Add ``value`` to the named counter of an ensemble.

    ``value`` is encoded as an unsigned 64-bit little-endian integer
    (``struct`` format ``"<Q"``), so a negative or oversized value raises
    ``struct.error``. The encoded operand is applied with the transaction's
    ``add`` operation to the key
    ``dir_all_ensembles[ensemble_id]['count'][counter]``.
    """
    operand = struct.pack("<Q", value)
    counter_key = dir_all_ensembles[ensemble_id]['count'][counter]
    tr.add(counter_key, operand)
def _increment(tr : fdb.Transaction, ensemble_id : str, counter : str) -> None:
    """
    Apply the module-level ``ONE`` operand to an ensemble counter.

    Uses the transaction's ``add`` operation on the key
    ``dir_all_ensembles[ensemble_id]['count'][counter]``.
    """
    counter_key = dir_all_ensembles[ensemble_id]['count'][counter]
    tr.add(counter_key, ONE)