def get_opening_orders(self):
    """ Computes the opening orders for every power in a brand new (standard) game.

        Each power is queried through the policy adapter with the default player_seed,
        noise, temperature, and dropout_rate. The adapter returns (orders, policy_details)
        tuples; only the orders are kept.

        :return: A dictionary of {power_name: [orders]} with the opening orders each power
                 would have submitted.
    """
    new_game = Game()
    state_proto = extract_state_proto(new_game)
    phase_history_proto = extract_phase_history_proto(new_game)
    possible_orders_proto = extract_possible_orders_proto(new_game)

    # Building one query per power - each response is a (orders, policy_details) tuple
    order_requests = []
    for power_name in new_game.powers:
        orderable_locs = self.get_orderable_locations(state_proto, power_name)
        order_requests.append(self.policy_adapter.get_orders(orderable_locs,
                                                             state_proto,
                                                             power_name,
                                                             phase_history_proto,
                                                             possible_orders_proto,
                                                             retry_on_failure=False))
    power_orders = yield order_requests

    # Keeping only the orders (index 0) and discarding the policy details
    opening_orders = {}
    for power_name, response in zip(new_game.powers.keys(), power_orders):
        opening_orders[power_name] = response[0]
    return opening_orders
def get_orders(self, game, power_names, *, retry_on_failure=True, **kwargs):
    """ Gets the orders the power(s) should play.

        :param game: The game object
        :param power_names: A list of power names we are playing, or alternatively a single power name.
        :param retry_on_failure: Boolean that indicates to retry querying from the model if an error is encountered.
        :param kwargs: Additional optional kwargs:
            - player_seed: If set. Override the player_seed to use for the model based player.
            - noise: If set. Override the noise to use for the model based player.
            - temperature: If set. Override the temperature to use for the model based player.
            - dropout_rate: If set. Override the dropout_rate to use for the model based player.
            - with_draw: If set, also returns whether to accept a draw or not
        :return: One of the following:
            1) If power_names is a string and with_draw == False (or is not set):
                - A list of orders the power should play
            2) If power_names is a list and with_draw == False (or is not set):
                - A list of list, which contains orders for each power
            3) If power_names is a string and with_draw == True:
                - A tuple of 1) the list of orders for the power, 2) a boolean to accept a draw or not
            4) If power_names is a list and with_draw == True:
                - A list of tuples, each tuple having the list of orders and the draw boolean
        :type game: diplomacy.Game
    """
    state_proto = extract_state_proto(game)
    phase_history_proto = extract_phase_history_proto(game)
    possible_orders_proto = extract_possible_orders_proto(game)

    # Remembering whether the caller passed a single power name, then always working with a list
    single_power = not isinstance(power_names, list)
    if single_power:
        power_names = [power_names]

    # Querying orders (and the optional draw boolean) for every requested power
    results = yield [self.get_orders_with_proto(state_proto,
                                                power_name,
                                                phase_history_proto,
                                                possible_orders_proto,
                                                retry_on_failure=retry_on_failure,
                                                **kwargs) for power_name in power_names]

    # Unwrapping the single-power case, otherwise returning the full list
    return results[0] if single_power else results
def test_get_feedable_item(self):
    """ Checks if the .get_feedable_item method works """
    game = Game()
    state_proto = extract_state_proto(game)
    phase_history_proto = extract_phase_history_proto(game)
    possible_orders_proto = extract_possible_orders_proto(game)
    orderable_locs = ['PAR', 'MAR', 'BUR']

    # The builder should return a non-empty (truthy) feedable item for FRANCE
    feedable_item = self.dataset_builder.get_feedable_item(orderable_locs,
                                                           state_proto,
                                                           'FRANCE',
                                                           phase_history_proto,
                                                           possible_orders_proto,
                                                           player_seed=0,
                                                           noise=0.,
                                                           temperature=0.,
                                                           dropout_rate=0.)
    assert feedable_item
def get_beam_orders(self, game, power_names, *, retry_on_failure=True, **kwargs):
    """ Finds all the beams (with their probabilities) returned by the diverse beam search
        for the selected power(s). Beams are ordered by score (highest first).

        :param game: The game object
        :param power_names: A list of power names we are playing, or alternatively a single power name.
        :param retry_on_failure: Boolean that indicates to retry querying from the model if an error is encountered.
        :param kwargs: Additional optional kwargs:
            - player_seed: The seed to apply to the player to compute a deterministic mask.
            - noise: The sigma of the additional noise to apply to the intermediate layers (i.e. sigma * epsilon)
            - temperature: The temperature to apply to the logits. (Default to 0. for deterministic/greedy)
            - dropout_rate: The amount of dropout to apply to the inputs/outputs of the decoder.
        :return: 1) If power_names is a string, a tuple of beam orders, and of beam probabilities
                 2) If power_names is a list, a list of list which contains beam orders and beam probabilities
        :type game: diplomacy.Game
    """
    state_proto = extract_state_proto(game)
    phase_history_proto = extract_phase_history_proto(game)
    possible_orders_proto = extract_possible_orders_proto(game)

    # Remembering whether the caller passed a single power name, then always working with a list
    single_power = not isinstance(power_names, list)
    if single_power:
        power_names = [power_names]

    # Querying beam orders for every requested power
    results = yield [self.get_beam_orders_with_proto(state_proto,
                                                     power_name,
                                                     phase_history_proto,
                                                     possible_orders_proto,
                                                     retry_on_failure=retry_on_failure,
                                                     **kwargs) for power_name in power_names]

    # Unwrapping the single-power case, otherwise returning the full list
    return results[0] if single_power else results
def get_policy_details(self, game, power_names, *, retry_on_failure=True, **kwargs):
    """ Gets the details of the current policy

        :param game: The game object
        :param power_names: A list of power names we are playing, or alternatively a single power name.
        :param retry_on_failure: Boolean that indicates to retry querying from the model if an error is encountered.
        :param kwargs: Additional optional kwargs:
            - player_seed: If set. Override the player_seed to use for the model based player.
            - noise: If set. Override the noise to use for the model based player.
            - temperature: If set. Override the temperature to use for the model based player.
            - dropout_rate: If set. Override the dropout_rate to use for the model based player.
        :return: 1) If power_names is a string, the policy details
                    ==> {'locs', 'tokens', 'log_probs', 'draw_action', 'draw_prob'}
                 2) If power_names is a list, a list of policy details, one for each power.
        :type game: diplomacy.Game
    """
    state_proto = extract_state_proto(game)
    phase_history_proto = extract_phase_history_proto(game)
    possible_orders_proto = extract_possible_orders_proto(game)

    # Remembering whether the caller passed a single power name, then always working with a list
    single_power = not isinstance(power_names, list)
    if single_power:
        power_names = [power_names]

    # Querying policy details for every requested power
    results = yield [self.get_policy_details_with_proto(state_proto,
                                                        power_name,
                                                        phase_history_proto,
                                                        possible_orders_proto,
                                                        retry_on_failure=retry_on_failure,
                                                        **kwargs) for power_name in power_names]

    # Unwrapping the single-power case, otherwise returning the full list
    return results[0] if single_power else results
def get_state_value(self, game, power_names, *, retry_on_failure=True, **kwargs):
    """ Calculates the player's value of the state of the game for the given power(s)

        :param game: A game object
        :param power_names: A list of power names for which we want the value, or alternatively a single power name.
        :param retry_on_failure: Boolean that indicates to retry querying from the model if an error is encountered.
        :param kwargs: Additional optional kwargs:
            - player_seed: If set. Override the player_seed to use for the model based player.
            - noise: If set. Override the noise to use for the model based player.
            - temperature: If set. Override the temperature to use for the model based player.
            - dropout_rate: If set. Override the dropout_rate to use for the model based player.
        :return: 1) If power_names is a string, a single float representing the value of the state for the power
                 2) If power_names is a list, a list of floats representing the value for each power.
        :type game: diplomacy.Game
    """
    state_proto = extract_state_proto(game)
    phase_history_proto = extract_phase_history_proto(game)
    possible_orders_proto = extract_possible_orders_proto(game)

    # Remembering whether the caller passed a single power name, then always working with a list
    single_power = not isinstance(power_names, list)
    if single_power:
        power_names = [power_names]

    # Querying the state value for every requested power
    results = yield [self.get_state_value_with_proto(state_proto,
                                                     power_name,
                                                     phase_history_proto,
                                                     possible_orders_proto,
                                                     retry_on_failure=retry_on_failure,
                                                     **kwargs) for power_name in power_names]

    # Unwrapping the single-power case, otherwise returning the full list
    return results[0] if single_power else results
def test_get_draw_prob(self):
    """ Checks if the .get_draw_prob method works """
    game = Game()
    state_proto = extract_state_proto(game)
    phase_history_proto = extract_phase_history_proto(game)
    possible_orders_proto = extract_possible_orders_proto(game)
    orderable_locs = ['PAR', 'MAR', 'BUR']
    shared_kwargs = {'player_seed': 0, 'noise': 0., 'temperature': 1., 'dropout_rate': 0.}

    # Temperature == 1.
    # Running once without prefetching, then once with prefetching
    for use_prefetching in (False, True):
        if use_prefetching:
            fetches = yield self.adapter.get_orders(orderable_locs,
                                                    state_proto,
                                                    'FRANCE',
                                                    phase_history_proto,
                                                    possible_orders_proto,
                                                    prefetch=True,
                                                    **shared_kwargs)
            fetches = yield process_fetches_dict(self.queue_dataset, fetches)
            _, policy_details = yield self.adapter.get_orders(orderable_locs,
                                                              state_proto,
                                                              'FRANCE',
                                                              phase_history_proto,
                                                              possible_orders_proto,
                                                              fetches=fetches,
                                                              **shared_kwargs)
        else:
            _, policy_details = yield self.adapter.get_orders(orderable_locs,
                                                              state_proto,
                                                              'FRANCE',
                                                              phase_history_proto,
                                                              possible_orders_proto,
                                                              **shared_kwargs)

        # The draw action must be a boolean, and the draw probability strictly between 0 and 1
        assert policy_details['draw_action'] in (True, False)
        assert 0. < policy_details['draw_prob'] < 1.
def get_opening_orders(self):
    """ Computes the opening orders for every power in a brand new (standard) game.

        No kwargs are passed, so the default player_seed, noise, temperature, and
        dropout_rate are used.

        :return: A dictionary of {power_name: [orders]} with the opening orders each power
                 would have submitted.
    """
    std_game = Game()
    state_proto = extract_state_proto(std_game)
    phase_history_proto = extract_phase_history_proto(std_game)
    possible_orders_proto = extract_possible_orders_proto(std_game)

    # Building one query per power
    order_requests = []
    for power_name in std_game.powers:
        order_requests.append(self.get_orders_with_proto(state_proto,
                                                         power_name,
                                                         phase_history_proto,
                                                         possible_orders_proto,
                                                         retry_on_failure=False))
    power_orders = yield order_requests

    # Pairing each power with its orders (same iteration order as the queries above)
    return dict(zip(std_game.powers.keys(), power_orders))
def generate_trajectory(players, reward_fn, advantage_fn, env_constructor=None, hparams=None, power_assignments=None,
                        set_player_seed=None, initial_state_bytes=None, update_interval=0, update_queue=None,
                        output_format='proto'):
    """ Generates a single trajectory (Saved Gamed Proto) for RL (self-play) with the power assigments
        :param players: A list of instantiated players
        :param reward_fn: The reward function to use to calculate rewards
        :param advantage_fn: An instance of `.models.self_play.advantages`
        :param env_constructor: A callable to get the OpenAI gym environment (args: players)
        :param hparams: A dictionary of hyper parameters with their values
        :param power_assignments: Optional. The power name we want to play as. (e.g. 'FRANCE') or a list of powers.
        :param set_player_seed: Boolean that indicates that we want to set the player seed on reset().
        :param initial_state_bytes: A `game.State` proto (in bytes format) representing the initial state of the game.
        :param update_interval: Optional. If set, a partial saved game is put in the update_queue this every seconds.
        :param update_queue: Optional. If update interval is set, partial games will be put in this queue
        :param output_format: The output format. One of 'proto', 'bytes', 'zlib'
        :return: A SavedGameProto representing the game played (with policy details and power assignments)
                 Depending on format, the output might be converted to a byte array, or a compressed byte array.
        :type players: List[diplomacy_research.players.player.Player]
        :type reward_fn: diplomacy_research.models.self_play.reward_functions.AbstractRewardFunction
        :type advantage_fn: diplomacy_research.models.self_play.advantages.base_advantage.BaseAdvantage
        :type update_queue: multiprocessing.Queue
    """
    # pylint: disable=too-many-arguments
    assert output_format in ['proto', 'bytes', 'zlib'], 'Format should be "proto", "bytes", "zlib"'
    assert len(players) == NB_POWERS

    # Making sure we use the SavedGame wrapper to record the game
    if env_constructor:
        env = env_constructor(players)
    else:
        env = default_env_constructor(players, hparams, power_assignments, set_player_seed, initial_state_bytes)
    # Walking the wrapper chain looking for a SaveGame wrapper; the while-else fires
    # only when the loop ends WITHOUT a break (i.e. no SaveGame found), so a SaveGame
    # wrapper is added in that case.
    wrapped_env = env
    while not isinstance(wrapped_env, DiplomacyEnv):
        if isinstance(wrapped_env, SaveGame):
            break
        wrapped_env = wrapped_env.env
    else:
        env = SaveGame(env)

    # Detecting if we have a Auto-Draw wrapper (same wrapper-chain walk as above)
    has_auto_draw = False
    wrapped_env = env
    while not isinstance(wrapped_env, DiplomacyEnv):
        if isinstance(wrapped_env, AutoDraw):
            has_auto_draw = True
            break
        wrapped_env = wrapped_env.env

    # Resetting env
    env.reset()

    # Timing vars for partial updates
    time_last_update = time.time()
    year_last_update = 0
    start_phase_ix = 0
    current_phase_ix = 0
    nb_transitions = 0

    # Cache Variables
    powers = sorted([power_name for power_name in get_map_powers(env.game.map)])
    assigned_powers = env.get_all_powers_name()
    stored_board_state = OrderedDict()                    # {phase_name: board_state}
    stored_prev_orders_state = OrderedDict()              # {phase_name: prev_orders_state}
    stored_possible_orders = OrderedDict()                # {phase_name: possible_orders}

    # Per-power accumulators, filled one entry per processed phase
    power_variables = {power_name: {'orders': [],
                                    'policy_details': [],
                                    'state_values': [],
                                    'rewards': [],
                                    'returns': [],
                                    'last_state_value': 0.} for power_name in powers}

    new_state_proto = None
    phase_history_proto = []
    map_object = Map(name=env.game.map.name)

    # Generating - one iteration of this loop processes one game phase
    while not env.is_done:
        # Reusing the state extracted at the end of the previous iteration when available
        state_proto = new_state_proto if new_state_proto is not None else extract_state_proto(env.game)
        possible_orders_proto = extract_possible_orders_proto(env.game)

        # Computing board_state
        board_state = proto_to_board_state(state_proto, map_object).flatten().tolist()
        state_proto.board_state.extend(board_state)

        # Storing possible orders for this phase
        current_phase = env.game.get_current_phase()
        stored_board_state[current_phase] = board_state
        stored_possible_orders[current_phase] = possible_orders_proto

        # Getting orders, policy details, and state value (one query per assigned power)
        tasks = [(player, state_proto, pow_name, phase_history_proto[-NB_PREV_ORDERS_HISTORY:],
                  possible_orders_proto) for player, pow_name in zip(env.players, assigned_powers)]
        step_args = yield [get_step_args(*args) for args in tasks]

        # Stepping through env, storing power variables
        for power_name, (orders, policy_details, state_value) in zip(assigned_powers, step_args):
            if orders:
                env.step((power_name, orders))
                nb_transitions += 1
            if has_auto_draw and policy_details is not None:
                env.set_draw_prob(power_name, policy_details['draw_prob'])

        # Processing
        env.process()
        current_phase_ix += 1

        # Retrieving draw action and saving power variables
        # (draw actions are only available after env.process(), hence the second loop)
        for power_name, (orders, policy_details, state_value) in zip(assigned_powers, step_args):
            if has_auto_draw and policy_details is not None:
                policy_details['draw_action'] = env.get_draw_actions()[power_name]
            power_variables[power_name]['orders'] += [orders]
            power_variables[power_name]['policy_details'] += [policy_details]
            power_variables[power_name]['state_values'] += [state_value]

        # Getting new state
        new_state_proto = extract_state_proto(env.game)

        # Storing reward for this transition
        done_reason = DoneReason(env.done_reason) if env.done_reason else None
        for power_name in powers:
            power_variables[power_name]['rewards'] += [reward_fn.get_reward(prev_state_proto=state_proto,
                                                                            state_proto=new_state_proto,
                                                                            power_name=power_name,
                                                                            is_terminal_state=done_reason is not None,
                                                                            done_reason=done_reason)]

        # Computing prev_orders_state for the previous state (movement phases only, name ends in 'M')
        last_phase_proto = extract_phase_history_proto(env.game, nb_previous_phases=1)[-1]
        if last_phase_proto.name[-1] == 'M':
            prev_orders_state = proto_to_prev_orders_state(last_phase_proto, map_object).flatten().tolist()
            stored_prev_orders_state[last_phase_proto.name] = prev_orders_state
            last_phase_proto.prev_orders_state.extend(prev_orders_state)
        phase_history_proto += [last_phase_proto]

        # Sending partial game if:
        # 1) We have update_interval > 0 with an update queue, and
        # 2a) The game is completed, or 2b) the update time has elapsed and at least 5 years have passed
        has_update_interval = update_interval > 0 and update_queue is not None
        game_is_completed = env.is_done
        min_time_has_passed = time.time() - time_last_update > update_interval
        current_year = 9999 if env.game.get_current_phase() == 'COMPLETED' else int(env.game.get_current_phase()[1:5])
        min_years_have_passed = current_year - year_last_update >= 5

        if (has_update_interval and (game_is_completed or (min_time_has_passed and min_years_have_passed))):
            # Game is completed - last state value is 0
            if game_is_completed:
                for power_name in powers:
                    power_variables[power_name]['last_state_value'] = 0.
            # Otherwise - Querying the model for the value of the last state
            else:
                tasks = [(player, new_state_proto, pow_name, phase_history_proto[-NB_PREV_ORDERS_HISTORY:],
                          possible_orders_proto) for player, pow_name in zip(env.players, assigned_powers)]
                last_state_values = yield [get_state_value(*args) for args in tasks]
                for power_name, last_state_value in zip(assigned_powers, last_state_values):
                    power_variables[power_name]['last_state_value'] = last_state_value

            # Getting partial game and sending it on the update_queue
            saved_game_proto = get_saved_game_proto(env=env,
                                                    players=players,
                                                    stored_board_state=stored_board_state,
                                                    stored_prev_orders_state=stored_prev_orders_state,
                                                    stored_possible_orders=stored_possible_orders,
                                                    power_variables=power_variables,
                                                    start_phase_ix=start_phase_ix,
                                                    reward_fn=reward_fn,
                                                    advantage_fn=advantage_fn,
                                                    is_partial_game=True)
            update_queue.put_nowait((False, nb_transitions, proto_to_bytes(saved_game_proto)))

            # Updating stats
            # NOTE(review): time_last_update is never reset here, so after the first interval
            # elapses min_time_has_passed stays True and only the 5-year check throttles
            # further partial updates - confirm whether this is intentional.
            start_phase_ix = current_phase_ix
            nb_transitions = 0
            if not env.is_done:
                year_last_update = int(env.game.get_current_phase()[1:5])

    # Since the environment is done (Completed game) - We can leave the last_state_value at 0.
    for power_name in powers:
        power_variables[power_name]['last_state_value'] = 0.

    # Getting completed game
    saved_game_proto = get_saved_game_proto(env=env,
                                            players=players,
                                            stored_board_state=stored_board_state,
                                            stored_prev_orders_state=stored_prev_orders_state,
                                            stored_possible_orders=stored_possible_orders,
                                            power_variables=power_variables,
                                            start_phase_ix=0,
                                            reward_fn=reward_fn,
                                            advantage_fn=advantage_fn,
                                            is_partial_game=False)

    # Converting to correct format ('proto' passes the object through unchanged)
    output = {'proto': lambda proto: proto,
              'zlib': proto_to_zlib,
              'bytes': proto_to_bytes}[output_format](saved_game_proto)

    # Returning
    return output