Example #1
def test_proto_6():
    """ Tests the make_tensor_proto and make_nd_array function """
    from diplomacy_research.utils.tensorflow import tf
    tensor_1 = make_tensor_proto(0, dtype=np.float32, shape=[1, 0])
    tensor_2 = tf.make_tensor_proto(0, dtype=tf.float32, shape=[1, 0])
    array_1 = tf.make_ndarray(tensor_1)
    array_2 = make_ndarray(tensor_2)
    assert proto_to_bytes(tensor_1) == proto_to_bytes(tensor_2)
    assert array_1.tostring() == array_2.tostring()
    assert array_1.dtype == array_2.dtype
Example #2
def test_proto_7():
    """ Tests the make_tensor_proto and make_nd_array function """
    from diplomacy_research.utils.tensorflow import tf
    random_tensor = np.random.rand(15, 25)
    tensor_1 = make_tensor_proto(random_tensor, dtype=np.float32)
    tensor_2 = tf.make_tensor_proto(random_tensor, dtype=tf.float32)
    array_1 = tf.make_ndarray(tensor_1)
    array_2 = make_ndarray(tensor_2)
    assert proto_to_bytes(tensor_1) == proto_to_bytes(tensor_2)
    assert array_1.tostring() == array_2.tostring()
    assert array_1.dtype == array_2.dtype
Example #3
def test_proto_2():
    """ Tests the make_tensor_proto and make_nd_array function """
    from diplomacy_research.utils.tensorflow import tf
    tensor_1 = make_tensor_proto([bytes('', 'utf-8')],
                                 dtype=np.object,
                                 shape=[1])
    tensor_2 = tf.make_tensor_proto([bytes('', 'utf-8')],
                                    dtype=tf.string,
                                    shape=[1])
    array_1 = tf.make_ndarray(tensor_1)
    array_2 = make_ndarray(tensor_2)
    assert proto_to_bytes(tensor_1) == proto_to_bytes(tensor_2)
    assert array_1.tostring() == array_2.tostring()
    assert array_1.dtype == array_2.dtype
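Note: the three tests above all follow the same pattern: build the same tensor through the project's make_tensor_proto and through tf.make_tensor_proto, then check that the serialized protos and the decoded arrays are byte-for-byte identical. A minimal sketch of that pattern as a reusable helper is shown below; the helper name check_proto_equivalence is hypothetical, and make_tensor_proto, make_ndarray and proto_to_bytes are assumed to be the same diplomacy_research utilities imported by the tests above.

def check_proto_equivalence(values, np_dtype, tf_dtype, shape=None):
    """ Hypothetical helper: asserts the custom proto utilities match TensorFlow's. """
    from diplomacy_research.utils.tensorflow import tf
    tensor_1 = make_tensor_proto(values, dtype=np_dtype, shape=shape)       # Custom implementation
    tensor_2 = tf.make_tensor_proto(values, dtype=tf_dtype, shape=shape)    # TensorFlow reference
    array_1 = tf.make_ndarray(tensor_1)
    array_2 = make_ndarray(tensor_2)
    assert proto_to_bytes(tensor_1) == proto_to_bytes(tensor_2)             # Same serialized bytes
    assert array_1.tostring() == array_2.tostring()                         # Same decoded contents
    assert array_1.dtype == array_2.dtype                                   # Same dtype

Usage (sketch): check_proto_equivalence(np.random.rand(15, 25), np.float32, tf.float32).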
Example #4
def generate_trajectory(players,
                        reward_fn,
                        advantage_fn,
                        env_constructor=None,
                        hparams=None,
                        power_assignments=None,
                        set_player_seed=None,
                        initial_state_bytes=None,
                        update_interval=0,
                        update_queue=None,
                        output_format='proto'):
    """ Generates a single trajectory (Saved Gamed Proto) for RL (self-play) with the power assigments
        :param players: A list of instantiated players
        :param reward_fn: The reward function to use to calculate rewards
        :param advantage_fn: An instance of `.models.self_play.advantages`
        :param env_constructor: A callable to get the OpenAI gym environment (args: players)
        :param hparams: A dictionary of hyper parameters with their values
        :param power_assignments: Optional. The power name we want to play as (e.g. 'FRANCE'), or a list of powers.
        :param set_player_seed: Boolean that indicates that we want to set the player seed on reset().
        :param initial_state_bytes: A `game.State` proto (in bytes format) representing the initial state of the game.
        :param update_interval: Optional. If set, a partial saved game is put in the update_queue every `update_interval` seconds.
        :param update_queue: Optional. If update_interval is set, partial games will be put in this queue.
        :param output_format: The output format. One of 'proto', 'bytes', 'zlib'
        :return: A SavedGameProto representing the game played (with policy details and power assignments)
                 Depending on format, the output might be converted to a byte array, or a compressed byte array.
        :type players: List[diplomacy_research.players.player.Player]
        :type reward_fn: diplomacy_research.models.self_play.reward_functions.AbstractRewardFunction
        :type advantage_fn: diplomacy_research.models.self_play.advantages.base_advantage.BaseAdvantage
        :type update_queue: multiprocessing.Queue
    """
    # pylint: disable=too-many-arguments
    assert output_format in ['proto', 'bytes', 'zlib'], 'Format should be "proto", "bytes", "zlib"'
    assert len(players) == NB_POWERS

    # Making sure we use the SavedGame wrapper to record the game
    if env_constructor:
        env = env_constructor(players)
    else:
        env = default_env_constructor(players, hparams, power_assignments,
                                      set_player_seed, initial_state_bytes)
    wrapped_env = env
    while not isinstance(wrapped_env, DiplomacyEnv):
        if isinstance(wrapped_env, SaveGame):
            break
        wrapped_env = wrapped_env.env
    else:
        env = SaveGame(env)

    # Detecting if we have an Auto-Draw wrapper
    has_auto_draw = False
    wrapped_env = env
    while not isinstance(wrapped_env, DiplomacyEnv):
        if isinstance(wrapped_env, AutoDraw):
            has_auto_draw = True
            break
        wrapped_env = wrapped_env.env

    # Resetting env
    env.reset()

    # Timing vars for partial updates
    time_last_update = time.time()
    year_last_update = 0
    start_phase_ix = 0
    current_phase_ix = 0
    nb_transitions = 0

    # Cache Variables
    powers = sorted(get_map_powers(env.game.map))
    assigned_powers = env.get_all_powers_name()
    stored_board_state = OrderedDict()  # {phase_name: board_state}
    stored_prev_orders_state = OrderedDict()  # {phase_name: prev_orders_state}
    stored_possible_orders = OrderedDict()  # {phase_name: possible_orders}

    power_variables = {
        power_name: {
            'orders': [],
            'policy_details': [],
            'state_values': [],
            'rewards': [],
            'returns': [],
            'last_state_value': 0.
        }
        for power_name in powers
    }

    new_state_proto = None
    phase_history_proto = []
    map_object = Map(name=env.game.map.name)

    # Generating
    while not env.is_done:
        state_proto = (new_state_proto if new_state_proto is not None
                       else extract_state_proto(env.game))
        possible_orders_proto = extract_possible_orders_proto(env.game)

        # Computing board_state
        board_state = proto_to_board_state(state_proto,
                                           map_object).flatten().tolist()
        state_proto.board_state.extend(board_state)

        # Storing possible orders for this phase
        current_phase = env.game.get_current_phase()
        stored_board_state[current_phase] = board_state
        stored_possible_orders[current_phase] = possible_orders_proto

        # Getting orders, policy details, and state value
        tasks = [(player, state_proto, pow_name,
                  phase_history_proto[-NB_PREV_ORDERS_HISTORY:],
                  possible_orders_proto)
                 for player, pow_name in zip(env.players, assigned_powers)]
        step_args = yield [get_step_args(*args) for args in tasks]

        # Stepping through env, storing power variables
        for power_name, (orders, policy_details,
                         state_value) in zip(assigned_powers, step_args):
            if orders:
                env.step((power_name, orders))
                nb_transitions += 1
            if has_auto_draw and policy_details is not None:
                env.set_draw_prob(power_name, policy_details['draw_prob'])

        # Processing
        env.process()
        current_phase_ix += 1

        # Retrieving draw action and saving power variables
        for power_name, (orders, policy_details,
                         state_value) in zip(assigned_powers, step_args):
            if has_auto_draw and policy_details is not None:
                policy_details['draw_action'] = env.get_draw_actions()[power_name]
            power_variables[power_name]['orders'] += [orders]
            power_variables[power_name]['policy_details'] += [policy_details]
            power_variables[power_name]['state_values'] += [state_value]

        # Getting new state
        new_state_proto = extract_state_proto(env.game)

        # Storing reward for this transition
        done_reason = DoneReason(env.done_reason) if env.done_reason else None
        for power_name in powers:
            power_variables[power_name]['rewards'] += [
                reward_fn.get_reward(prev_state_proto=state_proto,
                                     state_proto=new_state_proto,
                                     power_name=power_name,
                                     is_terminal_state=done_reason is not None,
                                     done_reason=done_reason)
            ]

        # Computing prev_orders_state for the previous state
        last_phase_proto = extract_phase_history_proto(
            env.game, nb_previous_phases=1)[-1]
        if last_phase_proto.name[-1] == 'M':
            prev_orders_state = proto_to_prev_orders_state(
                last_phase_proto, map_object).flatten().tolist()
            stored_prev_orders_state[last_phase_proto.name] = prev_orders_state
            last_phase_proto.prev_orders_state.extend(prev_orders_state)
            phase_history_proto += [last_phase_proto]

        # Sending partial game if:
        # 1) We have update_interval > 0 with an update queue, and
        # 2a) The game is completed, or 2b) the update interval has elapsed and at least 5 years have passed
        has_update_interval = update_interval > 0 and update_queue is not None
        game_is_completed = env.is_done
        min_time_has_passed = time.time() - time_last_update > update_interval
        current_year = (9999 if env.game.get_current_phase() == 'COMPLETED'
                        else int(env.game.get_current_phase()[1:5]))
        min_years_have_passed = current_year - year_last_update >= 5

        if (has_update_interval
                and (game_is_completed or
                     (min_time_has_passed and min_years_have_passed))):

            # Game is completed - last state value is 0
            if game_is_completed:
                for power_name in powers:
                    power_variables[power_name]['last_state_value'] = 0.

            # Otherwise - Querying the model for the value of the last state
            else:
                tasks = [
                    (player, new_state_proto, pow_name,
                     phase_history_proto[-NB_PREV_ORDERS_HISTORY:],
                     possible_orders_proto)
                    for player, pow_name in zip(env.players, assigned_powers)
                ]
                last_state_values = yield [
                    get_state_value(*args) for args in tasks
                ]

                for power_name, last_state_value in zip(assigned_powers, last_state_values):
                    power_variables[power_name]['last_state_value'] = last_state_value

            # Getting partial game and sending it on the update_queue
            saved_game_proto = get_saved_game_proto(
                env=env,
                players=players,
                stored_board_state=stored_board_state,
                stored_prev_orders_state=stored_prev_orders_state,
                stored_possible_orders=stored_possible_orders,
                power_variables=power_variables,
                start_phase_ix=start_phase_ix,
                reward_fn=reward_fn,
                advantage_fn=advantage_fn,
                is_partial_game=True)
            update_queue.put_nowait(
                (False, nb_transitions, proto_to_bytes(saved_game_proto)))

            # Updating stats
            start_phase_ix = current_phase_ix
            nb_transitions = 0
            if not env.is_done:
                year_last_update = int(env.game.get_current_phase()[1:5])

    # Since the environment is done (Completed game) - We can leave the last_state_value at 0.
    for power_name in powers:
        power_variables[power_name]['last_state_value'] = 0.

    # Getting completed game
    saved_game_proto = get_saved_game_proto(
        env=env,
        players=players,
        stored_board_state=stored_board_state,
        stored_prev_orders_state=stored_prev_orders_state,
        stored_possible_orders=stored_possible_orders,
        power_variables=power_variables,
        start_phase_ix=0,
        reward_fn=reward_fn,
        advantage_fn=advantage_fn,
        is_partial_game=False)

    # Converting to correct format
    output = {
        'proto': lambda proto: proto,
        'zlib': proto_to_zlib,
        'bytes': proto_to_bytes
    }[output_format](saved_game_proto)

    # Returning
    return output
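generate_trajectory is written as a generator: at every phase it yields a list of pending tasks (one get_step_args call per assigned power, or one get_state_value call when querying the value of the last state) and expects the caller to send the resolved results back before it continues; the finished game is returned through StopIteration. A minimal driver sketch under that assumption is shown below; run_trajectory and resolve() are hypothetical placeholders for however the caller evaluates the yielded tasks (e.g. by awaiting futures).

def run_trajectory(players, reward_fn, advantage_fn, **kwargs):
    """ Hypothetical driver: steps a generate_trajectory generator to completion. """
    trajectory = generate_trajectory(players, reward_fn, advantage_fn, **kwargs)
    try:
        pending_tasks = next(trajectory)                           # Prime the generator
        while True:
            results = [resolve(task) for task in pending_tasks]    # resolve() is a placeholder
            pending_tasks = trajectory.send(results)               # Send step args / state values back
    except StopIteration as done:
        return done.value                                          # SavedGameProto / bytes / zlib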
Example #5
def test_to_from_bytes():
    """ Tests proto_to_bytes and bytes_to_proto """
    message_proto = _get_message()
    message_bytes = proto_to_bytes(message_proto)
    new_message_proto = bytes_to_proto(message_bytes, Message)
    _compare_messages(message_proto, new_message_proto)
Example #6
def start_game_generator(adapter_ctor, dataset_builder_ctor, reward_fn,
                         advantage_fn, hparams, cluster_config, process_pool,
                         games_queue, transitions_queue):
    """ Start the game generator (to generate an infinite number of training games)
        :param adapter_ctor: The constructor to build the adapter to query orders, values and policy details
        :param dataset_builder_ctor: The constructor of `BaseBuilder` to set the required proto fields
        :param reward_fn: The reward function to use (instance of `.models.self_play.reward_functions`).
        :param advantage_fn: An instance of `.models.self_play.advantages`
        :param hparams: A dictionary of hyper-parameters
        :param cluster_config: The cluster configuration to use for distributed training
        :param process_pool: Optional. A ProcessPoolExecutor that was forked before TF and gRPC were loaded.
        :param games_queue: Queue to be used by processes to send games to the aggregator.
        :param transitions_queue: Inbound queue to receive the number of transitions and version updates.
        :return: Nothing
        :type adapter_ctor: diplomacy_research.models.policy.base_policy_adapter.BasePolicyAdapter.__class__
        :type dataset_builder_ctor: diplomacy_research.models.datasets.base_builder.BaseBuilder.__class__
        :type reward_fn: diplomacy_research.models.self_play.reward_functions.AbstractRewardFunction
        :type advantage_fn: diplomacy_research.models.self_play.advantages.base_advantage.BaseAdvantage
        :type cluster_config: diplomacy_research.utils.cluster.ClusterConfig
        :type process_pool: diplomacy_research.utils.executor.ProcessPoolExecutor
        :type games_queue: multiprocessing.Queue
        :type transitions_queue: multiprocessing.Queue
    """
    # pylint: disable=too-many-arguments
    memory_buffer = MemoryBuffer(cluster_config, hparams)
    nb_cores = multiprocessing.cpu_count()
    futures = []

    nb_pending_transitions = 0  # For throttling if there are enough transitions for the learner
    nb_rl_agents = get_nb_rl_agents(hparams['mode'])

    # 1) Finding the right function to create player
    get_players_callable = {
        'supervised': get_train_supervised_players,
        'self-play': get_train_self_play_players,
        'staggered': get_train_staggered_players
    }[hparams['mode']]

    # Generating an infinite number of games
    while True:
        nb_new_transitions = 0

        # 1) Detecting the number of pending transitions to throttle if necessary
        while not transitions_queue.empty():
            item = transitions_queue.get()
            if item == NEW_VERSION:
                nb_pending_transitions = 0
                nb_cores = multiprocessing.cpu_count()
            else:
                nb_pending_transitions += nb_rl_agents * item / NB_POWERS
                nb_new_transitions += nb_rl_agents * item / NB_POWERS

        futures = [fut for fut in futures if not fut.done()]
        nb_games_being_generated = len(futures)

        # Finding the number of games to generate
        nb_new_games = nb_cores - nb_games_being_generated
        if nb_new_games <= 0:
            continue

        # 2) Generating the get_player_kwargs
        get_players_kwargs = [{
            'adapter_ctor': adapter_ctor,
            'dataset_builder_ctor': dataset_builder_ctor,
            'tf_serving_port': get_tf_serving_port(cluster_config, serving_id=0),
            'cluster_config': cluster_config,
            'hparams': hparams
        }] * nb_new_games

        # 3) Generating gen_trajectory_kwargs
        gen_trajectory_kwargs = []
        for _ in range(nb_new_games):
            gen_trajectory_kwargs += [{
                'hparams': hparams,
                'reward_fn': reward_fn,
                'advantage_fn': advantage_fn,
                'power_assignments': hparams.get('power', '') or random.choice(ALL_POWERS),
                'set_player_seed': bool(hparams['dropout_rate']),
                'update_interval': hparams['update_interval'],
                'update_queue': games_queue
            }]

        # 4) Adding initial states if required
        if hparams['start_strategy'] == 'uniform':
            initial_states_proto = get_uniform_initial_states(memory_buffer, nb_new_games)
            for game_ix, initial_state_proto in enumerate(initial_states_proto):
                gen_trajectory_kwargs[game_ix]['initial_state_bytes'] = \
                    proto_to_bytes(initial_state_proto)

        elif hparams['start_strategy'] == 'backplay':
            winning_power_names = []
            for kwargs in gen_trajectory_kwargs:
                if isinstance(kwargs['power_assignments'], list):
                    winning_power_names += [kwargs['power_assignments'][0]]
                else:
                    winning_power_names += [kwargs['power_assignments']]
            version_id = general_ops.get_version_id(memory_buffer)
            initial_states_proto = get_backplay_initial_states(memory_buffer, winning_power_names, version_id)
            for game_ix, initial_state_proto in enumerate(initial_states_proto):
                gen_trajectory_kwargs[game_ix]['initial_state_bytes'] = \
                    proto_to_bytes(initial_state_proto)

        # 6) Launching jobs using current pool
        tasks = []
        for player_kwargs, trajectory_kwargs in zip(get_players_kwargs,
                                                    gen_trajectory_kwargs):
            tasks += [{
                'get_players_callable': get_players_callable,
                'get_players_kwargs': player_kwargs,
                'generate_trajectory_kwargs': trajectory_kwargs,
                'queue': games_queue
            }]
        futures += [
            process_pool.submit(start_game_process, kwargs) for kwargs in tasks
        ]
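The while-loop above keeps roughly one game-generation job per CPU core in flight: finished futures are dropped, and new tasks are submitted until the pool is full again. A minimal sketch of that scheduling pattern using the standard concurrent.futures module is shown below (simplified: run_game and make_kwargs are placeholder callables, and the project's own ProcessPoolExecutor wrapper, throttling and queue handling are omitted).

import multiprocessing
import time
from concurrent.futures import ProcessPoolExecutor

def keep_pool_busy(run_game, make_kwargs):
    """ Keeps roughly one job per CPU core running at all times (sketch). """
    nb_cores = multiprocessing.cpu_count()
    futures = []
    with ProcessPoolExecutor(max_workers=nb_cores) as pool:
        while True:
            futures = [fut for fut in futures if not fut.done()]         # Drop finished jobs
            for _ in range(max(0, nb_cores - len(futures))):             # Refill the pool
                futures.append(pool.submit(run_game, make_kwargs()))
            time.sleep(1.)                                               # Avoid busy-waiting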