Example 1
    def run(self):
        import tensorflow as tf
        self.env = rlcard3.make('blackjack')
        self.sess = tf.Session()
        agent = DQNAgent(self.sess,
                         scope='sub-dqn' + str(self.index),
                         action_num=self.env.action_num,
                         replay_memory_init_size=memory_init_size,
                         state_shape=self.env.state_shape,
                         mlp_layers=[10, 10])
        self.env.set_agents([agent])
        self.sess.run(tf.global_variables_initializer())

        # Collect samples for norm_step steps to normalize the state inputs
        for _ in range(norm_step):
            trajectories, _ = self.env.run()
            for ts in trajectories[0]:
                agent.feed(ts)

        # Receive instructions to run games and generate trajectories
        while True:
            instruction = self.input_queue.get()
            if instruction is not None:
                tasks, train_flag, variables, total_t = instruction

                # For evaluation
                if not train_flag:
                    agent.total_t = total_t
                    global_vars = [tf.convert_to_tensor(var) for var in variables]
                    agent.copy_params_op(global_vars)
                    for _ in range(tasks):
                        _, payoffs = self.env.run(is_training=train_flag)
                        self.output_queue.put(payoffs)

                # For training
                else:
                    for _ in range(tasks):
                        trajectories, _ = self.env.run(is_training=train_flag)
                        self.output_queue.put(trajectories)
                self.input_queue.task_done()
            else:
                self.input_queue.task_done()
                break
        self.sess.close()
        return
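For context, here is a minimal sketch, not taken from the source, of how a coordinator might drive this worker through its queues; `workers`, `tasks_per_worker`, `global_variables`, and `total_t` are assumed names, and `None` is the shutdown signal the loop above checks for.

# Hypothetical coordinator side of the queue protocol used by the worker above.
# Each instruction is the (tasks, train_flag, variables, total_t) tuple the worker unpacks.
for worker in workers:
    worker.input_queue.put((tasks_per_worker, True, global_variables, total_t))

# Collect one result per task from every worker
trajectories = []
for worker in workers:
    for _ in range(tasks_per_worker):
        trajectories.append(worker.output_queue.get())

# Shut the workers down; None makes each worker's loop break
for worker in workers:
    worker.input_queue.put(None)
    worker.input_queue.join()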
Example 2
    def test_train(self):

        memory_init_size = 100
        step_num = 1500

        sess = tf.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)
        agent = DQNAgent(sess=sess,
                         scope='dqn',
                         replay_memory_size=500,
                         replay_memory_init_size=memory_init_size,
                         update_target_estimator_every=100,
                         state_shape=[2],
                         mlp_layers=[10, 10])
        sess.run(tf.global_variables_initializer())

        predicted_action, _ = agent.eval_step({
            'obs':
            np.random.random_sample((2, )),
            'legal_actions': [0, 1]
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        for _ in range(step_num):
            ts = [{
                'obs': np.random.random_sample((2, )),
                'legal_actions': [0, 1]
            },
                  np.random.randint(2), 0, {
                      'obs': np.random.random_sample((2, )),
                      'legal_actions': [0, 1]
                  }, True]
            agent.feed(ts)

        predicted_action = agent.step({
            'obs': np.random.random_sample((2, )),
            'legal_actions': [0, 1]
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        sess.close()
        tf.reset_default_graph()
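For reference, the five-element list fed to `agent.feed` above packs a transition as (state, action, reward, next_state, done); below is a labelled sketch with illustrative values only, assuming the same `agent` and `numpy` import as the test.

# One transition in the format DQNAgent.feed consumes (illustrative values only)
state      = {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]}
action     = 1      # id of the action that was taken
reward     = 0      # immediate reward received
next_state = {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]}
done       = True   # whether the episode terminated
agent.feed([state, action, reward, next_state, done])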
Example 3
    ## Log Game info
    logger.log('\n########## Game information ##########')
    logger.log('\nNumPlayers: {}, NumCards: {}, Episodes: {}'.format(
        env.game.num_players, env.game.num_cards, episode_num))

    # logger.log(f'\nTrain Agents:{get_agent_str(env_agent_list)}')
    # logger.log(f'\nEval Agents:{get_agent_str(eval_agent_list)}')
    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)
        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0],
                                   episode=episode)

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN RA')

    # Save model
    save_dir = 'models/mocsar_dqn_ra'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
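The snippet stops before the checkpoint is actually written; as a rough sketch of how it would typically continue (assuming the surrounding script still holds its `sess` TensorFlow session), the model can be saved with a standard `tf.train.Saver`.

    # Sketch only: persist the trained graph into save_dir (assumes `sess` is in scope)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))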
Example 4
class MocsarPretrainddDqnAgent(Agent):
    """ Mocsar Rule agent version 1, take the minimal action
    """
    name: str  # Name of the agent
    id: str  # ID of the Agent
    agent: DQNAgent  # the pre-trained agent

    def __init__(self):
        self.name = 'PreDQNAgent'
        self.id = "d"
        # Set up the DQN agent and load the pre-trained model
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        self.use_raw = False
        # Config
        conf = Config('environ.properties')
        # Set the number of steps for collecting normalization statistics
        # and the initial memory size
        memory_init_size = conf.get_int('memory_init_size')
        norm_step = conf.get_int('norm_step')
        env = rlcard3.make('mocsar_dqn')
        with self.graph.as_default():
            self.agent = DQNAgent(self.sess,
                                  scope='dqn',
                                  action_num=env.action_num,
                                  state_shape=env.state_shape,
                                  replay_memory_size=20000,
                                  replay_memory_init_size=memory_init_size,
                                  norm_step=norm_step,
                                  mlp_layers=[512, 512])
            self.normalize(env, 1000)
            self.sess.run(tf.compat.v1.global_variables_initializer())
        check_point_path = os.path.join(ROOT_PATH, 'mocsar_dqn')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver(tf.model_variables())
                saver.restore(self.sess,
                              tf.train.latest_checkpoint(check_point_path))

    def __str__(self):
        return f"Agent:{self.name}"

    def step(self, state: Dict) -> str:
        """ Predict the action given raw state. A naive rule.
        Choose the minimal action.

        Args:
            state (dict): Raw state from the game

        Returns:
            action (str): Predicted action
        """
        is_extract = state['is_extract']
        action_ids = get_action_ids(legal_actions=state['legal_actions'],
                                    is_extracted=is_extract)
        if len(action_ids) == 1:
            # Nothing to choose from: only one legal action
            return action_to_ret(action_ids[0], is_extract)

        if not is_extract:
            obs = encode_to_obs(state=state)

            extracted_state = {
                'obs':
                obs,
                'legal_actions': [
                    string_to_action(action)
                    for action in state['legal_actions']
                ],
                'is_extract':
                True  # State is extracted
            }
        else:
            extracted_state = state
        action = self.agent.step(state=extracted_state)
        return action_to_ret(action=action, is_extracted=is_extract)

    def eval_step(self, state: Dict):
        """ Step for evaluation. The same to step
                """
        return self.step(state), []

    def normalize(self, e, num):
        """ Feed random data to normalizer

        Args:
            e (Env): An Env instance

            num (int): The number of timesteps to run for collecting normalization statistics

        """
        print('**********Normalize begin**************')
        begin_step = e.timestep
        e.set_agents([RandomAgent() for _ in range(e.player_num)])
        while e.timestep - begin_step < num:
            trajectories, _ = e.run(is_training=False)

            for tra in trajectories:
                for ts in tra:
                    self.agent.feed(ts)
        print('**********Normalize end**************')
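As a brief usage sketch, assumed rather than taken from the source: since the class exposes the same step/eval_step interface as the other agents, it can be registered with an environment directly; the environment id and the opponent setup below are illustrative.

# Hypothetical usage: let the pre-trained agent play one game against random opponents
env = rlcard3.make('mocsar_dqn')
agents = [MocsarPretrainddDqnAgent()] + [RandomAgent() for _ in range(env.player_num - 1)]
env.set_agents(agents)
trajectories, payoffs = env.run(is_training=False)
print('Payoffs:', payoffs)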
Example 5
class NFSPAgent(object):
    ''' NFSP Agent implementation in TensorFlow.
    '''
    def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        ''' Initialize the NFSP agent.

        Args:
            sess (tf.Session): Tensorflow session object.
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances the RL/average policy.
            batch_size (int): The batch size for training the average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): The learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of the inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of the inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating the target network for
              the inner DQN agent.
            q_discount_factor (float): The discount factor of the inner DQN agent.
            q_epsilon_start (float): The starting epsilon of the inner DQN agent.
            q_epsilon_end (float): The end epsilon of the inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of the inner DQN agent.
            q_batch_size (int): The batch size of the inner DQN agent.
            q_train_every (int): Train the inner DQN agent every X steps.
            q_mlp_layers (list): The layer sizes of the inner DQN agent.
            evaluate_with (string): The value can be 'best_response' or 'average_policy'.
        '''
        self.use_raw = False
        self._sess = sess
        self._scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        with tf.variable_scope(scope):
            # Inner RL agent
            self._rl_agent = DQNAgent(
                sess, scope + '_dqn', q_replay_memory_size,
                q_replay_memory_init_size, q_update_target_estimator_every,
                q_discount_factor, q_epsilon_start, q_epsilon_end,
                q_epsilon_decay_steps, q_batch_size, action_num, state_shape,
                q_train_every, q_mlp_layers, rl_learning_rate)

            with tf.variable_scope('sl'):
                # Build supervised model
                self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' build the model for supervised learning
        '''
        # Placeholders.
        input_shape = [None]
        input_shape.extend(self._state_shape)
        self._info_state_ph = tf.placeholder(shape=input_shape,
                                             dtype=tf.float32)

        self._X = tf.contrib.layers.flatten(self._info_state_ph)

        # Boolean placeholder indicating whether the network is in training mode
        self.is_train = tf.placeholder(tf.bool, name="is_train")

        # Batch Normalization
        self._X = tf.layers.batch_normalization(self._X, training=True)

        self._action_probs_ph = tf.placeholder(shape=[None, self._action_num],
                                               dtype=tf.float32)

        # Average policy network.
        fc = self._X
        for dim in self._layer_sizes:
            fc = tf.contrib.layers.fully_connected(fc,
                                                   dim,
                                                   activation_fn=tf.tanh)
        self._avg_policy = tf.contrib.layers.fully_connected(
            fc, self._action_num, activation_fn=None)
        self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

        # Loss
        self._loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=tf.stop_gradient(self._action_probs_ph),
                logits=self._avg_policy))

        optimizer = tf.train.AdamOptimizer(
            learning_rate=self._sl_learning_rate, name='nfsp_adam')

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       scope=tf.get_variable_scope().name)
        with tf.control_dependencies(update_ops):
            self._learn_step = optimizer.minimize(self._loss)

    def feed(self, ts):
        ''' Feed data to inner RL agent

        Args:
            ts (list): A list of 5 elements that represent the transition.
        '''
        self._rl_agent.feed(ts)
        self.total_t += 1
        if self.total_t > 0 and len(
                self._reservoir_buffer
        ) >= self._min_buffer_size_to_learn and self.total_t % self._train_every == 0:
            sl_loss = self.train_sl()
            print('\rINFO - Agent {}, step {}, sl-loss: {}'.format(
                self._scope, self.total_t, sl_loss),
                  end='')

    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action

    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''
        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)
        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, probs

    def sample_episode_policy(self):
        ''' Sample whether to use the average policy or the best response for this episode
        '''
        if np.random.rand() < self._anticipatory_param:
            self._mode = MODE.best_response
        else:
            self._mode = MODE.average_policy

    def _act(self, info_state):
        ''' Predict action probabilities given the observation

        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probabilities.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        action_probs = self._sess.run(self._avg_policy_probs,
                                      feed_dict={
                                          self._info_state_ph: info_state,
                                          self.is_train: False
                                      })[0]

        return action_probs

    def _add_transition(self, state, probs):
        ''' Adds the new transition to the reservoir buffer.

        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

        loss, _ = self._sess.run(
            [self._loss, self._learn_step],
            feed_dict={
                self._info_state_ph: info_states,
                self._action_probs_ph: action_probs,
                self.is_train: True,
            })

        return loss
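Finally, a rough end-to-end sketch, assumed rather than taken from the source, of how NFSPAgent is typically trained: one agent per seat, `sample_episode_policy` before each episode, and each agent fed its own seat's trajectories. The environment id, layer sizes, and `episode_num` below are placeholders.

# Hypothetical NFSP training loop (environment id and hyper-parameters are placeholders)
import tensorflow as tf
import rlcard3

episode_num = 10000
with tf.Session() as sess:
    env = rlcard3.make('mocsar_dqn')
    agents = [NFSPAgent(sess,
                        scope='nfsp_' + str(i),
                        action_num=env.action_num,
                        state_shape=env.state_shape,
                        hidden_layers_sizes=[512, 512],
                        q_mlp_layers=[512, 512])
              for i in range(env.player_num)]
    env.set_agents(agents)
    sess.run(tf.global_variables_initializer())

    for episode in range(episode_num):
        # Re-sample best-response vs. average policy for every agent each episode
        for agent in agents:
            agent.sample_episode_policy()
        trajectories, _ = env.run(is_training=True)
        # Each agent learns from the trajectories of its own seat
        for i, agent in enumerate(agents):
            for ts in trajectories[i]:
                agent.feed(ts)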