Example #1
    def test_train(self):

        norm_step = 1100
        memory_init_size = 100
        step_num = 1500

        sess = tf.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)
        agent = DQNAgent(sess=sess,
                         scope='dqn',
                         replay_memory_size=500,
                         replay_memory_init_size=memory_init_size,
                         update_target_estimator_every=100,
                         norm_step=norm_step,
                         state_shape=[2],
                         mlp_layers=[10,10])
        sess.run(tf.global_variables_initializer())

        predicted_action = agent.eval_step({'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        for step in range(step_num):
            ts = [{'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]}, np.random.randint(2), 0, {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]}, True]
            agent.feed(ts)
            if step > norm_step + memory_init_size:
                agent.train()

        predicted_action = agent.step({'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        sess.close()
        tf.reset_default_graph()
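
The loader snippets further down (Examples #2, #3, #5 and #7) restore checkpoints written with tf.train.Saver; the counterpart on the training side is a plain saver.save call. A minimal sketch, assuming a live TF1 session sess whose graph already holds the trained agent, and a purely hypothetical output directory:

import os
import tensorflow as tf

save_dir = './experiments/blackjack_dqn_result/models'  # hypothetical path
os.makedirs(save_dir, exist_ok=True)
saver = tf.train.Saver()  # captures the agent's variables in the default graph
saver.save(sess, os.path.join(save_dir, 'model.ckpt'))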
Example #2
    def __init__(self, graph, sess):
        """ Load pretrained model
        """
        super().__init__()
        self.graph = graph
        self.sess = sess

        env = rlcard.make('tarot')
        with self.graph.as_default():
            self.dqn_agent = DQNAgent(
                self.sess,
                scope='dqn',
                action_num=78,  # env.action_num,
                replay_memory_size=20000,
                replay_memory_init_size=memory_init_size,
                norm_step=norm_step,
                state_shape=env.state_shape,
                mlp_layers=[512, 1024, 512])
            normalize(env, self.dqn_agent, 1000)
            self.sess.run(tf.compat.v1.global_variables_initializer())

        check_point_path = os.path.join(ROOT_PATH, 'tarot_v180007')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.compat.v1.train.Saver(
                    tf.compat.v1.model_variables())
                saver.restore(self.sess,
                              tf.train.latest_checkpoint(check_point_path))
Example #3
    def __init__(self):
        ''' Load pretrained model
        '''
        import tensorflow as tf

        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        env = YanivEnv({})
        with self.graph.as_default():
            self.dqn_agents = []
            agent = DQNAgent(self.sess,
                             scope='dqn',
                             action_num=env.action_num,
                             replay_memory_init_size=1000,
                             train_every=1,
                             state_shape=env.state_shape,
                             mlp_layers=[512, 512]
                             )
            self.dqn_agents.append(agent)

        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess, tf.train.latest_checkpoint('models/yaniv_dqn'))
Example #4
    def test_init(self):

        sess = tf.compat.v1.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)

        agent = DQNAgent(sess=sess,
                         scope='dqn',
                         replay_memory_size=0,
                         replay_memory_init_size=0,
                         update_target_estimator_every=0,
                         discount_factor=0,
                         epsilon_start=0,
                         epsilon_end=0,
                         epsilon_decay_steps=0,
                         batch_size=0,
                         action_num=2,
                         state_shape=[1],
                         mlp_layers=[10, 10])

        self.assertEqual(agent.replay_memory_init_size, 0)
        self.assertEqual(agent.update_target_estimator_every, 0)
        self.assertEqual(agent.discount_factor, 0)
        self.assertEqual(agent.epsilon_decay_steps, 0)
        self.assertEqual(agent.batch_size, 0)
        self.assertEqual(agent.action_num, 2)

        sess.close()
        tf.compat.v1.reset_default_graph()
Example #5
def load_dqn_leduc_agent(model_path):
    # Set a global seed
    set_global_seed(0)

    # Load pretrained model
    # tf.reset_default_graph()
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with graph.as_default():
        nfsp_agents = []
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 128])

    # We have a pretrained model here. Change the path for your model.
    # check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
    check_point_path = model_path

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
    return agent
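
Once restored, the agent is typically dropped into an evaluation environment. An illustrative sketch, assuming the Leduc Hold'em env and RandomAgent used elsewhere in these examples, that the module-level env, memory_init_size and train_every the function relies on are defined, and a hypothetical checkpoint directory:

eval_env = rlcard.make('leduc-holdem')
dqn_agent = load_dqn_leduc_agent('models/leduc_holdem_dqn')  # hypothetical path
eval_env.set_agents([dqn_agent, RandomAgent(action_num=eval_env.action_num)])

num_games = 1000
total_payoff = 0.0
for _ in range(num_games):
    _, payoffs = eval_env.run(is_training=False)
    total_payoff += payoffs[0]
print('Average payoff of the restored DQN agent:', total_payoff / num_games)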
Example #6
    def run(self):
        import tensorflow as tf
        self.env = rlcard.make('leduc-holdem')
        self.sess = tf.Session()
        agent = DQNAgent(self.sess,
                         scope='sub-dqn' + str(self.index),
                         action_num=self.env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=self.env.state_shape,
                         mlp_layers=[128, 128])
        random_agent = RandomAgent(action_num=self.env.action_num)
        self.env.set_agents([agent, random_agent])
        self.sess.run(tf.global_variables_initializer())

        # Receive instruction to run game and generate trajectories
        while True:
            instruction = self.input_queue.get()
            if instruction is not None:
                tasks, train_flag, variables, total_t = instruction

                # For evaluation
                if not train_flag:
                    agent.total_t = total_t
                    global_vars = [
                        tf.convert_to_tensor(var) for var in variables
                    ]
                    agent.copy_params_op(global_vars)
                    for _ in range(tasks):
                        _, payoffs = self.env.run(is_training=train_flag)
                        self.output_queue.put(payoffs)

                # For training
                else:
                    for _ in range(tasks):
                        trajectories, _ = self.env.run(is_training=train_flag)
                        self.output_queue.put(
                            (trajectories, self.env.timestep))
                self.input_queue.task_done()
            else:
                self.input_queue.task_done()
                break
        self.sess.close()
        return
Example #7
    def __init__(self):
        env = rlcard.make('uno')
        self.sess1 = tf.compat.v1.Session()
        global_step = tf.Variable(0, name='global_step', trainable=False)
        self.agent = DQNAgent(self.sess1,
                              scope='dqn',
                              action_num=env.action_num,
                              replay_memory_init_size=memory_init_size,
                              norm_step=norm_step,
                              state_shape=env.state_shape,
                              mlp_layers=[100, 100])
        self.sess1.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess1,
                           './experiments/uno_dqn_result/models/model1.ckpt')
Example #8
    def test_init(self):

        agent = DQNAgent(replay_memory_size=0,
                         replay_memory_init_size=0,
                         update_target_estimator_every=0,
                         discount_factor=0,
                         epsilon_start=0,
                         epsilon_end=0,
                         epsilon_decay_steps=0,
                         batch_size=0,
                         num_actions=2,
                         state_shape=[1],
                         mlp_layers=[10,10],
                         device=torch.device('cpu'))

        self.assertEqual(agent.replay_memory_init_size, 0)
        self.assertEqual(agent.update_target_estimator_every, 0)
        self.assertEqual(agent.discount_factor, 0)
        self.assertEqual(agent.epsilon_decay_steps, 0)
        self.assertEqual(agent.batch_size, 0)
        self.assertEqual(agent.num_actions, 2)
Example #9
    def test_train(self):

        memory_init_size = 100
        num_steps = 500

        agent = DQNAgent(replay_memory_size=200,
                         replay_memory_init_size=memory_init_size,
                         update_target_estimator_every=100,
                         state_shape=[2],
                         mlp_layers=[10,10],
                         device=torch.device('cpu'))

        predicted_action, _ = agent.eval_step({'obs': np.random.random_sample((2,)), 'legal_actions': {0: None, 1: None}, 'raw_legal_actions': ['call', 'raise']})
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        for _ in range(num_steps):
            ts = [{'obs': np.random.random_sample((2,)), 'legal_actions': {0: None, 1: None}}, np.random.randint(2), 0, {'obs': np.random.random_sample((2,)), 'legal_actions': {0: None, 1: None}, 'raw_legal_actions': ['call', 'raise']}, True]
            agent.feed(ts)

        predicted_action = agent.step({'obs': np.random.random_sample((2,)), 'legal_actions': {0: None, 1: None}})
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)
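
In a real training script the random transitions above come from an environment instead. A minimal sketch, assuming the rlcard Blackjack env (whose state shape is also [2]) and the reorganize helper from rlcard.utils:

import rlcard
from rlcard.utils import reorganize

env = rlcard.make('blackjack')
env.set_agents([agent])  # the DQNAgent constructed above

for _ in range(100):
    # Collect one episode and its final payoffs
    trajectories, payoffs = env.run(is_training=True)
    # Convert them into (state, action, reward, next_state, done) transitions
    trajectories = reorganize(trajectories, payoffs)
    for ts in trajectories[0]:
        agent.feed(ts)  # buffer filling and training happen inside feed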
Example #10
    def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        ''' Initialize the NFSP agent.

        Args:
            sess (tf.Session): Tensorflow session object.
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch_size for training average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): the learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating target network for
              inner DQN agent.
            q_discount_factor (float): The discount factor of inner DQN agent.
            q_epsilon_start (float): The starting epsilon of inner DQN agent.
            q_epsilon_end (float): the end epsilon of inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of inner DQN agent.
            q_batch_size (int): The batch size of inner DQN agent.
            q_train_every (int): Train the model every X steps.
            q_mlp_layers (list): The layer sizes of inner DQN agent.
            evaluate_with (string): The value can be 'best_response' or 'average_policy'
        '''
        self.use_raw = False
        self._sess = sess
        self._scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        self.d = {
            0: 'A',
            1: '2',
            2: '3',
            3: '4',
            4: '5',
            5: '6',
            6: '7',
            7: '8',
            8: '9',
            9: 'T',
            10: 'J',
            11: 'Q',
            12: 'K'
        }
        self.s = {0: 's', 1: 'h', 2: 'd', 3: 'c'}
        self.c2n = {
            '2': 2,
            '3': 3,
            '4': 4,
            '5': 5,
            '6': 6,
            '7': 7,
            '8': 8,
            '9': 9,
            'T': 10,
            'J': 11,
            'Q': 12,
            'K': 13,
            'A': 14
        }
        self.late_range = Range(
            '22+, A2s+, K2s+, Q2s+, J2s+, J8, T9, 98, 87, 76s, 65s, 54s, 98s+, K9+, Q8+, J7+, T6s+, A9+'
        )

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        with tf.variable_scope(scope):
            # Inner RL agent
            self._rl_agent = DQNAgent(
                sess, scope + '_dqn', q_replay_memory_size,
                q_replay_memory_init_size, q_update_target_estimator_every,
                q_discount_factor, q_epsilon_start, q_epsilon_end,
                q_epsilon_decay_steps, q_batch_size, action_num, state_shape,
                q_train_every, q_mlp_layers, rl_learning_rate)

            with tf.variable_scope('sl'):
                # Build supervised model
                self._build_model()

        self.sample_episode_policy()
Example #11
class NFSPAgent(object):
    ''' NFSP Agent implementation in TensorFlow.
    '''
    def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        ''' Initialize the NFSP agent.

        Args:
            sess (tf.Session): Tensorflow session object.
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch_size for training average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): the learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating target network for
              inner DQN agent.
            q_discount_factor (float): The discount factor of inner DQN agent.
            q_epsilon_start (float): The starting epsilon of inner DQN agent.
            q_epsilon_end (float): the end epsilon of inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of inner DQN agent.
            q_batch_size (int): The batch size of inner DQN agent.
            q_train_every (int): Train the model every X steps.
            q_mlp_layers (list): The layer sizes of inner DQN agent.
            evaluate_with (string): The value can be 'best_response' or 'average_policy'
        '''
        self.use_raw = False
        self._sess = sess
        self._scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        self.d = {
            0: 'A',
            1: '2',
            2: '3',
            3: '4',
            4: '5',
            5: '6',
            6: '7',
            7: '8',
            8: '9',
            9: 'T',
            10: 'J',
            11: 'Q',
            12: 'K'
        }
        self.s = {0: 's', 1: 'h', 2: 'd', 3: 'c'}
        self.c2n = {
            '2': 2,
            '3': 3,
            '4': 4,
            '5': 5,
            '6': 6,
            '7': 7,
            '8': 8,
            '9': 9,
            'T': 10,
            'J': 11,
            'Q': 12,
            'K': 13,
            'A': 14
        }
        self.late_range = Range(
            '22+, A2s+, K2s+, Q2s+, J2s+, J8, T9, 98, 87, 76s, 65s, 54s, 98s+, K9+, Q8+, J7+, T6s+, A9+'
        )

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        with tf.variable_scope(scope):
            # Inner RL agent
            self._rl_agent = DQNAgent(
                sess, scope + '_dqn', q_replay_memory_size,
                q_replay_memory_init_size, q_update_target_estimator_every,
                q_discount_factor, q_epsilon_start, q_epsilon_end,
                q_epsilon_decay_steps, q_batch_size, action_num, state_shape,
                q_train_every, q_mlp_layers, rl_learning_rate)

            with tf.variable_scope('sl'):
                # Build supervised model
                self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' build the model for supervised learning
        '''
        # Placeholders.
        input_shape = [None]
        input_shape.extend(self._state_shape)
        self._info_state_ph = tf.placeholder(shape=input_shape,
                                             dtype=tf.float32)

        self._X = tf.contrib.layers.flatten(self._info_state_ph)

        # Boolean to indicate whether is training or not
        self.is_train = tf.placeholder(tf.bool, name="is_train")

        # Batch Normalization
        self._X = tf.layers.batch_normalization(self._X, training=True)

        self._action_probs_ph = tf.placeholder(shape=[None, self._action_num],
                                               dtype=tf.float32)

        # Average policy network.
        fc = self._X
        for dim in self._layer_sizes:
            fc = tf.contrib.layers.fully_connected(fc,
                                                   dim,
                                                   activation_fn=tf.tanh)
        self._avg_policy = tf.contrib.layers.fully_connected(
            fc, self._action_num, activation_fn=None)
        self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

        # Loss
        self._loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=tf.stop_gradient(self._action_probs_ph),
                logits=self._avg_policy))

        optimizer = tf.train.AdamOptimizer(
            learning_rate=self._sl_learning_rate, name='nfsp_adam')

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       scope=tf.get_variable_scope().name)
        with tf.control_dependencies(update_ops):
            self._learn_step = optimizer.minimize(self._loss)

    def feed(self, ts):
        ''' Feed data to inner RL agent

        Args:
            ts (list): A list of 5 elements that represent the transition.
        '''
        self._rl_agent.feed(ts)
        self.total_t += 1
        if self.total_t > 0 and len(
                self._reservoir_buffer
        ) >= self._min_buffer_size_to_learn and self.total_t % self._train_every == 0:
            sl_loss = self.train_sl()
            print('\rINFO - Agent {}, step {}, sl-loss: {}'.format(
                self._scope, self.total_t, sl_loss),
                  end='')

    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''

        self.sample_episode_policy()

        cards = ''
        pos = 0

        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        tab = []
        handcards = cards

        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        # print(tab)

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])
        # print(hand, tab, 250 - min(state['obs'][-2:]), 250 - max(state['obs'][-2:]), abs(state['obs'][-2] - state['obs'][-1]), state['obs'][-2] + state['obs'][-1], min(state['obs'][-2:]), max(state['obs'][-2:]))
        # mcst.PokerState()

        obs = state['obs']
        legal_actions = state['legal_actions']

        par = mcst.MCTS(1)
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            m = par.UCT(rootstate=stt,
                        itermax=50000,
                        processes=16,
                        verbose=False)
            m = m[0]
            probs[m] += 1
            probs = remove_illegal(probs, legal_actions)
            probs /= sum(probs)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        # print(m, action)
        return action

    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''

        cards = ''
        pos = 0
        for i in state['obs']:
            if (i == 1 and pos < 52):
                cards = cards + self.d[pos % 13] + '' + self.s[pos // 13]
            pos += 1
        # if(len(cards) == 4 and not Combo(cards) in self.late_range.combos):
        #     return 0, 1

        tab = []
        handcards = cards
        legal_actions = state['legal_actions']

        for i in state['public_cards']:
            tab.append((self.c2n[i[1]], i[0].lower()))
        hand = []
        for i in range(0, len(handcards), 2):
            hand.append((self.c2n[handcards[i]], handcards[i + 1]))

        hand = [x for x in hand if x not in tab]
        stt = mcst.PokerState(hand, tab, state['cur'], state['opp'],
                              abs(state['obs'][-2] - state['obs'][-1]),
                              state['obs'][-2] + state['obs'][-1],
                              state['obs'][52], state['obs'][53])
        par = mcst.MCTS(1)
        # print(state)
        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)
            m = par.UCT(rootstate=stt,
                        itermax=100000,
                        processes=32,
                        verbose=False)
            print(m, probs)
            m = m[0]
            probs[m] += 1
            # if probs[1] == probs[3] and probs[3] == probs[4] and probs[4] == probs[5]:
            #     probs[2] /= 25
            #     probs[m] += 2

            # elif not m == 5:
            #     probs[m] += 2
            # else:
            #     probs[4] += 3

            # if(len(tab) == 0):
            #     probs[5] = 0
            # else:
            #     probs[5] /= 4

            probs = remove_illegal(probs, legal_actions)
            probs /= sum(probs)

        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            probs = self._act(obs)

        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
        if (action == 0 and 1 in legal_actions):
            action = 1


#        print(action, probs)
        return action, probs

    def sample_episode_policy(self):
        ''' Sample average/best_response policy
        '''
        if np.random.rand() < self._anticipatory_param:
            self._mode = MODE.best_response
        else:
            self._mode = MODE.average_policy

    def _act(self, info_state):
        ''' Predict action probability given the observation and legal actions

        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probability.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        action_probs = self._sess.run(self._avg_policy_probs,
                                      feed_dict={
                                          self._info_state_ph: info_state,
                                          self.is_train: False
                                      })[0]

        return action_probs

    def _add_transition(self, state, probs):
        ''' Adds the new transition to the reservoir buffer.

        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

        loss, _ = self._sess.run(
            [self._loss, self._learn_step],
            feed_dict={
                self._info_state_ph: info_states,
                self._action_probs_ph: action_probs,
                self.is_train: True,
            })

        return loss
Example #12
memory_init_size = 1000
norm_step = 100

# The paths for saving the logs and learning curves
root_path = './experiments/uno_dqnvsrandom_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
# set_global_seed(0)
if True:
    sess1 = tf.compat.v1.Session()

    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess1,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=env.state_shape,
                     mlp_layers=[100, 100])

    sess1.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess1, root_path + 'models/model1.ckpt')
    saver.save(sess1, './experiments/uno_dqn_result/models/modelvrandom1.ckpt')
    env.set_agents([agent])
    ishuman = [-1, 0]
    env.gamewithai(ishuman)
Example #13
# The paths for saving the logs and learning curves
root_path = './experiments/leduc_holdem_single_agent_dqn_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_size=int(1e5),
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=env.state_shape,
                     mlp_layers=[128, 128])

    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep',
                    ylabel='reward',
                    legend='DQN on Leduc Holdem',
                    log_path=log_path,
                    csv_path=csv_path)

    state = env.reset()
Example #14
    def __init__(self,
                 num_actions=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=20000,
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=100,
                 q_replay_memory_size=20000,
                 q_replay_memory_init_size=100,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=32,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy',
                 device=None):
        ''' Initialize the NFSP agent.

        Args:
            num_actions (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch_size for training average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): the learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating target network for
              inner DQN agent.
            q_discount_factor (float): The discount factor of inner DQN agent.
            q_epsilon_start (float): The starting epsilon of inner DQN agent.
            q_epsilon_end (float): the end epsilon of inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of inner DQN agent.
            q_batch_size (int): The batch size of inner DQN agent.
            q_train_every (int): Train the model every X steps.
            q_mlp_layers (list): The layer sizes of inner DQN agent.
            device (torch.device): The device to run on, cpu or gpu
        '''
        self.use_raw = False
        self._num_actions = num_actions
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes + [num_actions]
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        if device is None:
            self.device = torch.device(
                'cuda:0' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        # Build the action-value network
        self._rl_agent = DQNAgent(q_replay_memory_size, q_replay_memory_init_size, \
            q_update_target_estimator_every, q_discount_factor, q_epsilon_start, q_epsilon_end, \
            q_epsilon_decay_steps, q_batch_size, num_actions, state_shape, q_train_every, q_mlp_layers, \
            rl_learning_rate, device)

        # Build the average policy supervised model
        self._build_model()

        self.sample_episode_policy()
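
The constructor above only wires things up; driving the agent is left to an outer loop that samples a policy per episode, lets the environment call step/eval_step, and feeds the transitions back. An illustrative sketch (env, nfsp_agent, other_agents and num_episodes are placeholders, and reorganize is the same rlcard.utils assumption as before):

env.set_agents([nfsp_agent] + other_agents)

for _ in range(num_episodes):
    nfsp_agent.sample_episode_policy()  # pick best_response vs average_policy
    trajectories, payoffs = env.run(is_training=True)
    trajectories = reorganize(trajectories, payoffs)
    for ts in trajectories[0]:
        nfsp_agent.feed(ts)  # updates both the inner DQN and the SL network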
Example #15
# The paths for saving the logs and learning curves
log_dir = './experiments/blackjack_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[10,10])
    env.set_agents([agent])
    eval_env.set_agents([agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Initialize a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
Example #16
root_path = './experiments/uno_dqn_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=env.state_shape,
                     mlp_layers=[10, 400])
    env.set_agents([agent, RandomAgent(action_num=env.action_num)])
    eval_env.set_agents([agent, RandomAgent(action_num=env.action_num)])

    sess.run(tf.global_variables_initializer())

    # Count the number of steps
    step_counter = 0

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep',
                    ylabel='reward',
                    legend='DQN on UNO',
Example #17
# The paths for saving the logs and learning curves
log_dir = './experiments/uno_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_size=20000,
                     replay_memory_init_size=memory_init_size,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent, random_agent, random_agent])
    eval_env.set_agents([agent, random_agent, random_agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
Example #18
root_path = './experiments/badugi_hand_dqn_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'
checkpoint_path = root_path + 'ckpt/'

# Set a global seed
# set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_size=int(1e5),
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])

    random_agent = RandomAgent(action_num=eval_env.action_num)

    sess.run(tf.global_variables_initializer())

    env.set_agents([agent, random_agent, random_agent, random_agent, random_agent])
    eval_env.set_agents([agent, random_agent, random_agent, random_agent, random_agent])

    # Count the number of steps
    step_counter = 0

    # Init a Logger to plot the learning curve
Example #19
class NFSPAgent(object):
    ''' NFSP Agent implementation in TensorFlow.
    '''
    def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.5,
                 batch_size=256,
                 rl_learning_rate=0.0001,
                 sl_learning_rate=0.00001,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=1,
                 q_epsilon_end=0.1,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_norm_step=1000,
                 q_mlp_layers=None):
        ''' Initialize the NFSP agent.

        Args:
            sess (tf.Session): Tensorflow session object.
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch_size for training average policy.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): the learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating target network for
              inner DQN agent.
            q_discount_factor (float): The discount factor of inner DQN agent.
            q_epsilon_start (float): The starting epsilon of inner DQN agent.
            q_epsilon_end (float): the end epsilon of inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of inner DQN agent.
            q_batch_size (int): The batch size of inner DQN agent.
            q_norm_step (int): The normalization steps of inner DQN agent.
            q_mlp_layers (list): The layer sizes of inner DQN agent.

        '''
        self._sess = sess
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes + [action_num]
        self._batch_size = batch_size
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None

        # Step counter to keep track of learning.
        self._step_counter = 0

        with tf.variable_scope(scope):
            # Inner RL agent
            self._rl_agent = DQNAgent(
                sess, 'dqn', q_replay_memory_size, q_replay_memory_init_size,
                q_update_target_estimator_every, q_discount_factor,
                q_epsilon_start, q_epsilon_end, q_epsilon_decay_steps,
                q_batch_size, action_num, state_shape, q_norm_step,
                q_mlp_layers, rl_learning_rate)

            # Build supervised model
            self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' build the model for supervised learning
        '''
        # Placeholders.
        input_shape = [None]
        input_shape.extend(self._state_shape)
        self._info_state_ph = tf.placeholder(shape=input_shape,
                                             dtype=tf.float32)

        self._X = tf.contrib.layers.flatten(self._info_state_ph)

        self._action_probs_ph = tf.placeholder(shape=[None, self._action_num],
                                               dtype=tf.float32)

        # Average policy network.
        self._avg_network = snt.nets.MLP(output_sizes=self._layer_sizes)
        self._avg_policy = self._avg_network(self._X)
        self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

        # Loss
        self._loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=tf.stop_gradient(self._action_probs_ph),
                logits=self._avg_policy))

        optimizer = tf.train.AdamOptimizer(
            learning_rate=self._sl_learning_rate, name='nfsp_adam')

        self._learn_step = optimizer.minimize(self._loss)

    def feed(self, ts):
        ''' Feed data to inner RL agent

        Args:
            ts (list): A list of 5 elements that represent the transition.
        '''
        self._rl_agent.feed(ts)

    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            self._add_transition(obs, probs)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action

    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
        '''
        action = self._rl_agent.eval_step(state)

        return action

    def sample_episode_policy(self):
        ''' Sample average/best_response policy
        '''
        if np.random.rand() < self._anticipatory_param:
            self._mode = MODE.best_response
        else:
            self._mode = MODE.average_policy

    def _act(self, info_state):
        ''' Predict action probability given the observation and legal actions

        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probability.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        action_probs = self._sess.run(
            self._avg_policy_probs,
            feed_dict={self._info_state_ph: info_state})[0]

        return action_probs

    def _add_transition(self, state, probs):
        ''' Adds the new transition to the reservoir buffer.

        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        #print(len(self._reservoir_buffer))
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_rl(self):
        ''' Update the inner RL agent
        '''
        return self._rl_agent.train()

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

        loss, _ = self._sess.run(
            [self._loss, self._learn_step],
            feed_dict={
                self._info_state_ph: info_states,
                self._action_probs_ph: action_probs,
            })

        return loss
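
Unlike the newer variants above, this older interface does not train inside feed; the outer script has to call train_rl and train_sl itself once enough transitions have been collected. A rough sketch that mirrors the step-counting pattern of Example #1 (nfsp_agent, env, episode_num, memory_init_size and norm_step are placeholders, and the threshold is illustrative):

step_counter = 0
for _ in range(episode_num):
    nfsp_agent.sample_episode_policy()
    trajectories, _ = env.run(is_training=True)
    for ts in trajectories[0]:
        nfsp_agent.feed(ts)
        step_counter += 1
        if step_counter > memory_init_size + norm_step:
            nfsp_agent.train_rl()  # update the inner DQN
            nfsp_agent.train_sl()  # update the average-policy network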
Example #20
model_path = 'rlcard/models/pretrained/self_played_{}/tarot_v{}/model'.format(
    str(record_number), str(record_number * 10000 + self_play))

# Set a global seed
set_global_seed(0)

random_agent = RandomAgent(action_num=eval_env.action_num)

with tf.compat.v1.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(
        sess,
        scope='dqn',
        action_num=78,  # env.action_num,
        replay_memory_size=20000,
        replay_memory_init_size=memory_init_size,
        norm_step=norm_step,
        state_shape=env.state_shape,
        mlp_layers=[512, 1024, 512])

    opponent_agent = agent

    sess.run(tf.compat.v1.global_variables_initializer())

    saver = tf.compat.v1.train.Saver()

    env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
    eval_env.set_agents([agent] + [random_agent] * (env.player_num - 1))

    # Count the number of steps
Example #21
evaluate_num = 1000
episode_num = 1000000

# Set the number of steps for collecting normalization statistics
# and initial memory size
memory_init_size = 100
norm_step = 100

# Set a global seed
set_global_seed(1)

with tf.Session() as sess:
    # Set agents
    agent = DQNAgent(sess,
                       action_num=env.action_num,
                       replay_memory_init_size=memory_init_size,
                       norm_step=norm_step,
                       state_shape=[2],
                       mlp_layers=[10,10])
    env.set_agents([agent])

    # Count the number of steps
    step_counter = 0

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='episode', ylabel='reward', legend='DQN on Blackjack', log_path='./experiments/blackjack_dqn_result/log.txt', csv_path='./experiments/blackjack_dqn_result/performance.csv')

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)
Example #22
    def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        ''' Initialize the NFSP agent.

        Args:
            sess (tf.Session): Tensorflow session object.
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch_size for training average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): the learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating target network for
              inner DQN agent.
            q_discount_factor (float): The discount factor of inner DQN agent.
            q_epsilon_start (float): The starting epsilon of inner DQN agent.
            q_epsilon_end (float): the end epsilon of inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of inner DQN agent.
            q_batch_size (int): The batch size of inner DQN agent.
            q_train_every (int): Train the model every X steps.
            q_mlp_layers (list): The layer sizes of inner DQN agent.
            evaluate_with (string): The value can be 'best_response' or 'average_policy'
        '''
        self.use_raw = False
        self._sess = sess
        self._scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        with tf.compat.v1.variable_scope(scope):
            # Inner RL agent
            self._rl_agent = DQNAgent(
                sess, scope + '_dqn', q_replay_memory_size,
                q_replay_memory_init_size, q_update_target_estimator_every,
                q_discount_factor, q_epsilon_start, q_epsilon_end,
                q_epsilon_decay_steps, q_batch_size, action_num, state_shape,
                q_train_every, q_mlp_layers, rl_learning_rate)

            with tf.compat.v1.variable_scope('sl'):
                # Build supervised model
                self._build_model()

        self.sample_episode_policy()
Example #23
class NFSPAgent(object):
    ''' An approximate clone of rlcard.agents.nfsp_agent that uses
    pytorch instead of tensorflow.  Note that this implementation
    differs from Heinrich and Silver (2016) in that the supervised
    training minimizes cross-entropy with respect to the stored
    action probabilities rather than the realized actions.
    '''
    def __init__(self,
                 num_actions=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=20000,
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=100,
                 q_replay_memory_size=20000,
                 q_replay_memory_init_size=100,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=32,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy',
                 device=None):
        ''' Initialize the NFSP agent.

        Args:
            num_actions (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch_size for training average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): the learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating target network for
              inner DQN agent.
            q_discount_factor (float): The discount factor of inner DQN agent.
            q_epsilon_start (float): The starting epsilon of inner DQN agent.
            q_epsilon_end (float): the end epsilon of inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of inner DQN agent.
            q_batch_size (int): The batch size of inner DQN agent.
            q_train_step (int): Train the model every X steps.
            q_mlp_layers (list): The layer sizes of inner DQN agent.
            device (torch.device): Whether to use the cpu or gpu
        '''
        self.use_raw = False
        self._num_actions = num_actions
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes + [num_actions]
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        if device is None:
            self.device = torch.device(
                'cuda:0' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        # Build the action-value network
        self._rl_agent = DQNAgent(q_replay_memory_size, q_replay_memory_init_size,
                                  q_update_target_estimator_every, q_discount_factor,
                                  q_epsilon_start, q_epsilon_end, q_epsilon_decay_steps,
                                  q_batch_size, num_actions, state_shape, q_train_every,
                                  q_mlp_layers, rl_learning_rate, device)

        # Build the average policy supervised model
        self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' Build the average policy network
        '''

        # configure the average policy network
        policy_network = AveragePolicyNetwork(self._num_actions,
                                              self._state_shape,
                                              self._layer_sizes)
        policy_network = policy_network.to(self.device)
        self.policy_network = policy_network
        self.policy_network.eval()

        # xavier init
        for p in self.policy_network.parameters():
            if len(p.data.shape) > 1:
                nn.init.xavier_uniform_(p.data)

        # configure optimizer
        self.policy_network_optimizer = torch.optim.Adam(
            self.policy_network.parameters(), lr=self._sl_learning_rate)

    def feed(self, ts):
        ''' Feed data to inner RL agent

        Args:
            ts (list): A list of 5 elements that represent the transition.
        '''
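        # The transition is (state, action, reward, next_state, done); it is passed
        # straight through to the inner DQN agent, while the SL policy trains from
        # the reservoir buffer filled in step().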
        self._rl_agent.feed(ts)
        self.total_t += 1
        if (self.total_t > 0
                and len(self._reservoir_buffer) >= self._min_buffer_size_to_learn
                and self.total_t % self._train_every == 0):
            sl_loss = self.train_sl()
            print('\rINFO - Step {}, sl-loss: {}'.format(
                self.total_t, sl_loss),
                  end='')

    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = list(state['legal_actions'].keys())
        if self._mode == 'best_response':
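            # Best-response mode: act with the inner DQN and store the chosen action
            # as a one-hot target in the reservoir buffer for the average-policy (SL) network.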
            action = self._rl_agent.step(state)
            one_hot = np.zeros(self._num_actions)
            one_hot[action] = 1
            self._add_transition(obs, one_hot)

        elif self._mode == 'average_policy':
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)

        return action

    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            info (dict): A dictionary containing information
        '''
        if self.evaluate_with == 'best_response':
            action, info = self._rl_agent.eval_step(state)
        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = list(state['legal_actions'].keys())
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
            info = {}
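            # Report, for each raw legal action, the probability the average policy
            # assigns to its corresponding action id.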
            info['probs'] = {
                state['raw_legal_actions'][i]:
                float(probs[list(state['legal_actions'].keys())[i]])
                for i in range(len(state['legal_actions']))
            }
        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, info

    def sample_episode_policy(self):
        ''' Sample average/best_response policy
        '''
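        # With probability `anticipatory_param` the agent plays its best-response
        # (inner DQN) policy for this episode; otherwise it plays the average policy.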
        if np.random.rand() < self._anticipatory_param:
            self._mode = 'best_response'
        else:
            self._mode = 'average_policy'

    def _act(self, info_state):
        ''' Predict action probabilities given the observation and legal actions.
            Not connected to the computation graph.
        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probabilities.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        info_state = torch.from_numpy(info_state).float().to(self.device)

        with torch.no_grad():
            log_action_probs = self.policy_network(info_state).cpu().numpy()

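        # The average-policy network outputs log-probabilities, so exponentiate
        # and drop the batch dimension to recover a probability vector.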
        action_probs = np.exp(log_action_probs)[0]

        return action_probs

    def _add_transition(self, state, probs):
        ''' Adds the new transition to the reservoir buffer.

        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

        self.policy_network_optimizer.zero_grad()
        self.policy_network.train()

        # (batch, state_size)
        info_states = torch.from_numpy(np.array(info_states)).float().to(
            self.device)

        # (batch, num_actions)
        eval_action_probs = torch.from_numpy(
            np.array(action_probs)).float().to(self.device)

        # (batch, num_actions)
        log_forecast_action_probs = self.policy_network(info_states)

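        # Cross-entropy between the stored action probabilities and the network's
        # log-probabilities, averaged over the batch (see the class docstring:
        # the targets are stored probabilities, not realized actions).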
        ce_loss = -(eval_action_probs *
                    log_forecast_action_probs).sum(dim=-1).mean()
        ce_loss.backward()

        self.policy_network_optimizer.step()
        ce_loss = ce_loss.item()
        self.policy_network.eval()

        return ce_loss

    def set_device(self, device):
        self.device = device
        self._rl_agent.set_device(device)
Exemple #24
0
# The paths for saving the logs and learning curves
log_dir = './experiments/uno_single_dqn_result/'

# Assumed setup (not shown in the original snippet): the single-agent UNO
# environment and the hyper-parameters used below. `rlcard` is assumed to be
# imported, as in the other examples.
env = rlcard.make('uno', config={'single_agent_mode': True})
memory_init_size = 1000
train_every = 1
timesteps = 100000

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[128, 128])
    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    state = env.reset()

    for timestep in range(timesteps):
        action = agent.step(state)
        next_state, reward, done = env.step(action)
        ts = (state, action, reward, next_state, done)
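        # Plausible continuation (the original snippet is cut off here):
        # feed the transition to the agent, then advance or reset the episode.
        agent.feed(ts)
        state = env.reset() if done else next_state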
Exemple #25
0
class NFSPAgent(object):
    ''' NFSP Agent implementation in TensorFlow.
    '''
    def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        ''' Initialize the NFSP agent.

        Args:
            sess (tf.Session): Tensorflow session object.
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layers sizes for the layers of
              the average policy.
            reservoir_buffer_capacity (int): The size of the buffer for average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch_size for training average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): The learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for average policy.
            q_replay_memory_size (int): The memory size of inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating target network for
              inner DQN agent.
            q_discount_factor (float): The discount factor of inner DQN agent.
            q_epsilon_start (float): The starting epsilon of inner DQN agent.
            q_epsilon_end (float): The end epsilon of inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of inner DQN agent.
            q_batch_size (int): The batch size of inner DQN agent.
            q_train_every (int): Train the inner DQN agent every X steps.
            q_mlp_layers (list): The layer sizes of inner DQN agent.
            evaluate_with (string): The value can be 'best_response' or 'average_policy'
        '''
        self.use_raw = False
        self._sess = sess
        self._scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        with tf.compat.v1.variable_scope(scope):
            # Inner RL agent
            self._rl_agent = DQNAgent(
                sess, scope + '_dqn', q_replay_memory_size,
                q_replay_memory_init_size, q_update_target_estimator_every,
                q_discount_factor, q_epsilon_start, q_epsilon_end,
                q_epsilon_decay_steps, q_batch_size, action_num, state_shape,
                q_train_every, q_mlp_layers, rl_learning_rate)

            with tf.compat.v1.variable_scope('sl'):
                # Build supervised model
                self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' build the model for supervised learning
        '''
        # Placeholders.
        input_shape = [None]
        input_shape.extend(self._state_shape)
        self._info_state_ph = tf.compat.v1.placeholder(shape=input_shape,
                                                       dtype=tf.float32)

        # Flatten the observation placeholder
        self._X = tf.keras.layers.Flatten()(self._info_state_ph)

        # Boolean to indicate whether we are training or not
        self.is_train = tf.compat.v1.placeholder(tf.bool, name="is_train")

        # Batch normalization, controlled by the `is_train` placeholder fed at run time
        self._X = tf.compat.v1.layers.batch_normalization(self._X,
                                                          training=self.is_train)

        self._action_probs_ph = tf.compat.v1.placeholder(
            shape=[None, self._action_num], dtype=tf.float32)

        # Average policy network: tanh hidden layers followed by a linear output layer
        fc = self._X
        for dim in self._layer_sizes:
            fc = tf.keras.layers.Dense(dim, activation=tf.tanh)(fc)
        self._avg_policy = tf.keras.layers.Dense(self._action_num,
                                                 activation=None)(fc)
        self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

        # Loss
        self._loss = tf.reduce_mean(
            input_tensor=tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.stop_gradient(self._action_probs_ph),
                logits=self._avg_policy))

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=self._sl_learning_rate, name='nfsp_adam')

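        # Make the batch-norm moving-average updates run together with each SL gradient step.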
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS,
            scope=tf.compat.v1.get_variable_scope().name)
        with tf.control_dependencies(update_ops):
            self._learn_step = optimizer.minimize(self._loss)

    def feed(self, ts):
        ''' Feed data to inner RL agent

        Args:
            ts (list): A list of 5 elements that represent the transition.
        '''
        self._rl_agent.feed(ts)
        self.total_t += 1
        if (self.total_t > 0
                and len(self._reservoir_buffer) >= self._min_buffer_size_to_learn
                and self.total_t % self._train_every == 0):
            sl_loss = self.train_sl()
            print('\rINFO - Agent {}, step {}, sl-loss: {}'.format(
                self._scope, self.total_t, sl_loss),
                  end='')

    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
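            # Best-response mode: take the greedy action of the inner DQN and store it
            # as a one-hot target in the reservoir buffer.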
            probs = self._rl_agent.predict(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)

        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action

    def eval_step(self, state):
        ''' Use the average policy for evaluation purpose

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''
        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)
        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
        else:
            raise ValueError(
                "'evaluate_with' should be either 'average_policy' or 'best_response'."
            )
        return action, probs

    def sample_episode_policy(self):
        ''' Sample average/best_response policy
        '''
        if np.random.rand() < self._anticipatory_param:
            self._mode = MODE.best_response
        else:
            self._mode = MODE.average_policy

    def _act(self, info_state):
        ''' Predict action probabilities given the observation and legal actions.

        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probabilities.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        action_probs = self._sess.run(self._avg_policy_probs,
                                      feed_dict={
                                          self._info_state_ph: info_state,
                                          self.is_train: False
                                      })[0]

        return action_probs

    def _add_transition(self, state, probs):
        ''' Adds the new transition to the reservoir buffer.

        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

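        # Run one supervised update; `is_train` is fed as True so batch normalization
        # uses batch statistics and its moving averages are refreshed via UPDATE_OPS.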
        loss, _ = self._sess.run(
            [self._loss, self._learn_step],
            feed_dict={
                self._info_state_ph: info_states,
                self._action_probs_ph: action_probs,
                self.is_train: True,
            })

        return loss