def test_init(self):
    sess = tf.InteractiveSession()
    tf.Variable(0, name='global_step', trainable=False)

    agent = DQNAgent(sess=sess,
                     scope='dqn',
                     replay_memory_size=0,
                     replay_memory_init_size=0,
                     update_target_estimator_every=0,
                     discount_factor=0,
                     epsilon_start=0,
                     epsilon_end=0,
                     epsilon_decay_steps=0,
                     batch_size=0,
                     action_num=2,
                     state_shape=[1],
                     mlp_layers=[10, 10])

    self.assertEqual(agent.replay_memory_init_size, 0)
    self.assertEqual(agent.update_target_estimator_every, 0)
    self.assertEqual(agent.discount_factor, 0)
    self.assertEqual(agent.epsilon_decay_steps, 0)
    self.assertEqual(agent.batch_size, 0)
    self.assertEqual(agent.action_num, 2)

    sess.close()
    tf.reset_default_graph()
def run(self):
    # import tensorflow as tf
    self.env = rlcard3.make('blackjack')
    self.sess = tf.Session()
    agent = DQNAgent(self.sess,
                     scope='sub-dqn' + str(self.index),
                     action_num=self.env.action_num,
                     replay_memory_init_size=memory_init_size,
                     state_shape=self.env.state_shape,
                     mlp_layers=[10, 10])
    self.env.set_agents([agent])
    self.sess.run(tf.global_variables_initializer())

    # Normalize: feed a few random trajectories before training
    for _ in range(norm_step):
        trajectories, _ = self.env.run()
        for ts in trajectories[0]:
            agent.feed(ts)

    # Receive instructions to run games and generate trajectories
    while True:
        instruction = self.input_queue.get()
        if instruction is not None:
            tasks, train_flag, variables, total_t = instruction

            # For evaluation: sync parameters from the main process, then report payoffs
            if not train_flag:
                agent.total_t = total_t
                global_vars = [tf.convert_to_tensor(var) for var in variables]
                agent.copy_params_op(global_vars)
                for _ in range(tasks):
                    _, payoffs = self.env.run(is_training=train_flag)
                    self.output_queue.put(payoffs)

            # For training: return raw trajectories to the main process
            else:
                for _ in range(tasks):
                    trajectories, _ = self.env.run(is_training=train_flag)
                    self.output_queue.put(trajectories)
            self.input_queue.task_done()
        else:
            self.input_queue.task_done()
            break

    self.sess.close()
    return
def test_train(self):
    memory_init_size = 100
    step_num = 1500

    sess = tf.InteractiveSession()
    tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess=sess,
                     scope='dqn',
                     replay_memory_size=500,
                     replay_memory_init_size=memory_init_size,
                     update_target_estimator_every=100,
                     state_shape=[2],
                     mlp_layers=[10, 10])
    sess.run(tf.global_variables_initializer())

    predicted_action, _ = agent.eval_step({'obs': np.random.random_sample((2,)),
                                           'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    for _ in range(step_num):
        # A transition is a 5-element list: [state, action, reward, next_state, done]
        ts = [{'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              np.random.randint(2),
              0,
              {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              True]
        agent.feed(ts)

    predicted_action = agent.step({'obs': np.random.random_sample((2,)),
                                   'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    sess.close()
    tf.reset_default_graph()
# Parameter variables
evaluate_num, evaluate_every, memory_init_size, train_every, episode_num = init_vars(conf=conf)

# The paths for saving the logs and learning curves
log_dir = './experiments/mocsar_dqn_ra_result/'

# Set a global seed
set_global_seed(0)

with tf.compat.v1.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])
    random_agent = RandomAgent(action_num=eval_env.action_num)
    sess.run(tf.compat.v1.global_variables_initializer())

    # Other agents
    env.model.create_agents({"mocsar_min": 4})
    env_agent_list = [env.model.rule_agents[i] for i in range(1, 4)]
    env_agent_list.insert(0, agent)
    env.set_agents(env_agent_list)

    # Evaluation agent
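    # A hedged sketch of how the evaluation seats could be wired and scored; it assumes
    # eval_env mirrors env, random_agent fills the other three seats, and the payoffs
    # returned by eval_env.run() are indexed by seat.
    eval_agent_list = [random_agent for _ in range(1, 4)]
    eval_agent_list.insert(0, agent)
    eval_env.set_agents(eval_agent_list)

    # Rough evaluation: average the DQN seat's payoff over evaluate_num games.
    rewards = []
    for _ in range(evaluate_num):
        _, payoffs = eval_env.run(is_training=False)
        rewards.append(payoffs[0])
    print('Average reward:', sum(rewards) / len(rewards))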
PROCESSES = [BlackjackProcess(index, INPUT_QUEUE, OUTPUT_QUEUE, np.random.randint(1000000))
             for index in range(PROCESS_NUM)]
for p in PROCESSES:
    p.start()

# Make environments
env = rlcard3.make('blackjack')
eval_env = rlcard3.make('blackjack')

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     state_shape=env.state_shape,
                     mlp_layers=[10, 10])
    env.set_agents([agent])
    eval_env.set_agents([agent])
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep',
                    ylabel='reward',
                    legend='DQN on Blackjack',
                    log_path=log_path,
                    csv_path=csv_path)

    for episode in range(episode_num // evaluate_every):

        # Generate data from the environment
        tasks = assign_task(evaluate_every, PROCESS_NUM)
        for task in tasks:
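            # A hedged sketch of how this dispatch loop might continue; it assumes
            # assign_task() splits evaluate_every games across the workers, each worker
            # follows the (tasks, train_flag, variables, total_t) protocol of run() above,
            # and pushes one trajectory list per finished game onto OUTPUT_QUEUE.
            INPUT_QUEUE.put((task, True, None, None))   # train_flag=True: workers return trajectories

        # Collect the generated trajectories and feed them to the learner.
        for _ in range(evaluate_every):
            trajectories = OUTPUT_QUEUE.get()
            for ts in trajectories[0]:
                agent.feed(ts)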
# The paths for saving the logs and learning curves
log_dir = './experiments/uno_single_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[128, 128])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    state = env.reset()

    for timestep in range(timesteps):
        action = agent.step(state)
        next_state, reward, done = env.step(action)
        ts = (state, action, reward, next_state, done)
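        # A hedged continuation sketch, assuming agent.feed() accepts the same
        # (state, action, reward, next_state, done) layout used in the other examples
        # and that env.reset() starts a new game once the current one ends.
        agent.feed(ts)
        state = next_state
        if done:
            state = env.reset()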
class MocsarPretrainddDqnAgent(Agent):
    """ Mocsar pre-trained DQN agent: wraps a DQNAgent restored from a saved checkpoint """
    name: str        # Name of the agent
    id: str          # ID of the agent
    agent: DQNAgent  # The pre-trained agent

    def __init__(self):
        self.name = 'PreDQNAgent'
        self.id = "d"
        # Set up the DQN agent and load the pre-trained model
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        self.use_raw = False
        # Config
        conf = Config('environ.properties')
        # Set the number of steps for collecting normalization statistics
        # and the initial memory size
        memory_init_size = conf.get_int('memory_init_size')
        norm_step = conf.get_int('norm_step')
        env = rlcard3.make('mocsar_dqn')
        with self.graph.as_default():
            self.agent = DQNAgent(self.sess,
                                  scope='dqn',
                                  action_num=env.action_num,
                                  state_shape=env.state_shape,
                                  replay_memory_size=20000,
                                  replay_memory_init_size=memory_init_size,
                                  norm_step=norm_step,
                                  mlp_layers=[512, 512])
            self.normalize(env, 1000)
            self.sess.run(tf.compat.v1.global_variables_initializer())

        check_point_path = os.path.join(ROOT_PATH, 'mocsar_dqn')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver(tf.model_variables())
                saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))

    def __str__(self):
        return f"Agent:{self.name}"

    def step(self, state: Dict) -> str:
        """ Predict the action given the raw state, using the pre-trained DQN.

        Args:
            state (dict): Raw state from the game

        Returns:
            action (str): Predicted action
        """
        is_extract = state['is_extract']
        action_ids = get_action_ids(legal_actions=state['legal_actions'], is_extracted=is_extract)
        if len(action_ids) == 1:
            # Nothing to choose from: only one legal action
            return action_to_ret(action_ids[0], is_extract)
        if not is_extract:
            obs = encode_to_obs(state=state)
            extracted_state = {
                'obs': obs,
                'legal_actions': [string_to_action(action) for action in state['legal_actions']],
                'is_extract': True  # State is extracted
            }
        else:
            extracted_state = state
        action = self.agent.step(state=extracted_state)
        return action_to_ret(action=action, is_extracted=is_extract)

    def eval_step(self, state: Dict):
        """ Step for evaluation. The same as step """
        return self.step(state), []

    def normalize(self, e, num):
        """ Feed random data to the normalizer

        Args:
            e (Env): An Env class
            num (int): The number of steps to be normalized
        """
        print('**********Normalize begin**************')
        begin_step = e.timestep
        e.set_agents([RandomAgent() for _ in range(e.player_num)])
        while e.timestep - begin_step < num:
            trajectories, _ = e.run(is_training=False)
            for tra in trajectories:
                for ts in tra:
                    self.agent.feed(ts)
        print('**********Normalize end**************')
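# A minimal usage sketch for the agent above; it assumes the checkpoint under
# ROOT_PATH/'mocsar_dqn' and 'environ.properties' are available, and that the same
# RandomAgent used in normalize() can fill the remaining seats.
eval_env = rlcard3.make('mocsar_dqn')
pre_trained = MocsarPretrainddDqnAgent()   # loads the pre-trained model in __init__
opponents = [RandomAgent() for _ in range(eval_env.player_num - 1)]
eval_env.set_agents([pre_trained] + opponents)

# One evaluation game; env.run() returns the per-seat payoffs.
_, payoffs = eval_env.run(is_training=False)
print('Payoffs:', payoffs)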
class NFSPAgent(object):
    ''' NFSP Agent implementation in TensorFlow.
    '''

    def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        ''' Initialize the NFSP agent.

        Args:
            sess (tf.Session): Tensorflow session object.
            scope (string): The name scope of NFSPAgent.
            action_num (int): The number of actions.
            state_shape (list): The shape of the state space.
            hidden_layers_sizes (list): The hidden layer sizes of the average policy network.
            reservoir_buffer_capacity (int): The size of the buffer for the average policy.
            anticipatory_param (float): The hyper-parameter that balances rl/average policy.
            batch_size (int): The batch size for training the average policy.
            train_every (int): Train the SL policy every X steps.
            rl_learning_rate (float): The learning rate of the RL agent.
            sl_learning_rate (float): The learning rate of the average policy.
            min_buffer_size_to_learn (int): The minimum buffer size to learn for the average policy.
            q_replay_memory_size (int): The memory size of the inner DQN agent.
            q_replay_memory_init_size (int): The initial memory size of the inner DQN agent.
            q_update_target_estimator_every (int): The frequency of updating the target network for the inner DQN agent.
            q_discount_factor (float): The discount factor of the inner DQN agent.
            q_epsilon_start (float): The starting epsilon of the inner DQN agent.
            q_epsilon_end (float): The end epsilon of the inner DQN agent.
            q_epsilon_decay_steps (int): The decay steps of the inner DQN agent.
            q_batch_size (int): The batch size of the inner DQN agent.
            q_train_every (int): Train the inner DQN agent every X steps.
            q_mlp_layers (list): The layer sizes of the inner DQN agent.
            evaluate_with (string): Either 'best_response' or 'average_policy'.
        '''
        self.use_raw = False
        self._sess = sess
        self._scope = scope
        self._action_num = action_num
        self._state_shape = state_shape
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._train_every = train_every
        self._sl_learning_rate = sl_learning_rate
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None
        self.evaluate_with = evaluate_with

        # Total timesteps
        self.total_t = 0

        # Step counter to keep track of learning.
        self._step_counter = 0

        with tf.variable_scope(scope):
            # Inner RL (DQN) agent
            self._rl_agent = DQNAgent(sess, scope + '_dqn', q_replay_memory_size,
                                      q_replay_memory_init_size, q_update_target_estimator_every,
                                      q_discount_factor, q_epsilon_start, q_epsilon_end,
                                      q_epsilon_decay_steps, q_batch_size, action_num,
                                      state_shape, q_train_every, q_mlp_layers,
                                      rl_learning_rate)

            with tf.variable_scope('sl'):
                # Build supervised model
                self._build_model()

        self.sample_episode_policy()

    def _build_model(self):
        ''' Build the model for supervised learning
        '''
        # Placeholders.
        input_shape = [None]
        input_shape.extend(self._state_shape)
        self._info_state_ph = tf.placeholder(shape=input_shape, dtype=tf.float32)
        self._X = tf.contrib.layers.flatten(self._info_state_ph)

        # Boolean to indicate whether it is training or not
        self.is_train = tf.placeholder(tf.bool, name="is_train")

        # Batch Normalization
        self._X = tf.layers.batch_normalization(self._X, training=True)

        self._action_probs_ph = tf.placeholder(shape=[None, self._action_num], dtype=tf.float32)

        # Average policy network.
        fc = self._X
        for dim in self._layer_sizes:
            fc = tf.contrib.layers.fully_connected(fc, dim, activation_fn=tf.tanh)
        self._avg_policy = tf.contrib.layers.fully_connected(fc, self._action_num, activation_fn=None)
        self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

        # Loss
        self._loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=tf.stop_gradient(self._action_probs_ph),
                logits=self._avg_policy))

        optimizer = tf.train.AdamOptimizer(learning_rate=self._sl_learning_rate, name='nfsp_adam')

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
        with tf.control_dependencies(update_ops):
            self._learn_step = optimizer.minimize(self._loss)

    def feed(self, ts):
        ''' Feed data to the inner RL agent

        Args:
            ts (list): A list of 5 elements that represents the transition.
        '''
        self._rl_agent.feed(ts)
        self.total_t += 1
        if self.total_t > 0 and len(self._reservoir_buffer) >= self._min_buffer_size_to_learn \
                and self.total_t % self._train_every == 0:
            sl_loss = self.train_sl()
            print('\rINFO - Agent {}, step {}, sl-loss: {}'.format(self._scope, self.total_t, sl_loss), end='')

    def step(self, state):
        ''' Returns the action to be taken.

        Args:
            state (dict): The current state

        Returns:
            action (int): An action id
        '''
        obs = state['obs']
        legal_actions = state['legal_actions']
        if self._mode == MODE.best_response:
            probs = self._rl_agent.predict(obs)
            one_hot = np.eye(len(probs))[np.argmax(probs)]
            self._add_transition(obs, one_hot)
        elif self._mode == MODE.average_policy:
            probs = self._act(obs)

        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)

        return action

    def eval_step(self, state):
        ''' Use the average policy for evaluation purposes

        Args:
            state (dict): The current state.

        Returns:
            action (int): An action id.
            probs (list): The list of action probabilities
        '''
        if self.evaluate_with == 'best_response':
            action, probs = self._rl_agent.eval_step(state)
        elif self.evaluate_with == 'average_policy':
            obs = state['obs']
            legal_actions = state['legal_actions']
            probs = self._act(obs)
            probs = remove_illegal(probs, legal_actions)
            action = np.random.choice(len(probs), p=probs)
        else:
            raise ValueError("'evaluate_with' should be either 'average_policy' or 'best_response'.")
        return action, probs

    def sample_episode_policy(self):
        ''' Sample average/best_response policy
        '''
        if np.random.rand() < self._anticipatory_param:
            self._mode = MODE.best_response
        else:
            self._mode = MODE.average_policy

    def _act(self, info_state):
        ''' Predict action probabilities given the observation and legal actions

        Args:
            info_state (numpy.array): An observation.

        Returns:
            action_probs (numpy.array): The predicted action probabilities.
        '''
        info_state = np.expand_dims(info_state, axis=0)
        action_probs = self._sess.run(
            self._avg_policy_probs,
            feed_dict={self._info_state_ph: info_state, self.is_train: False})[0]
        return action_probs

    def _add_transition(self, state, probs):
        ''' Adds the new transition to the reservoir buffer.

        Transitions are in the form (state, probs).

        Args:
            state (numpy.array): The state.
            probs (numpy.array): The probabilities of each action.
        '''
        transition = Transition(info_state=state, action_probs=probs)
        self._reservoir_buffer.add(transition)

    def train_sl(self):
        ''' Compute the loss on sampled transitions and perform an avg-network update.

        If there are not enough elements in the buffer, no loss is computed and
        `None` is returned instead.

        Returns:
            loss (float): The average loss obtained on this batch of transitions or `None`.
        '''
        if (len(self._reservoir_buffer) < self._batch_size or
                len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
            return None

        transitions = self._reservoir_buffer.sample(self._batch_size)
        info_states = [t.info_state for t in transitions]
        action_probs = [t.action_probs for t in transitions]

        loss, _ = self._sess.run(
            [self._loss, self._learn_step],
            feed_dict={
                self._info_state_ph: info_states,
                self._action_probs_ph: action_probs,
                self.is_train: True,
            })

        return loss
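# A minimal end-to-end sketch for the NFSPAgent above; it assumes an rlcard3-style
# single-player environment (as in the blackjack examples), and the hyper-parameters
# and episode count shown here are illustrative only.
env = rlcard3.make('blackjack')
sess = tf.Session()
nfsp_agent = NFSPAgent(sess,
                       scope='nfsp',
                       action_num=env.action_num,
                       state_shape=env.state_shape,
                       hidden_layers_sizes=[64, 64],
                       q_mlp_layers=[64, 64])
env.set_agents([nfsp_agent])
sess.run(tf.global_variables_initializer())

for episode in range(100):
    # Re-draw best_response vs. average_policy mode for this episode.
    nfsp_agent.sample_episode_policy()
    trajectories, _ = env.run(is_training=True)
    for ts in trajectories[0]:
        # Forwards the transition to the inner DQN; the SL network trains
        # once the reservoir buffer has reached min_buffer_size_to_learn.
        nfsp_agent.feed(ts)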