def test_train(self):
    ''' Sanity-check DQN training: feed random transitions and verify
    that the predicted actions stay inside the legal action set. '''
    memory_init_size = 100
    step_num = 1500

    sess = tf.InteractiveSession()
    tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess=sess,
                     scope='dqn',
                     replay_memory_size=500,
                     replay_memory_init_size=memory_init_size,
                     update_target_estimator_every=100,
                     state_shape=[2],
                     mlp_layers=[10, 10])
    sess.run(tf.global_variables_initializer())

    # Evaluation step before training: the action must be legal (0 or 1)
    predicted_action, _ = agent.eval_step(
        {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    # Fill the replay memory with random transitions to trigger training
    for _ in range(step_num):
        ts = [{'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              np.random.randint(2),
              0,
              {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]},
              True]
        agent.feed(ts)

    # Training step: the (epsilon-greedy) action must still be legal
    predicted_action = agent.step(
        {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]})
    self.assertGreaterEqual(predicted_action, 0)
    self.assertLessEqual(predicted_action, 1)

    sess.close()
    tf.reset_default_graph()
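# Hedged helper sketch, not part of the original test: the five-element
# transition fed to agent.feed above unpacks as
# (state, action, reward, next_state, done). The helper name
# _random_transition is illustrative only.
def _random_transition():
    state = {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]}
    next_state = {'obs': np.random.random_sample((2,)), 'legal_actions': [0, 1]}
    return (state, np.random.randint(2), 0, next_state, True)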
agent = DQNAgent(sess,
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[128, 128])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

state = env.reset()
for timestep in range(timesteps):
    action = agent.step(state)
    next_state, reward, done = env.step(action)
    ts = (state, action, reward, next_state, done)
    agent.feed(ts)
    # Advance the environment; restart the episode when it ends
    state = env.reset() if done else next_state

    if timestep % evaluate_every == 0:
        # Run evaluation episodes in the separate evaluation environment
        rewards = []
        for _ in range(evaluate_num):
            eval_state = eval_env.reset()
            eval_done = False
            while not eval_done:
                action, _ = agent.eval_step(eval_state)
                eval_state, reward, eval_done = eval_env.step(action)
            rewards.append(reward)
        logger.log_performance(env.timestep, np.mean(rewards))

# Close files in the logger
logger.close_files()
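# Hedged sketch, not in the original script: persist the trained weights with
# a tf.train.Saver so a loader like the pre-trained agent below can restore
# them via tf.train.latest_checkpoint. The save path is illustrative only.
saver = tf.train.Saver()
saver.save(sess, 'models/mocsar_dqn/model')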
class MocsarPretrainddDqnAgent(Agent):
    """ Mocsar agent that wraps a pre-trained DQN model """
    name: str  # Name of the agent
    id: str  # ID of the agent
    agent: DQNAgent  # The pre-trained agent

    def __init__(self):
        self.name = 'PreDQNAgent'
        self.id = "d"
        # Set up the DQN agent and load the pre-trained model
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        self.use_raw = False
        # Config
        conf = Config('environ.properties')
        # Set the number of steps for collecting normalization statistics
        # and the initial memory size
        memory_init_size = conf.get_int('memory_init_size')
        norm_step = conf.get_int('norm_step')
        env = rlcard3.make('mocsar_dqn')
        with self.graph.as_default():
            self.agent = DQNAgent(self.sess,
                                  scope='dqn',
                                  action_num=env.action_num,
                                  state_shape=env.state_shape,
                                  replay_memory_size=20000,
                                  replay_memory_init_size=memory_init_size,
                                  norm_step=norm_step,
                                  mlp_layers=[512, 512])
            self.normalize(env, 1000)
            self.sess.run(tf.compat.v1.global_variables_initializer())

        check_point_path = os.path.join(ROOT_PATH, 'mocsar_dqn')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver(tf.model_variables())
                saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))

    def __str__(self):
        return f"Agent:{self.name}"

    def step(self, state: Dict) -> str:
        """ Predict the action given the raw state, using the pre-trained DQN.
        If only one action is legal, return it directly.

        Args:
            state (dict): Raw state from the game

        Returns:
            action (str): Predicted action
        """
        is_extract = state['is_extract']
        action_ids = get_action_ids(legal_actions=state['legal_actions'],
                                    is_extracted=is_extract)
        if len(action_ids) == 1:
            # If there is nothing to choose from, return the only action
            return action_to_ret(action_ids[0], is_extract)
        if not is_extract:
            obs = encode_to_obs(state=state)
            extracted_state = {
                'obs': obs,
                'legal_actions': [string_to_action(action) for action in state['legal_actions']],
                'is_extract': True  # State is extracted
            }
        else:
            extracted_state = state
        action = self.agent.step(state=extracted_state)
        return action_to_ret(action=action, is_extracted=is_extract)

    def eval_step(self, state: Dict):
        """ Step for evaluation. The same as step """
        return self.step(state), []

    def normalize(self, e, num):
        """ Feed random data to the normalizer

        Args:
            e (Env): An Env class
            num (int): The number of timesteps to collect for normalization
        """
        print('**********Normalize begin**************')
        begin_step = e.timestep
        e.set_agents([RandomAgent() for _ in range(e.player_num)])
        while e.timestep - begin_step < num:
            trajectories, _ = e.run(is_training=False)
            for tra in trajectories:
                for ts in tra:
                    self.agent.feed(ts)
        print('**********Normalize end**************')
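# Hedged usage sketch, not in the original module: wire the pre-trained agent
# into a Mocsar environment run. The opponent mix (random agents) is
# illustrative; env.set_agents and env.run(is_training=False) are used the
# same way in normalize() above.
agent = MocsarPretrainddDqnAgent()
env = rlcard3.make('mocsar_dqn')
env.set_agents([agent] + [RandomAgent() for _ in range(env.player_num - 1)])
trajectories, payoffs = env.run(is_training=False)
print(f"{agent} payoffs: {payoffs}")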