Code example #1
A unit test for the TensorFlow DQNAgent: it builds a small agent, feeds it random transitions, and checks that the predicted action index stays within the legal actions.
# Assumed imports and TestCase wrapper so the snippet is self-contained;
# the DQNAgent import path is the one used by the TensorFlow-based rlcard releases.
import unittest

import numpy as np
import tensorflow as tf

from rlcard.agents.dqn_agent import DQNAgent


class TestDQNAgent(unittest.TestCase):

    def test_train(self):

        memory_init_size = 100
        step_num = 1500

        sess = tf.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)
        agent = DQNAgent(sess=sess,
                         scope='dqn',
                         replay_memory_size=500,
                         replay_memory_init_size=memory_init_size,
                         update_target_estimator_every=100,
                         state_shape=[2],
                         mlp_layers=[10, 10])
        sess.run(tf.global_variables_initializer())

        predicted_action, _ = agent.eval_step({
            'obs': np.random.random_sample((2, )),
            'legal_actions': [0, 1]
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        for _ in range(step_num):
            # A transition is (state, action, reward, next_state, done)
            state = {'obs': np.random.random_sample((2, )), 'legal_actions': [0, 1]}
            next_state = {'obs': np.random.random_sample((2, )), 'legal_actions': [0, 1]}
            ts = [state, np.random.randint(2), 0, next_state, True]
            agent.feed(ts)

        predicted_action = agent.step({
            'obs': np.random.random_sample((2, )),
            'legal_actions': [0, 1]
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        sess.close()
        tf.reset_default_graph()
Code example #2
A training loop for a single-agent rlcard environment: (state, action, reward, next_state, done) transitions are fed to the agent, which is periodically evaluated on a separate environment.
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[128, 128])
    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    state = env.reset()

    for timestep in range(timesteps):
        action = agent.step(state)
        next_state, reward, done = env.step(action)
        ts = (state, action, reward, next_state, done)
        agent.feed(ts)
        # Advance to the next state; start a fresh game when this one is over
        state = env.reset() if done else next_state

        if timestep % evaluate_every == 0:
            # Evaluate on the separate evaluation environment so the
            # training episode is not disturbed
            rewards = []
            eval_state = eval_env.reset()
            for _ in range(evaluate_num):
                action, _ = agent.eval_step(eval_state)
                eval_state, reward, done = eval_env.step(action)
                if done:
                    rewards.append(reward)
                    eval_state = eval_env.reset()
            logger.log_performance(env.timestep, np.mean(rewards))

    # Close files in the logger
    logger.close_files()
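The snippet above starts in the middle of the DQNAgent constructor call and omits the surrounding setup. A minimal sketch of what that setup might look like, assuming a single-agent environment such as blackjack and a TensorFlow 1.x rlcard release; the hyperparameter values, the log_dir path, and the single_agent_mode config key are illustrative assumptions, not taken from the original:

import numpy as np
import tensorflow as tf

import rlcard
from rlcard.agents.dqn_agent import DQNAgent
from rlcard.utils.logger import Logger

# Illustrative settings (assumed, not from the original snippet)
memory_init_size = 100
train_every = 1
timesteps = 100000
evaluate_every = 1000
evaluate_num = 100
log_dir = './experiments/blackjack_dqn_result/'

# One environment for training and a separate one for evaluation
env = rlcard.make('blackjack', config={'single_agent_mode': True})
eval_env = rlcard.make('blackjack', config={'single_agent_mode': True})

with tf.Session() as sess:
    # Global step variable used by the DQN estimators
    tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[128, 128])
    # ... continue with the initialization, logger, and training loop shown above ...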
Code example #3
An agent wrapper for the Mocsar game (rlcard3 fork) that loads a pre-trained DQN model from a checkpoint and uses it to choose actions from raw game states.
class MocsarPretrainddDqnAgent(Agent):
    """ Mocsar Rule agent version 1, take the minimal action
    """
    name: str  # Name of the agent
    id: str  # ID of the Agent
    agent: DQNAgent  # the pre-trained agent

    def __init__(self):
        self.name = 'PreDQNAgent'
        self.id = "d"
        # Set up the DQN agent and load the pre-trained model
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        self.use_raw = False
        # Config
        conf = Config('environ.properties')
        # Set the number of steps for collecting normalization statistics
        # and the initial memory size
        memory_init_size = conf.get_int('memory_init_size')
        norm_step = conf.get_int('norm_step')
        env = rlcard3.make('mocsar_dqn')
        with self.graph.as_default():
            self.agent = DQNAgent(self.sess,
                                  scope='dqn',
                                  action_num=env.action_num,
                                  state_shape=env.state_shape,
                                  replay_memory_size=20000,
                                  replay_memory_init_size=memory_init_size,
                                  norm_step=norm_step,
                                  mlp_layers=[512, 512])
            self.normalize(env, 1000)
            self.sess.run(tf.compat.v1.global_variables_initializer())
        check_point_path = os.path.join(ROOT_PATH, 'mocsar_dqn')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver(tf.model_variables())
                saver.restore(self.sess,
                              tf.train.latest_checkpoint(check_point_path))

    def __str__(self):
        return f"Agent:{self.name}"

    def step(self, state: Dict) -> str:
        """ Predict the action given raw state. A naive rule.
        Choose the minimal action.

        Args:
            state (dict): Raw state from the game

        Returns:
            action (str): Predicted action
        """
        is_extract = state['is_extract']
        action_ids = get_action_ids(legal_actions=state['legal_actions'],
                                    is_extracted=is_extract)
        if len(action_ids) == 1:
            # Only one legal action, so there is nothing to choose from
            return action_to_ret(action_ids[0], is_extract)

        if not is_extract:
            obs = encode_to_obs(state=state)

            extracted_state = {
                'obs': obs,
                'legal_actions': [
                    string_to_action(action)
                    for action in state['legal_actions']
                ],
                'is_extract': True  # State is extracted
            }
        else:
            extracted_state = state
        action = self.agent.step(state=extracted_state)
        return action_to_ret(action=action, is_extracted=is_extract)

    def eval_step(self, state: Dict):
        """ Step for evaluation. The same to step
                """
        return self.step(state), []

    def normalize(self, e, num):
        """ Feed random data to normalizer

        Args:
            e (Env): An Env instance

            num (int): The number of steps to be normalized

        """
        print('**********Normalize begin**************')
        begin_step = e.timestep
        e.set_agents([RandomAgent() for _ in range(e.player_num)])
        while e.timestep - begin_step < num:
            trajectories, _ = e.run(is_training=False)

            for tra in trajectories:
                for ts in tra:
                    self.agent.feed(ts)
        print('**********Normalize end**************')
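The constructor above restores weights with tf.train.latest_checkpoint from ROOT_PATH/mocsar_dqn, so a checkpoint must have been written there after training. A minimal sketch of that saving step, assuming the same TF1-style graph and session; the function name and the 'model' checkpoint prefix are illustrative, not part of the original code:

import os
import tensorflow as tf

def save_pretrained_dqn(sess, check_point_path):
    """Write a TF1 checkpoint that tf.train.latest_checkpoint() can locate later."""
    os.makedirs(check_point_path, exist_ok=True)
    # Save the same variable collection that the loader restores
    saver = tf.train.Saver(tf.model_variables())
    # 'model' is just a checkpoint prefix; saver.save also writes the
    # 'checkpoint' index file that latest_checkpoint() reads
    return saver.save(sess, os.path.join(check_point_path, 'model'))

Called with the training session and os.path.join(ROOT_PATH, 'mocsar_dqn'), this produces the files the constructor above expects to restore.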