Example #1
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001  # TODO: move these to configs
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 1e-6
        self.gamma = 0.99
        self.tau = 0.001
        self.buffer_size = 1000000
        self.batch_size = 128
        self.theta = 0.15
        self.ou = 0
        self.sigma = 0.3

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(self.low_action_bound_list)  # TODO: take from self.env.action_space or pass in explicitly
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Creating ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        # dQ/da (fed in via actor_critic_grad at train time) is back-propagated
        # through the actor; the minus sign makes Adam's descent step an ascent on Q.
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)

        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Creating CRITIC model
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        (self.critic_state_input, self.critic_action_input,
         self.critic_model) = critic_.create_critic_model()
        _, _, self.target_critic_model = critic_.create_critic_model()

        self.critic_grads = tf.gradients(self.critic_model.output,
                                         self.critic_action_input)

        self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
        self.noise.reset()

        self.sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated
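
The snippet above instantiates an OrnsteinUhlenbeckProcess for exploration noise but does not show its definition. Below is a minimal sketch of a compatible class, assuming the theta = 0.15 / sigma = 0.3 defaults set in the constructor; only the `size` constructor argument and `reset()` are visible in the snippet, so the `mu`/`dt` parameters and the `sample()` method name are assumptions.

import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated noise: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, I)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.3, dt=1.0):
        self.size = size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # Restart the process at its long-run mean (e.g. at episode start).
        self.x = np.full(self.size, self.mu, dtype=np.float64)

    def sample(self):
        # One Euler-Maruyama step; successive samples are correlated,
        # which gives smoother exploration than i.i.d. Gaussian noise.
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x = self.x + dx
        return self.x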
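
For context, here is how the tensors wired up above are typically used in a DDPG actor update. This is a sketch of a method on the same agent class, not part of the original example; the `_train_actor` name and the `predict` call are assumptions, while `critic_grads`, `optimize`, and the placeholders come from the constructor above.

    def _train_actor(self, states):
        # 1) Actions the current policy would take in these states.
        predicted_actions = self.actor_model.predict(states)
        # 2) dQ/da from the critic, evaluated at those actions.
        grads = self.sess.run(self.critic_grads,
                              feed_dict={
                                  self.critic_state_input: states,
                                  self.critic_action_input: predicted_actions
                              })[0]
        # 3) Push dQ/da through the actor; the constructor negated this
        #    gradient, so Adam's descent step performs gradient ascent on Q.
        self.sess.run(self.optimize,
                      feed_dict={
                          self.actor_state_input: states,
                          self.actor_critic_grad: grads
                      })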
Example #2
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001
        self.exploration_noise = 0.1
        self.gamma = 0.90
        self.tau = 0.01
        self.buffer_size = 10000
        self.batch_size = 128
        self.policy_noise = 0.1
        self.noise_clip = 0.05
        self.exploration_episodes = 10
        # self.policy_freq = 2

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(self.low_action_bound_list)  # TODO: take from self.env.action_space or pass in explicitly
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Creating ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)

        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Creating FIRST CRITIC model, this is the one we train/optimize against
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        (self.critic_state_input, self.critic_action_input,
         self.critic_model) = critic_.create_critic_model()
        # loss='' would fail at compile time; the critic regresses Q-value
        # targets, so mean squared error is the standard choice.
        self.critic_model.compile(optimizer=Adam(lr=critic_.learning_rate),
                                  loss='mse')

        _, _, self.target_critic_model = critic_.create_critic_model()
        self.target_critic_model.compile(
            optimizer=Adam(lr=critic_.learning_rate), loss='mse')

        # dQ/da taken from the first critic head (the one trained against,
        # per the comment above).
        self.critic_grads = tf.gradients(self.critic_model.output[0],
                                         self.critic_action_input)

        self.sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated
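
Example #2 defines TD3-style hyperparameters (`policy_noise`, `noise_clip`, `tau`) without showing where they are used. Below is a minimal sketch of the two updates they usually drive, target policy smoothing and the Polyak (soft) target update; the helper names are hypothetical and the methods are assumed to live on the same agent class.

import numpy as np

    def _smoothed_target_actions(self, next_states):
        # TD3 target policy smoothing: add clipped Gaussian noise to the
        # target actor's actions before the target critic evaluates them,
        # then clip back into the valid action range.
        actions = self.target_actor_model.predict(next_states)
        noise = np.clip(
            np.random.normal(0.0, self.policy_noise, size=actions.shape),
            -self.noise_clip, self.noise_clip)
        return np.clip(actions + noise,
                       self.low_action_bound_list, self.high_action_bound_list)

    def _soft_update(self, model, target_model):
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        weights = model.get_weights()
        target_weights = target_model.get_weights()
        target_model.set_weights([
            self.tau * w + (1.0 - self.tau) * tw
            for w, tw in zip(weights, target_weights)
        ])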