# Assumed module-level imports for this excerpt (TF1-era API):
#   import tensorflow as tf
# plus project-local ReplayBuffer, Actor, Critic, OrnsteinUhlenbeckProcess.
def __init__(self, env, sess, low_action_bound_list, high_action_bound_list):
    self.env = env
    self.sess = sess
    self.low_action_bound_list = low_action_bound_list  # depends on the env
    self.high_action_bound_list = high_action_bound_list
    self.action_range_bound = [
        hi - lo for hi, lo in zip(self.high_action_bound_list,
                                  self.low_action_bound_list)
    ]

    # Hyperparameters. TODO: move these to configs.
    self.learning_rate = 0.0001
    self.epsilon = 1.0
    self.epsilon_min = 0.1
    self.epsilon_decay = 1e-6
    self.gamma = 0.99    # discount factor
    self.tau = 0.001     # soft target-update rate
    self.buffer_size = 1000000
    self.batch_size = 128

    # Ornstein-Uhlenbeck exploration-noise parameters
    self.theta = 0.15
    self.ou = 0
    self.sigma = 0.3

    self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = len(self.low_action_bound_list)  # TODO: derive from self.env.action_space
    self.continuous_action_space = True

    # Initialize replay buffer
    self.replay_buffer = ReplayBuffer(self.buffer_size)

    # Creating ACTOR model
    actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
    self.actor_state_input, self.actor_model = actor_.create_actor_model()
    _, self.target_actor_model = actor_.create_actor_model()

    # dQ/da from the critic, fed in at train time
    self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_dim])
    actor_model_weights = self.actor_model.trainable_weights
    # Chain rule: dQ/dtheta = dQ/da * da/dtheta (negated so Adam ascends Q)
    self.actor_grads = tf.gradients(self.actor_model.output,
                                    actor_model_weights,
                                    -self.actor_critic_grad)
    grads = zip(self.actor_grads, actor_model_weights)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

    # Creating CRITIC model
    critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
    self.critic_state_input, self.critic_action_input, self.critic_model = \
        critic_.create_critic_model()
    _, _, self.target_critic_model = critic_.create_critic_model()
    self.critic_grads = tf.gradients(self.critic_model.output,
                                     self.critic_action_input)

    # Exploration noise process
    self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
    self.noise.reset()

    # tf.initialize_all_variables() is deprecated
    self.sess.run(tf.global_variables_initializer())
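
# The OrnsteinUhlenbeckProcess class used above is not shown in this excerpt.
# Below is a minimal sketch consistent with the theta=0.15 / sigma=0.3 defaults
# declared in __init__; the constructor signature (size, theta, sigma, mu, dt)
# and method names are assumptions, not the project's actual implementation.
import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated noise for exploration in continuous action spaces."""

    def __init__(self, size, theta=0.15, sigma=0.3, mu=0.0, dt=1.0):
        self.size = size
        self.theta = theta    # mean-reversion rate
        self.sigma = sigma    # noise scale
        self.mu = mu          # long-run mean
        self.dt = dt
        self.reset()

    def reset(self):
        # Restart the process at its mean (e.g. at the start of each episode)
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x += dx
        return self.x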
# Assumed module-level imports for this excerpt (TF1-era Keras):
#   import tensorflow as tf
#   from keras.optimizers import Adam
# plus project-local ReplayBuffer, Actor, Critic.
def __init__(self, env, sess, low_action_bound_list, high_action_bound_list):
    self.env = env
    self.sess = sess
    self.low_action_bound_list = low_action_bound_list  # depends on the env
    self.high_action_bound_list = high_action_bound_list
    self.action_range_bound = [
        hi - lo for hi, lo in zip(self.high_action_bound_list,
                                  self.low_action_bound_list)
    ]

    # Hyperparameters
    self.learning_rate = 0.0001
    self.exploration_noise = 0.1
    self.gamma = 0.90    # discount factor
    self.tau = 0.01      # soft target-update rate
    self.buffer_size = 10000
    self.batch_size = 128
    self.policy_noise = 0.1     # stddev of target-policy smoothing noise
    self.noise_clip = 0.05      # clip range for the smoothing noise
    self.exploration_episodes = 10
    # self.policy_freq = 2      # delayed policy updates (currently unused)

    self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = len(self.low_action_bound_list)  # TODO: derive from self.env.action_space
    self.continuous_action_space = True

    # Initialize replay buffer
    self.replay_buffer = ReplayBuffer(self.buffer_size)

    # Creating ACTOR model
    actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
    self.actor_state_input, self.actor_model = actor_.create_actor_model()
    _, self.target_actor_model = actor_.create_actor_model()

    # dQ/da from the critic, fed in at train time
    self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_dim])
    actor_model_weights = self.actor_model.trainable_weights
    # Chain rule: dQ/dtheta = dQ/da * da/dtheta (negated so Adam ascends Q)
    self.actor_grads = tf.gradients(self.actor_model.output,
                                    actor_model_weights,
                                    -self.actor_critic_grad)
    grads = zip(self.actor_grads, actor_model_weights)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

    # Creating FIRST CRITIC model; this is the one we train/optimize against
    critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
    self.critic_state_input, self.critic_action_input, self.critic_model = \
        critic_.create_critic_model()
    # Original code passed loss=''; 'mse' is the standard critic loss here
    self.critic_model.compile(optimizer=Adam(lr=critic_.learning_rate),
                              loss='mse')
    _, _, self.target_critic_model = critic_.create_critic_model()
    self.target_critic_model.compile(optimizer=Adam(lr=critic_.learning_rate),
                                     loss='mse')

    # Use the first critic head's Q-value for the deterministic policy gradient
    self.critic_grads = tf.gradients(self.critic_model.output[0],
                                     self.critic_action_input)

    # tf.initialize_all_variables() is deprecated
    self.sess.run(tf.global_variables_initializer())
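
# The policy_noise / noise_clip hyperparameters above correspond to TD3-style
# target-policy smoothing. The train() method is not shown in this excerpt, so
# the sketch below is an assumption about how those values would be applied
# when computing target actions; the function name and signature are hypothetical.
import numpy as np

def smoothed_target_actions(target_actions, policy_noise, noise_clip,
                            low_bounds, high_bounds):
    """Add clipped Gaussian noise to target-policy actions (TD3 smoothing)."""
    noise = np.clip(
        np.random.normal(0.0, policy_noise, size=target_actions.shape),
        -noise_clip, noise_clip)
    # Keep the perturbed actions inside the environment's action bounds
    return np.clip(target_actions + noise, low_bounds, high_bounds)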