def __init__(self, state_size, action_size):
    l1_size = simple_actor_network.l1_size
    l2_size = simple_actor_network.l2_size
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf.Session()
        self.state_input = tf.placeholder(tf.float32, [None, state_size])
        # Online weights: hidden layers use a 1/sqrt(fan-in) uniform
        # initialization; the output layer uses a small range so initial
        # actions stay near zero.
        self.W1 = tf.Variable(tf.random_uniform(
            [state_size, l1_size],
            -1 / math.sqrt(state_size), 1 / math.sqrt(state_size)))
        self.W2 = tf.Variable(tf.random_uniform(
            [l1_size, l2_size],
            -1 / math.sqrt(l1_size), 1 / math.sqrt(l1_size)))
        self.W3 = tf.Variable(tf.random_uniform(
            [l2_size, action_size], -0.0003, 0.0003))
        self.b1 = tf.Variable(tf.random_uniform(
            [l1_size], -1 / math.sqrt(state_size), 1 / math.sqrt(state_size)))
        self.b2 = tf.Variable(tf.random_uniform(
            [l2_size], -1 / math.sqrt(l1_size), 1 / math.sqrt(l1_size)))
        self.b3 = tf.Variable(tf.random_uniform(
            [action_size], -0.0003, 0.0003))
        # Target weights: updated only by the soft-update ops below,
        # never by the optimizer.
        self.W1_target = tf.Variable(tf.zeros([state_size, l1_size]), trainable=False)
        self.W2_target = tf.Variable(tf.zeros([l1_size, l2_size]), trainable=False)
        self.W3_target = tf.Variable(tf.zeros([l2_size, action_size]), trainable=False)
        self.b1_target = tf.Variable(tf.zeros([l1_size]), trainable=False)
        self.b2_target = tf.Variable(tf.zeros([l2_size]), trainable=False)
        self.b3_target = tf.Variable(tf.zeros([action_size]), trainable=False)
        # Switches batch normalization between training and inference mode.
        self.bnTrain = tf.placeholder(tf.bool, [])
        # Online forward pass: affine -> batch norm -> nonlinearity.
        self.bef_x1 = tf.matmul(self.state_input, self.W1) + self.b1
        self.bn1 = batch_norm(self.bef_x1, l1_size, self.bnTrain, self.sess)
        self.x1 = tf.nn.softplus(self.bn1.xNorm)
        self.bef_x2 = tf.matmul(self.x1, self.W2) + self.b2
        self.bn2 = batch_norm(self.bef_x2, l2_size, self.bnTrain, self.sess)
        self.x2 = tf.nn.tanh(self.bn2.xNorm)
        self.action_output = tf.matmul(self.x2, self.W3) + self.b3
        # Target forward pass mirrors the online one, sharing each
        # batch_norm's statistics via the extra argument.
        self.bef_x1T = tf.matmul(self.state_input, self.W1_target) + self.b1_target
        self.bn1T = batch_norm(self.bef_x1T, l1_size, self.bnTrain, self.sess, self.bn1)
        self.x1_target = tf.nn.softplus(self.bn1T.xNorm)
        self.bef_x2T = tf.matmul(self.x1_target, self.W2_target) + self.b2_target
        self.bn2T = batch_norm(self.bef_x2T, l2_size, self.bnTrain, self.sess, self.bn2)
        self.x2_target = tf.nn.tanh(self.bn2T.xNorm)
        self.action_output_target = tf.matmul(self.x2_target, self.W3_target) + self.b3_target
        # dQ/da from the critic, fed in at training time.
        self.action_gradient = tf.placeholder(tf.float32, [None, action_size])
        self.params = [self.W1, self.W2, self.W3,
                       self.b1, self.b2, self.b3,
                       self.bn1.gamma, self.bn1.beta,
                       self.bn2.beta, self.bn2.gamma]
        # Deterministic policy gradient: backpropagate -dQ/da through the
        # actor to get ascent directions for the parameters.
        self.params_grad = tf.gradients(
            self.action_output, self.params, -self.action_gradient)
        self.adam = tf.train.AdamOptimizer(simple_actor_network.learning_rate)
        self.optimizer = tf.train.GradientDescentOptimizer(
            simple_actor_network.learning_rate)  # built but unused; Adam does the updates
        self.updater = self.adam.apply_gradients(zip(self.params_grad, self.params))
        init = tf.initialize_all_variables()  # pre-1.0 API; tf.global_variables_initializer() in TF >= 0.12
        self.sess.run(init)
        # Start the target network as an exact copy of the online one.
        self.sess.run([self.W1_target.assign(self.W1),
                       self.W2_target.assign(self.W2),
                       self.W3_target.assign(self.W3),
                       self.b1_target.assign(self.b1),
                       self.b2_target.assign(self.b2),
                       self.b3_target.assign(self.b3)])
        # Soft updates: target <- (1 - ts) * target + ts * online.
        ts = simple_actor_network.ts
        self.upTargW1 = self.W1_target.assign(self.W1_target * (1 - ts) + self.W1 * ts)
        self.upTargW2 = self.W2_target.assign(self.W2_target * (1 - ts) + self.W2 * ts)
        self.upTargW3 = self.W3_target.assign(self.W3_target * (1 - ts) + self.W3 * ts)
        self.upTargb1 = self.b1_target.assign(self.b1_target * (1 - ts) + self.b1 * ts)
        self.upTargb2 = self.b2_target.assign(self.b2_target * (1 - ts) + self.b2 * ts)
        self.upTargb3 = self.b3_target.assign(self.b3_target * (1 - ts) + self.b3 * ts)
        self.batch_state = []
        self.batch_actgrad = []
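# --- Usage sketch (not part of the original class) ----------------------
# A minimal illustration of how the ops defined above would be driven,
# assuming `actor` is a constructed instance and `states` / `action_grads`
# are NumPy batches of shape [batch, state_size] and [batch, action_size].
# The function names are hypothetical.

def actor_train_step(actor, states, action_grads):
    # One Adam step along the deterministic policy gradient: the critic's
    # dQ/da is fed in and backpropagated (negated) through the actor.
    actor.sess.run(actor.updater, feed_dict={
        actor.state_input: states,
        actor.action_gradient: action_grads,
        actor.bnTrain: True,
    })

def actor_soft_update(actor):
    # Move each target variable a fraction `ts` toward the online weights.
    actor.sess.run([actor.upTargW1, actor.upTargW2, actor.upTargW3,
                    actor.upTargb1, actor.upTargb2, actor.upTargb3])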
def __init__(self, state_size, action_size, action_bound=None):
    # action_bound is accepted but unused here.
    l1_size = simple_critic_network.l1_size
    l2_size = simple_critic_network.l2_size
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf.Session()
        self.state_input = tf.placeholder(tf.float32, [None, state_size])
        self.action_input = tf.placeholder(tf.float32, [None, action_size])
        # Online weights; the action feeds in at the second layer, so its
        # fan-in (l1_size + action_size) sets the init range there.
        self.W1 = tf.Variable(tf.random_uniform(
            [state_size, l1_size],
            -1 / math.sqrt(state_size), 1 / math.sqrt(state_size)))
        self.W2 = tf.Variable(tf.random_uniform(
            [l1_size, l2_size],
            -1 / math.sqrt(l1_size + action_size), 1 / math.sqrt(l1_size + action_size)))
        self.W2_action = tf.Variable(tf.random_uniform(
            [action_size, l2_size],
            -1 / math.sqrt(l1_size + action_size), 1 / math.sqrt(l1_size + action_size)))
        self.W3 = tf.Variable(tf.random_uniform([l2_size, 1], -0.0003, 0.0003))
        self.b1 = tf.Variable(tf.random_uniform(
            [l1_size], -1 / math.sqrt(state_size), 1 / math.sqrt(state_size)))
        self.b2 = tf.Variable(tf.random_uniform(
            [l2_size], -1 / math.sqrt(l1_size + action_size), 1 / math.sqrt(l1_size + action_size)))
        self.b3 = tf.Variable(tf.random_uniform([1], -0.0003, 0.0003))
        # Target weights: updated only by the soft-update ops below.
        self.W1_target = tf.Variable(tf.zeros([state_size, l1_size]), trainable=False)
        self.W2_target = tf.Variable(tf.zeros([l1_size, l2_size]), trainable=False)
        self.W2_action_target = tf.Variable(tf.zeros([action_size, l2_size]), trainable=False)
        self.W3_target = tf.Variable(tf.zeros([l2_size, 1]), trainable=False)
        self.b1_target = tf.Variable(tf.zeros([l1_size]), trainable=False)
        self.b2_target = tf.Variable(tf.zeros([l2_size]), trainable=False)
        self.b3_target = tf.Variable(tf.zeros([1]), trainable=False)
        # Switches batch normalization between training and inference mode.
        self.bnTrain = tf.placeholder(tf.bool, [])
        # Online forward pass: the action joins at the second layer.
        self.bef_x1 = tf.matmul(self.state_input, self.W1) + self.b1
        self.bn1 = batch_norm(self.bef_x1, l1_size, self.bnTrain, self.sess)
        self.x1 = tf.nn.softplus(self.bn1.xNorm)
        self.bef_x2 = (tf.matmul(self.x1, self.W2)
                       + tf.matmul(self.action_input, self.W2_action) + self.b2)
        self.bn2 = batch_norm(self.bef_x2, l2_size, self.bnTrain, self.sess)
        self.x2 = tf.nn.softplus(self.bn2.xNorm)
        self.qval_output = tf.matmul(self.x2, self.W3) + self.b3
        # Target forward pass, sharing batch-norm statistics with the online pass.
        self.bef_x1T = tf.matmul(self.state_input, self.W1_target) + self.b1_target
        self.bn1T = batch_norm(self.bef_x1T, l1_size, self.bnTrain, self.sess, self.bn1)
        self.x1_target = tf.nn.softplus(self.bn1T.xNorm)
        self.bef_x2T = (tf.matmul(self.x1_target, self.W2_target)
                        + tf.matmul(self.action_input, self.W2_action_target) + self.b2_target)
        self.bn2T = batch_norm(self.bef_x2T, l2_size, self.bnTrain, self.sess, self.bn2)
        self.x2_target = tf.nn.softplus(self.bn2T.xNorm)
        self.qval_output_target = tf.matmul(self.x2_target, self.W3_target) + self.b3_target
        # dQ/da, averaged over the batch, for the actor's policy gradient.
        self.act_grad_v = tf.gradients(self.qval_output, self.action_input)
        self.act_grad = [self.act_grad_v[0] / tf.to_float(tf.shape(self.act_grad_v[0])[0])]
        self.qval_train = tf.placeholder(tf.float32, [None, 1])
        # Per-sample squared TD error divided by batch size, plus L2
        # regularization on the second layer; minimizing this non-scalar
        # tensor sums its elements, yielding the mean squared error overall.
        self.diff = (tf.pow(self.qval_output - self.qval_train, 2)
                     / tf.to_float(tf.shape(self.qval_train)[0])
                     + 0.01 * tf.reduce_sum(tf.pow(self.W2, 2))
                     + 0.01 * tf.reduce_sum(tf.pow(self.b2, 2)))
        self.adam = tf.train.AdamOptimizer(simple_critic_network.learning_rate)
        self.optimizer = self.adam.minimize(self.diff)
        init = tf.initialize_all_variables()  # pre-1.0 API; tf.global_variables_initializer() in TF >= 0.12
        self.sess.run(init)
        # Start the target network as an exact copy of the online one.
        self.sess.run([self.W1_target.assign(self.W1),
                       self.W2_target.assign(self.W2),
                       self.W2_action_target.assign(self.W2_action),
                       self.W3_target.assign(self.W3),
                       self.b1_target.assign(self.b1),
                       self.b2_target.assign(self.b2),
                       self.b3_target.assign(self.b3)])
        # Soft updates: target <- (1 - ts) * target + ts * online.
        ts = simple_critic_network.ts
        self.upTargW1 = self.W1_target.assign(self.W1_target * (1 - ts) + self.W1 * ts)
        self.upTargW2 = self.W2_target.assign(self.W2_target * (1 - ts) + self.W2 * ts)
        self.upTargW2a = self.W2_action_target.assign(self.W2_action_target * (1 - ts) + self.W2_action * ts)
        self.upTargW3 = self.W3_target.assign(self.W3_target * (1 - ts) + self.W3 * ts)
        self.upTargb1 = self.b1_target.assign(self.b1_target * (1 - ts) + self.b1 * ts)
        self.upTargb2 = self.b2_target.assign(self.b2_target * (1 - ts) + self.b2 * ts)
        self.upTargb3 = self.b3_target.assign(self.b3_target * (1 - ts) + self.b3 * ts)
        self.batch_state = []
        self.batch_action = []
        self.batch_val = []
        self.gamma = 0.99
        # Bellman target y = r + gamma * Q'(s', mu'(s')), assembled in-graph.
        self.rewards = tf.placeholder(tf.float32, [None, 1])
        self.q_vals_batch = tf.placeholder(tf.float32, [None, 1])
        self.y_opp = self.rewards + self.q_vals_batch * self.gamma
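# --- Usage sketch (not part of the original class) ----------------------
# How the Bellman target y = r + gamma * Q'(s', mu'(s')) and one critic
# update might be assembled from the ops above. `actor` is assumed to be a
# simple_actor_network instance, all arrays NumPy batches with rewards
# shaped [batch, 1], and terminal transitions are left to the caller.
# Function and argument names are hypothetical.

def critic_train_step(critic, actor, states, actions, rewards, next_states):
    # mu'(s') from the actor's target network (separate graph and session).
    next_actions = actor.sess.run(actor.action_output_target, feed_dict={
        actor.state_input: next_states, actor.bnTrain: False})
    # Q'(s', mu'(s')) from the critic's target network.
    q_next = critic.sess.run(critic.qval_output_target, feed_dict={
        critic.state_input: next_states,
        critic.action_input: next_actions,
        critic.bnTrain: False})
    # y = rewards + gamma * q_next, via the y_opp op defined above.
    y = critic.sess.run(critic.y_opp, feed_dict={
        critic.rewards: rewards, critic.q_vals_batch: q_next})
    # Minimize the regularized TD error on the online network.
    critic.sess.run(critic.optimizer, feed_dict={
        critic.state_input: states,
        critic.action_input: actions,
        critic.qval_train: y,
        critic.bnTrain: True})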