def __init__(self, state_size, action_size):
        l1_size = simple_actor_network.l1_size
        l2_size = simple_actor_network.l2_size

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session()

            self.state_input = tf.placeholder(tf.float32, [None, state_size])

            self.W1 = weight_norm(
                state_size, l1_size,
                [-1 / math.sqrt(state_size), 1 / math.sqrt(state_size)],
                self.graph)
            self.W2 = weight_norm(
                l1_size, l2_size,
                [-1 / math.sqrt(l1_size), 1 / math.sqrt(l1_size)], self.graph)
            self.W3 = weight_norm(l2_size, action_size, [-0.0003, 0.0003],
                                  self.graph)

            self.b1 = tf.Variable(
                tf.random_uniform([l1_size], -1 / math.sqrt(state_size),
                                  1 / math.sqrt(state_size)))
            self.b2 = tf.Variable(
                tf.random_uniform([l2_size], -1 / math.sqrt(l1_size),
                                  1 / math.sqrt(l1_size)))
            self.b3 = tf.Variable(
                tf.random_uniform([action_size], -0.0003, 0.0003))

            self.W1_target = tf.Variable(tf.zeros([state_size, l1_size]))
            self.W2_target = tf.Variable(tf.zeros([l1_size, l2_size]))
            self.W3_target = tf.Variable(tf.zeros([l2_size, action_size]))

            self.b1_target = tf.Variable(tf.zeros([l1_size]))
            self.b2_target = tf.Variable(tf.zeros([l2_size]))
            self.b3_target = tf.Variable(tf.zeros([action_size]))

            self.x1 = tf.nn.softplus(
                tf.matmul(self.state_input, self.W1.w) + self.b1)
            self.x2 = tf.nn.tanh(tf.matmul(self.x1, self.W2.w) + self.b2)
            self.action_output = tf.matmul(self.x2, self.W3.w) + self.b3

            self.x1_target = tf.nn.softplus(
                tf.matmul(self.state_input, self.W1_target) + self.b1_target)
            self.x2_target = tf.nn.tanh(
                tf.matmul(self.x1_target, self.W2_target) + self.b2_target)
            self.action_output_target = tf.matmul(
                self.x2_target, self.W3_target) + self.b3_target

            # dQ/da supplied by the critic for each sampled state.
            self.action_gradient = tf.placeholder(tf.float32,
                                                  [None, action_size])
            self.params = [
                self.W1.v, self.W1.g, self.W2.v, self.W2.g, self.W3.v,
                self.W3.g, self.b1, self.b2, self.b3
            ]
            # Deterministic policy gradient: d(action_output)/d(params),
            # weighted by -dQ/da so that applying these "gradients" ascends Q.
            self.params_grad = tf.gradients(self.action_output, self.params,
                                            -self.action_gradient)

            self.adam = tf.train.AdamOptimizer(
                simple_actor_network.learning_rate)
            #self.optimizer = tf.train.GradientDescentOptimizer(simple_actor_network.learning_rate)
            self.updater = self.adam.apply_gradients(
                zip(self.params_grad, self.params))

            init = tf.initialize_all_variables()
            self.sess.run(init)

            self.sess.run([
                self.W1_target.assign(self.W1.w),
                self.W2_target.assign(self.W2.w),
                self.W3_target.assign(self.W3.w),
                self.b1_target.assign(self.b1),
                self.b2_target.assign(self.b2),
                self.b3_target.assign(self.b3)
            ])

            # Soft target updates: theta' <- (1 - ts) * theta' + ts * theta.
            self.upTargW1 = self.W1_target.assign(
                self.W1_target * (1 - simple_actor_network.ts) + self.W1.w *
                (simple_actor_network.ts))
            self.upTargW2 = self.W2_target.assign(
                self.W2_target * (1 - simple_actor_network.ts) + self.W2.w *
                (simple_actor_network.ts))
            self.upTargW3 = self.W3_target.assign(
                self.W3_target * (1 - simple_actor_network.ts) + self.W3.w *
                (simple_actor_network.ts))

            self.upTargb1 = self.b1_target.assign(
                self.b1_target * (1 - simple_actor_network.ts) + self.b1 *
                (simple_actor_network.ts))
            self.upTargb2 = self.b2_target.assign(
                self.b2_target * (1 - simple_actor_network.ts) + self.b2 *
                (simple_actor_network.ts))
            self.upTargb3 = self.b3_target.assign(
                self.b3_target * (1 - simple_actor_network.ts) + self.b3 *
                (simple_actor_network.ts))

            #        init = tf.initialize_variables([self.W1_target, self.W2_target, self.W3_target, self.b1_target, self.b2_target, self.b3_target])
            #
            #        self.sess.run(init)

            self.batch_state = []
            self.batch_actgrad = []
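# --- Usage sketch (not part of the original example) -------------------------
# The actor above is trained with the deterministic policy gradient: the critic
# supplies dQ/da for a batch of states, and `updater` applies the resulting
# parameter gradients. The helper below is a minimal, assumed illustration of
# that step; `actor` is an instance of the class above, `states` has shape
# [batch, state_size] and `action_grads` has shape [batch, action_size].
def actor_train_step(actor, states, action_grads):
    # One Adam step that ascends Q through the policy parameters.
    actor.sess.run(actor.updater,
                   feed_dict={actor.state_input: states,
                              actor.action_gradient: action_grads})
    # Soft update of the target network parameters.
    actor.sess.run([actor.upTargW1, actor.upTargW2, actor.upTargW3,
                    actor.upTargb1, actor.upTargb2, actor.upTargb3])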
Example #2
    def __init__(self, state_size, action_size, action_bound=None):
        l1_size = simple_critic_network.l1_size
        l2_size = simple_critic_network.l2_size
        
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session()

            
            self.state_input = tf.placeholder(tf.float32, [None, state_size])
            self.action_input = tf.placeholder(tf.float32, [None, action_size])
            #self.action_input_1d = tf.placeholder(tf.float32, [action_size])
    
            self.W1 = weight_norm(state_size, l1_size, [-1/math.sqrt(state_size), 1/math.sqrt(state_size)], self.graph).w
            self.W2 = weight_norm(l1_size, l2_size,[-1/math.sqrt(l1_size+action_size), 1/math.sqrt(l1_size+action_size)], self.graph).w
            self.W2_action = weight_norm(action_size, l2_size,[-1/math.sqrt(l1_size+action_size), 1/math.sqrt(l1_size+action_size)], self.graph).w
            self.W3 = weight_norm(l2_size, 1, [-0.0003, 0.0003], self.graph).w
    
            self.b1 = tf.Variable(tf.random_uniform([l1_size], -1/math.sqrt(state_size), 1/math.sqrt(state_size)))
            self.b2 = tf.Variable(tf.random_uniform([l2_size], -1/math.sqrt(l1_size+action_size), 1/math.sqrt(l1_size+action_size)))
            self.b3 = tf.Variable(tf.random_uniform([1], -0.0003, 0.0003))
    
            self.W1_target = tf.Variable(tf.zeros([state_size, l1_size]))
            self.W2_target = tf.Variable(tf.zeros([l1_size, l2_size]))
            self.W2_action_target = tf.Variable(tf.zeros([action_size, l2_size]))
            self.W3_target = tf.Variable(tf.zeros([l2_size, 1]))
            
            self.b1_target = tf.Variable(tf.zeros([l1_size]))
            self.b2_target = tf.Variable(tf.zeros([l2_size]))
            self.b3_target = tf.Variable(tf.zeros([1]))
            
            # Online critic Q(s, a): the action enters at the second layer.
            self.x1 = tf.nn.softplus(tf.matmul(self.state_input, self.W1) + self.b1)
            self.x2 = tf.nn.softplus(tf.matmul(self.x1, self.W2) +
                                     tf.matmul(self.action_input, self.W2_action) + self.b2)
            self.qval_output = tf.matmul(self.x2, self.W3) + self.b3

            # Target critic Q'(s, a) with slowly tracking parameters.
            self.x1_target = tf.nn.softplus(tf.matmul(self.state_input, self.W1_target) + self.b1_target)
            self.x2_target = tf.nn.softplus(tf.matmul(self.x1_target, self.W2_target) +
                                            tf.matmul(self.action_input, self.W2_action_target) + self.b2_target)
            self.qval_output_target = tf.matmul(self.x2_target, self.W3_target) + self.b3_target

            # dQ/da, averaged over the batch; fed back to the actor update.
            self.act_grad_v = tf.gradients(self.qval_output, self.action_input)
            self.act_grad = [self.act_grad_v[0] / tf.to_float(tf.shape(self.act_grad_v[0])[0])]

            # Squared TD error plus L2 regularisation on the second layer.
            self.qval_train = tf.placeholder(tf.float32, [None, 1])
            self.diff = (tf.pow(self.qval_output - self.qval_train, 2) /
                         tf.to_float(tf.shape(self.qval_train)[0]) +
                         0.01 * tf.reduce_sum(tf.pow(self.W2, 2)) +
                         0.01 * tf.reduce_sum(tf.pow(self.b2, 2)))
            #self.params = [self.W1, self.W2, self.W2_action, self.W3, self.b1, self.b2, self.b3]
            #self.params_grad = tf.gradients(self.diff, self.params)
            
            self.adam = tf.train.AdamOptimizer(simple_critic_network.learning_rate)       
            self.optimizer = self.adam.minimize(self.diff)
            
            init = tf.initialize_all_variables()
            self.sess.run(init)
            
    
            self.sess.run([self.W1_target.assign(self.W1),
                           self.W2_target.assign(self.W2),
                           self.W2_action_target.assign(self.W2_action),
                           self.W3_target.assign(self.W3),
                           self.b1_target.assign(self.b1),
                           self.b2_target.assign(self.b2),
                           self.b3_target.assign(self.b3) ])  
            
            #self.optimizer = tf.train.GradientDescentOptimizer(simple_critic_network.learning_rate).minimize(self.diff)
            #self.updater = self.optimizer.apply_gradients(zip(self.params_grad, self.params))        
            
            # Soft target updates: theta' <- (1 - ts) * theta' + ts * theta.
            self.upTargW1 = self.W1_target.assign(
                self.W1_target * (1 - simple_critic_network.ts) + self.W1 * simple_critic_network.ts)
            self.upTargW2 = self.W2_target.assign(
                self.W2_target * (1 - simple_critic_network.ts) + self.W2 * simple_critic_network.ts)
            self.upTargW2a = self.W2_action_target.assign(
                self.W2_action_target * (1 - simple_critic_network.ts) + self.W2_action * simple_critic_network.ts)
            self.upTargW3 = self.W3_target.assign(
                self.W3_target * (1 - simple_critic_network.ts) + self.W3 * simple_critic_network.ts)

            self.upTargb1 = self.b1_target.assign(
                self.b1_target * (1 - simple_critic_network.ts) + self.b1 * simple_critic_network.ts)
            self.upTargb2 = self.b2_target.assign(
                self.b2_target * (1 - simple_critic_network.ts) + self.b2 * simple_critic_network.ts)
            self.upTargb3 = self.b3_target.assign(
                self.b3_target * (1 - simple_critic_network.ts) + self.b3 * simple_critic_network.ts)
            
    #        init = tf.initialize_variables([self.W1_target, self.W2_target, self.W2_action_target, self.W3_target, self.b1_target, self.b2_target, self.b3_target])        
    #        
    #        self.sess.run(init)
            
            self.batch_state = []
            self.batch_action = []
            self.batch_val = []
            
            
            # TD target for the critic: y = r + gamma * Q'(s', mu'(s')).
            self.gamma = 0.99
            self.rewards = tf.placeholder(tf.float32, [None, 1])
            self.q_vals_batch = tf.placeholder(tf.float32, [None, 1])
            self.y_opp = self.rewards + self.q_vals_batch * self.gamma
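# --- Usage sketch (not part of the original example) -------------------------
# One critic update, assuming `critic` is an instance of the class above.
# `states`, `actions`, `rewards` come from a replay batch, and `next_q_targets`
# would typically be obtained by running `qval_output_target` on the next
# states with actions from the target actor; all names here are illustrative.
def critic_train_step(critic, states, actions, rewards, next_q_targets):
    # TD target y = r + gamma * Q'(s', mu'(s')), built from the ops above.
    y = critic.sess.run(critic.y_opp,
                        feed_dict={critic.rewards: rewards,
                                   critic.q_vals_batch: next_q_targets})
    # Minimise the regularised squared TD error.
    critic.sess.run(critic.optimizer,
                    feed_dict={critic.state_input: states,
                               critic.action_input: actions,
                               critic.qval_train: y})
    # Soft update of the target network parameters.
    critic.sess.run([critic.upTargW1, critic.upTargW2, critic.upTargW2a,
                     critic.upTargW3, critic.upTargb1, critic.upTargb2,
                     critic.upTargb3])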
Example #3
    def __init__(self, state_size, action_size, action_bound=None, l1_size=300, l2_size=200, learning_rate=0.0001):
        
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session()

            
            self.state_input = tf.placeholder(tf.float32, [None, state_size])
            self.action_input = tf.placeholder(tf.float32, [None, action_size])
            #self.action_input_1d = tf.placeholder(tf.float32, [action_size])
    
            self.W1 = weight_norm(state_size, l1_size, [-1/math.sqrt(state_size), 1/math.sqrt(state_size)], self.graph).w
            self.W2 = weight_norm(l1_size, l2_size,[-1/math.sqrt(l1_size+action_size), 1/math.sqrt(l1_size+action_size)], self.graph).w
            self.W2_action = weight_norm(action_size, l2_size,[-1/math.sqrt(l1_size+action_size), 1/math.sqrt(l1_size+action_size)], self.graph).w
            self.W3 = weight_norm(l2_size, 1, [-0.0003, 0.0003], self.graph).w
    
            self.b1 = tf.Variable(tf.random_uniform([l1_size], -1/math.sqrt(state_size), 1/math.sqrt(state_size)))
            self.b2 = tf.Variable(tf.random_uniform([l2_size], -1/math.sqrt(l1_size+action_size), 1/math.sqrt(l1_size+action_size)))
            self.b3 = tf.Variable(tf.random_uniform([1], -0.0003, 0.0003))
    
            self.W1_target = tf.Variable(tf.zeros([state_size, l1_size]))
            self.W2_target = tf.Variable(tf.zeros([l1_size, l2_size]))
            self.W2_action_target = tf.Variable(tf.zeros([action_size, l2_size]))
            self.W3_target = tf.Variable(tf.zeros([l2_size, 1]))
            
            self.b1_target = tf.Variable(tf.zeros([l1_size]))
            self.b2_target = tf.Variable(tf.zeros([l2_size]))
            self.b3_target = tf.Variable(tf.zeros([1]))
            
            # Online critic Q(s, a): the action enters at the second layer.
            self.x1 = tf.nn.softplus(tf.matmul(self.state_input, self.W1) + self.b1)
            self.x2 = tf.nn.softplus(tf.matmul(self.x1, self.W2) +
                                     tf.matmul(self.action_input, self.W2_action) + self.b2)
            self.qval_output = tf.matmul(self.x2, self.W3) + self.b3

            # Target critic Q'(s, a) with slowly tracking parameters.
            self.x1_target = tf.nn.softplus(tf.matmul(self.state_input, self.W1_target) + self.b1_target)
            self.x2_target = tf.nn.softplus(tf.matmul(self.x1_target, self.W2_target) +
                                            tf.matmul(self.action_input, self.W2_action_target) + self.b2_target)
            self.qval_output_target = tf.matmul(self.x2_target, self.W3_target) + self.b3_target

            # dQ/da, averaged over the batch; fed back to the actor update.
            self.act_grad_v = tf.gradients(self.qval_output, self.action_input)
            self.act_grad = [self.act_grad_v[0] / tf.to_float(tf.shape(self.act_grad_v[0])[0])]

            # Squared TD error plus L2 regularisation on the second layer.
            self.qval_train = tf.placeholder(tf.float32, [None, 1])
            self.diff = (tf.pow(self.qval_output - self.qval_train, 2) /
                         tf.to_float(tf.shape(self.qval_train)[0]) +
                         0.01 * tf.reduce_sum(tf.pow(self.W2, 2)) +
                         0.01 * tf.reduce_sum(tf.pow(self.b2, 2)))
            #self.params = [self.W1, self.W2, self.W2_action, self.W3, self.b1, self.b2, self.b3]
            #self.params_grad = tf.gradients(self.diff, self.params)
            
            self.adam = tf.train.AdamOptimizer(learning_rate)       
            self.optimizer = self.adam.minimize(self.diff)
            
            init = tf.initialize_all_variables()
            self.sess.run(init)
            
    
            self.sess.run([self.W1_target.assign(self.W1),
                           self.W2_target.assign(self.W2),
                           self.W2_action_target.assign(self.W2_action),
                           self.W3_target.assign(self.W3),
                           self.b1_target.assign(self.b1),
                           self.b2_target.assign(self.b2),
                           self.b3_target.assign(self.b3) ])  
            
            #self.optimizer = tf.train.GradientDescentOptimizer(simple_critic_network.learning_rate).minimize(self.diff)
            #self.updater = self.optimizer.apply_gradients(zip(self.params_grad, self.params))        
            
            # Soft target updates (note: ts/tau is still read from the class attribute).
            self.upTargW1 = self.W1_target.assign(
                self.W1_target * (1 - simple_critic_network.ts) + self.W1 * simple_critic_network.ts)
            self.upTargW2 = self.W2_target.assign(
                self.W2_target * (1 - simple_critic_network.ts) + self.W2 * simple_critic_network.ts)
            self.upTargW2a = self.W2_action_target.assign(
                self.W2_action_target * (1 - simple_critic_network.ts) + self.W2_action * simple_critic_network.ts)
            self.upTargW3 = self.W3_target.assign(
                self.W3_target * (1 - simple_critic_network.ts) + self.W3 * simple_critic_network.ts)

            self.upTargb1 = self.b1_target.assign(
                self.b1_target * (1 - simple_critic_network.ts) + self.b1 * simple_critic_network.ts)
            self.upTargb2 = self.b2_target.assign(
                self.b2_target * (1 - simple_critic_network.ts) + self.b2 * simple_critic_network.ts)
            self.upTargb3 = self.b3_target.assign(
                self.b3_target * (1 - simple_critic_network.ts) + self.b3 * simple_critic_network.ts)
            
    #        init = tf.initialize_variables([self.W1_target, self.W2_target, self.W2_action_target, self.W3_target, self.b1_target, self.b2_target, self.b3_target])        
    #        
    #        self.sess.run(init)
            
            self.batch_state = []
            self.batch_action = []
            self.batch_val = []
            
            
            # TD target for the critic: y = r + gamma * Q'(s', mu'(s')).
            self.gamma = 0.99
            self.rewards = tf.placeholder(tf.float32, [None, 1])
            self.q_vals_batch = tf.placeholder(tf.float32, [None, 1])
            self.y_opp = self.rewards + self.q_vals_batch * self.gamma
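# --- Assumed helpers (not shown in the original examples) --------------------
# All of the examples above rely on a `weight_norm` helper exposing `.v`, `.g`
# and `.w`, and on class-level hyperparameters `l1_size`, `l2_size`,
# `learning_rate` and `ts`. The sketch below shows one plausible definition,
# assuming weight normalisation in the sense of Salimans & Kingma
# (w = g * v / ||v||); the concrete values are placeholders, not taken from
# the original source.
import math  # used by the examples above for the fan-in initialisation ranges
import tensorflow as tf  # the examples assume the TensorFlow 1.x API


class weight_norm(object):
    """Weight-normalised matrix w = g * v / ||v|| (assumed helper)."""

    def __init__(self, in_size, out_size, init_range, graph):
        with graph.as_default():
            # Direction parameter v, initialised uniformly in init_range.
            self.v = tf.Variable(
                tf.random_uniform([in_size, out_size],
                                  init_range[0], init_range[1]))
            # Per-output-unit scale g.
            self.g = tf.Variable(tf.ones([out_size]))
            # Effective weight used in the matmuls of the examples above.
            self.w = self.g * tf.nn.l2_normalize(self.v, 0)


class simple_actor_network(object):
    # Placeholder hyperparameters; only the attribute names are taken from
    # the examples above.
    l1_size = 300
    l2_size = 200
    learning_rate = 0.0001
    ts = 0.001  # soft target-update rate (tau)


class simple_critic_network(object):
    # Same role for the critic examples; values again assumed.
    l1_size = 300
    l2_size = 200
    learning_rate = 0.001
    ts = 0.001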