def get_trainer(trial, dataloader):
    # Sample the encoder depth and the width of each hidden layer.
    n_layers = trial.suggest_categorical('n_layer', [2, 3, 4])
    hidden_dims = []
    for i in range(n_layers):
        hidden_dim = int(
            trial.suggest_loguniform('hidden_dim_{}'.format(i), 4, 256))
        hidden_dims.append(hidden_dim)
    model = GAE(39, hidden_dims)
    # Sample the learning rate on a log scale.
    lr = trial.suggest_loguniform('lr', 1e-6, 1e-2)
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    trainer = Trainer(model, optim, dataloader)
    return trainer
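# `get_trainer` above builds an Optuna search space (layer count, hidden widths,
# learning rate). A minimal sketch of how it might be plugged into a study
# follows; the objective below, the `run_epoch`/`validate` Trainer methods, the
# trial count, and the in-scope `dataloader` are assumptions for illustration,
# not part of the original code.
import optuna

def objective(trial):
    trainer = get_trainer(trial, dataloader)
    for _ in range(5):
        trainer.run_epoch()          # hypothetical: one pass over the data
    return trainer.validate()        # hypothetical: validation loss to minimize

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params)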
def main():
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model = GAE(args.in_dim, args.hidden_dims)
    model.to(device)
    print('Loading data')
    with open(args.data_file, 'rb') as f:
        graphs = dill.load(f)
    print('Loaded {} molecules'.format(len(graphs)))
    train_graphs, val_graphs = train_test_split(graphs, test_size=10000)
    train_dataset = MolDataset(train_graphs)
    val_dataset = MolDataset(val_graphs)
    del train_graphs, val_graphs
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=True, collate_fn=collate)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size,
                            shuffle=False, collate_fn=collate)
    trainer = Trainer(model, args)
    train_losses, val_losses = [], []
    print('Training Start')
    for epoch in tqdm(range(args.n_epochs)):
        train_loss = 0
        model.train()
        for bg in tqdm(train_loader):
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            train_loss += trainer.iteration(bg)
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        trainer.save(epoch, args.save_dir)

        val_loss = 0
        model.eval()
        for bg in val_loader:
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            val_loss += trainer.iteration(bg, train=False)
        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        print('Epoch: {:02d} | Train Loss: {:.4f} | Validation Loss: {:.4f}'.format(
            epoch, train_loss, val_loss))
    plot(train_losses, val_losses)
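# `main` finishes with `plot(train_losses, val_losses)` and the save directory
# is created with a 'zinc250k.png' file in mind, but the helper itself is not
# shown here. The sketch below is only a plausible stand-in (saving the two
# loss curves to that file), not the author's implementation.
import os
import matplotlib.pyplot as plt

def plot(train_losses, val_losses):
    plt.figure()
    plt.plot(train_losses, label='train')
    plt.plot(val_losses, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.grid()
    plt.savefig(os.path.join(args.save_dir, 'zinc250k.png'))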
def main():
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    # TODO: train test split
    # load and preprocess dataset
    data = load_data(args)
    features = torch.FloatTensor(data.features)
    in_feats = features.shape[1]
    print(features.shape)
    model = GAE(in_feats, [32, 16])
    model.train()
    optim = torch.optim.Adam(model.parameters(), lr=1e-2)
    loss_function = BCELoss
    g = DGLGraph(data.graph)
    g.ndata['h'] = features
    n_epochs = 500
    losses = []
    print('Training Start')
    for epoch in tqdm(range(n_epochs)):
        g.ndata['h'] = features
        # normalization
        degs = g.in_degrees().float()
        norm = torch.pow(degs, -0.5)
        norm[torch.isinf(norm)] = 0
        g.ndata['norm'] = norm.unsqueeze(1)
        adj = g.adjacency_matrix().to_dense()
        # Up-weight positive (edge) entries, since the adjacency matrix is sparse.
        pos_weight = torch.Tensor(
            [float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
        adj_logits = model.forward(g)
        loss = loss_function(adj_logits, adj, pos_weight=pos_weight)
        optim.zero_grad()
        loss.backward()
        optim.step()
        losses.append(loss.item())
        print('Epoch: {:02d} | Loss: {:.5f}'.format(epoch, loss.item()))
    plt.plot(losses)
    plt.xlabel('iteration')
    plt.ylabel('train loss')
    plt.grid()
    plt.show()
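# The loss is invoked as `loss_function(adj_logits, adj, pos_weight=pos_weight)`,
# so `BCELoss` here is presumably a module-level function rather than
# `torch.nn.BCELoss`. A minimal stand-in consistent with that call signature,
# using PyTorch's weighted BCE-with-logits, is sketched below as an assumption
# about what it computes.
import torch.nn.functional as F

def BCELoss(logits, adj, pos_weight=None):
    # Binary cross-entropy over all adjacency entries, with positive (edge)
    # entries up-weighted by pos_weight to counter the sparsity of the graph.
    return F.binary_cross_entropy_with_logits(logits, adj, pos_weight=pos_weight)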
def build_model(self):
    self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
    self.action = tf.placeholder(tf.float32, [None, self.action_size])
    self.advantage = tf.placeholder(tf.float32, [None])
    # Mean of old action distribution
    self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size])
    self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size])
    # NN framework for action distribution
    self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)
    # Get trainable variables for the policy (NN weights)
    tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy')
    for i in tr_vrbs:
        print(i.op.name)
    # Construct distribution by repeating action_dist_logstd
    self.action_dist_logstd = tf.tile(action_dist_logstd,
                                      (tf.shape(action_dist_logstd)[0], 1))
    # Probability of action under old policy vs. new policy
    self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd,
                                 self.action)
    self.log_old_policy = LOG_POLICY(self.old_action_dist_mu,
                                     self.old_action_dist_logstd, self.action)
    policy_ratio = tf.exp(self.log_policy - self.log_old_policy)
    # Number of observations in batch
    batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32)
    '''
    Equation (14) in paper
    Contribution of a single s_n :
    Expectation over a~q[ (new policy / q(is)) * advantage_old ]
    '''
    surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage)
    # Define KL divergence and Shannon entropy, averaged over a set of inputs (policies)
    kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd,
                  self.action_dist_mu, self.action_dist_logstd) / batch_size
    ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size
    # Define 'loss' quantities to constrain or maximize
    self.losses = [surr_single_state, kl, ent]
    # Maximize surrogate function over policy parameter 'theta' represented by neural network weights
    self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)
    # KL divergence where first argument is fixed
    kl_first_fixed = GAUSS_KL_FIRST_FIX(self.action_dist_mu,
                                        self.action_dist_logstd) / batch_size
    # Gradient of KL divergence w.r.t. theta (NN policy weights)
    first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)
    self.flat_tangent = tf.placeholder(tf.float32, [None])
    tangent = list()
    start = 0
    for vrbs in tr_vrbs:
        variable_size = np.prod(vrbs.get_shape().as_list())
        param = tf.reshape(self.flat_tangent[start:(start + variable_size)],
                           vrbs.get_shape())
        tangent.append(param)
        start += variable_size
    '''
    Gradient of KL with tangent vector
    gradient_w_tangent : list of KL_prime*y for each variable
    '''
    gradient_w_tangent = [
        tf.reduce_sum(kl_g * t) for (kl_g, t) in zip(first_kl_grads, tangent)
    ]
    '''
    From derivative of KL_prime*y : [dKL/dx1, dKL/dx2, ...]*y
    y -> Ay, where A is an n-by-n matrix that is hard to form explicitly
    (numerically solving (n*n)*(n*1)), so first multiply the target 'y' into the
    gradient and then differentiate again.
    'self.FVP' returns [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y,
    i.e. (second derivative of KL divergence)*y for each variable => y -> JMJy
    (Fisher Vector Product)
    '''
    self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)
    # Get actual parameter values
    self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')
    # Set parameter values
    self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')
    # Estimate of the advantage function
    self.gae = GAE(self.sess, self.observation_size, self.args.gamma,
                   self.args.lamda, self.args.vf_constraint)
    # Initialization of the barrier function compensator
    self.bar_comp = BARRIER(self.args, self.sess, self.observation_size,
                            self.action_size)
    # Variable initializers
    self.sess.run(tf.global_variables_initializer())
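# The `self.FVP` op exists so the natural-gradient step can be found with
# conjugate gradient instead of forming or inverting the Fisher matrix. The
# `CONJUGATE_GRADIENT` helper used by `train` is not shown in this section; a
# minimal NumPy sketch of the standard algorithm it presumably implements
# (solve A x = g given only the product v -> A v) follows.
import numpy as np

def conjugate_gradient(fvp, g, iters=10, residual_tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()                      # residual of A x = g with x = 0
    p = g.copy()                      # current search direction
    r_dot_r = r.dot(r)
    for _ in range(iters):
        Ap = fvp(p)                   # Fisher-vector product A p
        alpha = r_dot_r / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x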
class TRPO(): def __init__(self, args, env, sess): self.args = args self.sess = sess self.env = env self.firstIter = 1 self.torque_bound = 100 #Determine dimensions of observation & action space self.observation_size = 15 self.action_size = 1 # Build neural network model for observations/actions self.build_model() # Build barrier function model cbf.build_barrier(self) # Build GP model dynamics_gp.build_GP_model(self) # Build RL policy improvement model based on TRPO def build_model(self): self.obs = tf.placeholder(tf.float32, [None, self.observation_size]) self.action = tf.placeholder(tf.float32, [None, self.action_size]) self.advantage = tf.placeholder(tf.float32, [None]) #Mean of old action distribution self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size]) self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size]) #NN framework for action distribution self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs) # Get trainable variables for the policy (NN weights) tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy') for i in tr_vrbs: print(i.op.name) #Construct distribution by repeating action_dis_logstd self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0], 1)) #Probability of action under old policy vs. new policy self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action) self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action) policy_ratio = tf.exp(self.log_policy - self.log_old_policy) #Number of observations in batch batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32) ''' Equation (14) in paper Contribution of a single s_n : Expectation over a~q[ (new policy / q(is)) * advantage_old] ''' surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage) #Define KL divergence and shannon entropy, averaged over a set of inputs (policies) kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd, self.action_dist_mu, self.action_dist_logstd) / batch_size ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size #Define 'loss' quantities to constrain or maximize self.losses = [surr_single_state, kl, ent] # Maximize surrogate function over policy parameter 'theta' represented by neural network weights self.pg = FLAT_GRAD(surr_single_state, tr_vrbs) #KL divergence where first argument is fixed kl_first_fixed = GAUSS_KL_FIRST_FIX( self.action_dist_mu, self.action_dist_logstd) / batch_size #Gradient of KL divergence w.r.t. 
theta (NN policy weights) first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs) self.flat_tangent = tf.placeholder(tf.float32, [None]) tangent = list() start = 0 for vrbs in tr_vrbs: variable_size = np.prod(vrbs.get_shape().as_list()) param = tf.reshape( self.flat_tangent[start:(start + variable_size)], vrbs.get_shape()) tangent.append(param) start += variable_size ''' Gradient of KL with tangent vector gradient_w_tangent : list of KL_prime*y for each variables ''' gradient_w_tangent = [ tf.reduce_sum(kl_g * t) for (kl_g, t) in zip(first_kl_grads, tangent) ] ''' From derivative of KL_prime*y : [dKL/dx1, dKL/dx2...]*y y -> Ay, A is n by n matrix but hard to implement(numerically solving (n*n)*(n*1)) so first multiply target 'y' to gradient and take derivation 'self.FVP' Returns : [d2KL/dx1dx1+d2KL/dx1dx2..., d2KL/dx1dx2+d2KL/dx2dx2..., ...]*y So get (second derivative of KL divergence)*y for each variable => y->JMJy (Fisher Vector Product) ''' self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs) #Get actual parameter value self.get_value = GetValue(self.sess, tr_vrbs, name='Policy') #Set parameter values self.set_value = SetValue(self.sess, tr_vrbs, name='Policy') #Estimate of the advantage function self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint) #Intialization of the barrier function compensator self.bar_comp = BARRIER(self.args, self.sess, self.observation_size, self.action_size) #Variable initializers self.sess.run(tf.global_variables_initializer()) #Train TRPO policy def train(self, iteration): batch_path = self.rollout() theta_prev = self.get_value() #Get advantage from gae (train value function NN) advantage_estimated = self.gae.get_advantage(batch_path) #Get barrier compensator from barrier_comp (train compensator NN) if (iteration < 10): self.bar_comp.get_training_rollouts(batch_path) barr_loss = self.bar_comp.train() else: barr_loss = 0. #Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size] #Those batches come from OLD policy before updating theta #action_dist_mu = np.squeeze(np.concatenate([each_path["Action_mu"] for each_path in batch_path])) action_dist_mu = np.squeeze( np.concatenate( [each_path["Action_RL_mu"] for each_path in batch_path])) action_dist_logstd = np.squeeze( np.concatenate( [each_path["Action_logstd"] for each_path in batch_path])) observation = np.squeeze( np.concatenate( [each_path["Observation"] for each_path in batch_path])) action = np.squeeze( np.concatenate( [each_path["Action_RL"] for each_path in batch_path])) #action = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path])) #Obtain policy gradient of advantage function w.r.t. 
theta (g in paper) feed_dict = { self.obs: observation, self.action: np.expand_dims(action, axis=1), self.advantage: advantage_estimated, self.old_action_dist_mu: np.expand_dims(action_dist_mu, axis=1), self.old_action_dist_logstd: np.expand_dims(action_dist_logstd, axis=1) } #feed_dict = {self.obs:observation, self.action:action, self.advantage:advantage_estimated, self.old_action_dist_mu:action_dist_mu, self.old_action_dist_logstd:action_dist_logstd} policy_g = self.sess.run(self.pg, feed_dict=feed_dict) # Computing fisher vector product : FIM * (policy gradient) where FIM = Fisher Information Matrix def fisher_vector_product(gradient): feed_dict[self.flat_tangent] = gradient return self.sess.run(self.FVP, feed_dict=feed_dict) #Solve Ax = g, where A is FIM and g is gradient of policy network, to obtain search direction for theta search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g) #KL divergence approximated by 1/2*(delta_transpose)*FIM*delta #Appendix C in TRPO Paper kl_approximated = 0.5 * search_direction.dot( fisher_vector_product(search_direction)) #Calculate theta update maximal_step_length = np.sqrt(self.args.kl_constraint / kl_approximated) full_step = maximal_step_length * search_direction #Reverse gradient direction #full_step = -maximal_step_length * search_direction def surrogate(theta): self.set_value(theta) return self.sess.run(self.losses[0], feed_dict=feed_dict) #Use line search to ensure improvement of surrogate objective and satisfaction of KL constraint #Start with maximal step length and exponentially shrink until objective improves new_theta = LINE_SEARCH(surrogate, theta_prev, full_step, self.args.num_backtracking, name='Surrogate loss') #Update without line search #new_theta = theta_prev + full_step #Update policy parameter theta self.set_value(new_theta, update_info=0) #Update value function neural network #Policy update is performed using old value function parameter self.gae.train() #After update, store values at log surrogate_after, kl_after, _ = self.sess.run(self.losses, feed_dict=feed_dict) logs = {"Surrogate loss": surrogate_after, "KL_DIV": kl_after} logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path]) logs["Num episode"] = len([path["Reward"] for path in batch_path]) logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path]) logs["Episode_Avg_Reward"] = logs["Total Sum"] / logs["Num episode"] logs["Compensator_Fit"] = barr_loss logs["Final_Action"] = np.squeeze( np.concatenate([each_path["Action"] for each_path in batch_path])) logs["Action_bar"] = np.squeeze( np.concatenate( [each_path["Action_bar"] for each_path in batch_path])) logs["Action_BAR"] = np.squeeze( np.concatenate( [each_path["Action_BAR"] for each_path in batch_path])) logs["Observation"] = np.squeeze( np.concatenate( [each_path["Observation"] for each_path in batch_path])) logs["Reward"] = np.squeeze( np.concatenate([each_path["Reward"] for each_path in batch_path])) logs["Done"] = np.squeeze( np.concatenate([each_path["Done"] for each_path in batch_path])) return logs #Set up NN to parameterize the control policy def build_policy(self, states, name='Policy'): print('Initializing Policy network') with tf.variable_scope(name, reuse=tf.AUTO_REUSE): h1 = LINEAR(states, self.args.hidden_size, name='h1') h1_n1 = tf.nn.sigmoid(h1) h2 = LINEAR(h1_n1, self.args.hidden_size, name='h2') h2_n1 = tf.nn.sigmoid(h2) h3 = LINEAR(h2_n1, self.action_size, name='h3') #Initialize action std_deviation #init = lambda shape, dtype, partition_info=None : 
0.01*np.random.randn(*shape) #action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size]) #Initialize action std_deviation (no variance -- deterministic policy) action_dist_logstd = tf.get_variable( 'logstd', initializer=tf.constant_initializer(0), shape=[1, self.action_size]) return h3, action_dist_logstd #Get action from the current observation (sampled based on NN policy) def act(self, obs): #Need to expand first dimension (batch axis), make [1, observation size] obs_expanded = np.expand_dims(np.squeeze(obs), 0) #obs_expanded = obs #Get action distribution from policy network action_dist_mu, action_dist_logstd = self.sess.run( [self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs: obs_expanded}) #Sample action from gaussian distribution action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd)) return action, action_dist_mu, action_dist_logstd #Simulate dynamics for a given rollout def rollout(self): #Initialize variables paths = list() timesteps = 0 self.num_epi = 0 #Utilize GP from previous iteration while training current iteration if (self.firstIter == 1): pass else: self.GP_model_prev = self.GP_model.copy() dynamics_gp.build_GP_model(self) #Iterate through the specified number of episodes while timesteps < self.args.timesteps_per_batch: self.num_epi += 1 #Reset the environment obs, action, rewards, done, action_dist_mu, action_dist_logstd, action_bar, action_BAR, action_RL_mu_, action_RL_ = [], [], [], [], [], [], [], [], [], [] prev_obs = self.env.reset() obs = np.expand_dims(np.squeeze(prev_obs), 0) #Simulate dynamics for specified time for i in range(self.args.max_path_length): #self.env.render() prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0) #prev_obs_expanded = prev_obs #Agent takes actions from sampled action and action distribution parameters based on observation #All have shape of [1, action size] action_rl, action_dist_mu_rl, action_dist_logstd_ = self.act( prev_obs) #Utilize compensation barrier function u_BAR_ = self.bar_comp.get_action(prev_obs) action_RL = action_rl + u_BAR_ action_dist_mu_RL = action_dist_mu_rl + u_BAR_ t = 0.05 * i # Get GP dynamics if (self.firstIter == 1): [f, g, x, std ] = dynamics_gp.get_GP_dynamics(self, prev_obs_expanded, action_RL, t) else: [f, g, x, std] = dynamics_gp.get_GP_dynamics_prev( self, prev_obs_expanded, action_RL, t) #Utilize safety barrier function u_bar_ = cbf.control_barrier(self, np.squeeze(prev_obs_expanded), action_dist_mu_RL, f, g, x, std) #action_ = action_RL + u_bar_ action_dist_mu_ = action_dist_mu_RL + u_bar_ #Stochastic action action_ = np.random.normal(loc=action_dist_mu_, scale=np.exp(action_dist_logstd_)) #Store observation and action/distribution obs = np.append(obs, prev_obs_expanded, axis=0) action_RL_mu_.append(action_dist_mu_rl) action_RL_.append(action_rl) action_bar.append(u_bar_) action_BAR.append(u_BAR_) action.append(action_) action_dist_mu.append(action_dist_mu_) action_dist_logstd.append(action_dist_logstd_) # Simulate dynamics after action next_obs, reward_, done_ = self.env.step(action_) reward_ = np.squeeze(reward_) #next_obs, reward_, done_, _ = self.env.step(action_) #Get results done.append(done_) rewards.append(reward_) prev_obs = next_obs if i == self.args.max_path_length - 1: obs = obs[1:self.args.max_path_length + 1, :] path = { "Observation": obs, "Action": np.concatenate(action), "Action_RL_mu": np.concatenate(action_RL_mu_), "Action_RL": np.concatenate(action_RL_), "Action_mu": np.concatenate(action_dist_mu), 
"Action_bar": np.concatenate(action_bar), "Action_BAR": np.concatenate(action_BAR), "Action_logstd": np.concatenate(action_dist_logstd), "Done": np.asarray(done), "Reward": np.asarray(rewards) } paths.append(path) break #For timing purposes, only update GP dynamics for certain number of timesteps if (timesteps < 500): dynamics_gp.update_GP_dynamics(self, path) timesteps += len(rewards) #print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps)) self.firstIter = 0 return paths
class TRPO(): def __init__(self, args, env, sess): self.args = args self.sess = sess self.env = env self.torque_bound = 8 #Set up observation space and action space self.observation_space = env.observation_space self.action_space = env.action_space print('Observation space', self.observation_space) print('Action space', self.action_space) #Determine dimensions of observation & action space self.observation_size = self.env.observation_space.shape[0] self.action_size = self.action_space.shape[0] # Build neural network model for observations/actions self.build_model() # Build barrier function model self.build_barrier() #Build barrier function model def build_barrier(self): N = self.action_size #self.P = matrix(np.eye(N), tc='d') self.P = matrix(np.diag([1., 10000000.]), tc='d') self.q = matrix(np.zeros(N + 1)) self.H1 = np.array([1, 0.001]) self.H2 = np.array([1, -0.001]) self.H3 = np.array([-1, 0.001]) self.H4 = np.array([-1, -0.001]) self.F = 1 # Build RL policy improvement model based on TRPO def build_model(self): self.obs = tf.placeholder(tf.float32, [None, self.observation_size]) self.action = tf.placeholder(tf.float32, [None, self.action_size]) self.advantage = tf.placeholder(tf.float32, [None]) #Mean of old action distribution self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size]) self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size]) #NN framework for action distribution self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs) # Get trainable variables for the policy (NN weights) tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy') for i in tr_vrbs: print(i.op.name) #Construct distribution by repeating action_dis_logstd self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0], 1)) #Probability of action under old policy vs. new policy self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action) self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action) policy_ratio = tf.exp(self.log_policy - self.log_old_policy) #Number of observations in batch batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32) ''' Equation (14) in paper Contribution of a single s_n : Expectation over a~q[ (new policy / q(is)) * advantage_old] ''' surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage) #Define KL divergence and shannon entropy, averaged over a set of inputs (policies) kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd, self.action_dist_mu, self.action_dist_logstd) / batch_size ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size #Define 'loss' quantities to constrain or maximize self.losses = [surr_single_state, kl, ent] # Maximize surrogate function over policy parameter 'theta' represented by neural network weights self.pg = FLAT_GRAD(surr_single_state, tr_vrbs) #KL divergence where first argument is fixed kl_first_fixed = GAUSS_KL_FIRST_FIX( self.action_dist_mu, self.action_dist_logstd) / batch_size #Gradient of KL divergence w.r.t. theta (NN policy weights) first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs) ''' REVIEW FROM HERE ONWARDS #?????????????????????????????????????????????????????????? 
''' self.flat_tangent = tf.placeholder(tf.float32, [None]) tangent = list() start = 0 for vrbs in tr_vrbs: variable_size = np.prod(vrbs.get_shape().as_list()) param = tf.reshape( self.flat_tangent[start:(start + variable_size)], vrbs.get_shape()) tangent.append(param) start += variable_size ''' Gradient of KL with tangent vector gradient_w_tangent : list of KL_prime*y for each variables ''' gradient_w_tangent = [ tf.reduce_sum(kl_g * t) for (kl_g, t) in zip(first_kl_grads, tangent) ] ''' From derivative of KL_prime*y : [dKL/dx1, dKL/dx2...]*y y -> Ay, A is n by n matrix but hard to implement(numerically solving (n*n)*(n*1)) so first multiply target 'y' to gradient and take derivation 'self.FVP' Returns : [d2KL/dx1dx1+d2KL/dx1dx2..., d2KL/dx1dx2+d2KL/dx2dx2..., ...]*y So get (second derivative of KL divergence)*y for each variable => y->JMJy (Fisher Vector Product) ''' self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs) #Get actual parameter value self.get_value = GetValue(self.sess, tr_vrbs, name='Policy') #Set parameter values self.set_value = SetValue(self.sess, tr_vrbs, name='Policy') #Estimate of the advantage function self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint) #Intialization of the barrier function compensator self.bar_comp = BARRIER(self.args, self.sess, self.observation_size, self.action_size) #Variable initializers self.sess.run(tf.global_variables_initializer()) #Train TRPO policy def train(self): batch_path = self.rollout() theta_prev = self.get_value() #Get advantage from gae (train value function NN) advantage_estimated = self.gae.get_advantage(batch_path) #Get barrier compensator from barrier_comp (train compensator NN) self.bar_comp.get_training_rollouts(batch_path) barr_loss = self.bar_comp.train() #Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size] #Those batches come from OLD policy before updating theta action_dist_mu = np.squeeze( np.concatenate( [each_path["Action_mu"] for each_path in batch_path])) action_dist_logstd = np.squeeze( np.concatenate( [each_path["Action_logstd"] for each_path in batch_path])) observation = np.squeeze( np.concatenate( [each_path["Observation"] for each_path in batch_path])) action = np.squeeze( np.concatenate([each_path["Action"] for each_path in batch_path])) #Obtain policy gradient of advantage function w.r.t. 
theta (g in paper) feed_dict = { self.obs: observation, self.action: np.expand_dims(action, axis=1), self.advantage: advantage_estimated, self.old_action_dist_mu: np.expand_dims(action_dist_mu, axis=1), self.old_action_dist_logstd: np.expand_dims(action_dist_logstd, axis=1) } #feed_dict = {self.obs:observation, self.action:action, self.advantage:advantage_estimated, self.old_action_dist_mu:action_dist_mu, self.old_action_dist_logstd:action_dist_logstd} policy_g = self.sess.run(self.pg, feed_dict=feed_dict) # Computing fisher vector product : FIM * (policy gradient) where FIM = Fisher Information Matrix def fisher_vector_product(gradient): feed_dict[self.flat_tangent] = gradient return self.sess.run(self.FVP, feed_dict=feed_dict) #Solve Ax = g, where A is FIM and g is gradient of policy network, to obtain search direction for theta search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g) #KL divergence approximated by 1/2*(delta_transpose)*FIM*delta #Appendix C in TRPO Paper kl_approximated = 0.5 * search_direction.dot( fisher_vector_product(search_direction)) #Calculate theta update maximal_step_length = np.sqrt(self.args.kl_constraint / kl_approximated) full_step = maximal_step_length * search_direction def surrogate(theta): self.set_value(theta) return self.sess.run(self.losses[0], feed_dict=feed_dict) #Use line search to ensure improvement of surrogate objective and satisfaction of KL constraint #Start with maximal step length and exponentially shrink until objective improves new_theta = LINE_SEARCH(surrogate, theta_prev, full_step, self.args.num_backtracking, name='Surrogate loss') #Update without line search #new_theta = theta_prev + full_step #Update policy parameter theta self.set_value(new_theta, update_info=0) #Update value function neural network #Policy update is performed using old value function parameter self.gae.train() #After update, store values at log surrogate_after, kl_after, _ = self.sess.run(self.losses, feed_dict=feed_dict) logs = {"Surrogate loss": surrogate_after, "KL_DIV": kl_after} logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path]) logs["Num episode"] = len([path["Reward"] for path in batch_path]) logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path]) logs["Episode Avg. 
Reward"] = logs["Total Sum"] / logs["Num episode"] logs["Compensator_Fit"] = barr_loss logs["Final_Action"] = np.squeeze( np.concatenate([each_path["Action"] for each_path in batch_path])) logs["Action_bar"] = np.squeeze( np.concatenate( [each_path["Action_bar"] for each_path in batch_path])) logs["Action_BAR"] = np.squeeze( np.concatenate( [each_path["Action_BAR"] for each_path in batch_path])) logs["Observation"] = np.squeeze( np.concatenate( [each_path["Observation"] for each_path in batch_path])) logs["Reward"] = np.squeeze( np.concatenate([each_path["Reward"] for each_path in batch_path])) return logs #Set up NN to parameterize the control policy def build_policy(self, states, name='Policy'): print('Initializing Policy network') with tf.variable_scope(name, reuse=tf.AUTO_REUSE): h1 = LINEAR(states, self.args.hidden_size, name='h1') h1_n1 = tf.nn.relu(h1) h2 = LINEAR(h1_n1, self.args.hidden_size, name='h2') h2_n1 = tf.nn.relu(h2) h3 = LINEAR(h2_n1, self.action_size, name='h3') #Initialize action std_deviation #init = lambda shape, dtype, partition_info=None : 0.01*np.random.randn(*shape) #action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size]) #Initialize action std_deviation (no variance -- deterministic policy) action_dist_logstd = tf.get_variable( 'logstd', initializer=tf.constant_initializer(-1.5), shape=[1, self.action_size]) return h3, action_dist_logstd #Get action from the current observation (sampled based on NN policy) def act(self, obs): #Need to expand first dimension (batch axis), make [1, observation size] obs_expanded = np.expand_dims(np.squeeze(obs), 0) #obs_expanded = obs #Get action distribution from policy network action_dist_mu, action_dist_logstd = self.sess.run( [self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs: obs_expanded}) #Sample action from gaussian distribution action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd)) return action, action_dist_mu, action_dist_logstd #Get compensatory action based on satisfaction of barrier function def control_barrier(self, obs, u_rl): #Define gamma for the barrier function gamma_b = 0.5 #Get the dynamics of the system from the current time step with the RL action def get_dynamics(obs, u_rl): dt = 0.05 G = 10 m = 1 l = 1 obs = np.squeeze(obs) theta = np.arctan2(obs[1], obs[0]) theta_dot = obs[2] f = np.array([ -3 * G / (2 * l) * np.sin(theta + np.pi) * dt**2 + theta_dot * dt + theta + 3 / (m * l**2) * u_rl * dt**2, theta_dot - 3 * G / (2 * l) * np.sin(theta + np.pi) * dt + 3 / (m * l**2) * u_rl * dt ]) g = np.array([3 / (m * l**2) * dt**2, 3 / (m * l**2) * dt]) x = np.array([theta, theta_dot]) return [np.squeeze(f), np.squeeze(g), np.squeeze(x)] [f, g, x] = get_dynamics(obs, u_rl) #Set up Quadratic Program to satisfy the Control Barrier Function G = np.array([[ np.dot(self.H1, g), np.dot(self.H2, g), np.dot(self.H3, g), np.dot(self.H4, g), 1., -1. 
], [1, 1, 1, 1, 0, 0]]) G = np.transpose(G) h = np.array([ gamma_b * self.F - np.dot(self.H1, f) + (1 - gamma_b) * np.dot(self.H1, x), gamma_b * self.F - np.dot(self.H2, f) + (1 - gamma_b) * np.dot(self.H2, x), gamma_b * self.F - np.dot(self.H3, f) + (1 - gamma_b) * np.dot(self.H3, x), gamma_b * self.F - np.dot(self.H4, f) + (1 - gamma_b) * np.dot(self.H4, x), self.torque_bound - u_rl, self.torque_bound + u_rl ]) #Convert numpy arrays to cvx matrices to set up QP G = matrix(G, tc='d') h = matrix(h, tc='d') solvers.options['show_progress'] = False sol = solvers.qp(self.P, self.q, G, h) u_bar = sol['x'] if (np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) - 0.001 >= self.torque_bound): u_bar[0] = self.torque_bound - u_rl print("Error in QP") elif (np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) + 0.001 <= -self.torque_bound): u_bar[0] = -self.torque_bound - u_rl print("Error in QP") else: pass return np.expand_dims(np.array(u_bar[0]), 0) #Simulate dynamics for a given rollout def rollout(self): #Initialize variables paths = list() timesteps = 0 self.num_epi = 0 #Iterate through the specified number of episodes while timesteps < self.args.timesteps_per_batch: self.num_epi += 1 #Reset the environment obs, action, rewards, done, action_dist_mu, action_dist_logstd, action_bar, action_BAR = [], [], [], [], [], [], [], [] prev_obs = self.env.reset() #Simulate dynamics for specified time for i in range(self.args.max_path_length): #self.env.render() prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0) #prev_obs_expanded = prev_obs #Agent takes actions from sampled action and action distribution parameters based on observation #All have shape of [1, action size] action_rl, action_dist_mu_rl, action_dist_logstd_ = self.act( prev_obs) #Utilize compensation barrier function u_BAR_ = self.bar_comp.get_action(prev_obs) action_RL = action_rl + u_BAR_ action_dist_mu_RL = action_dist_mu_rl + u_BAR_ #Utilize safety barrier function u_bar_ = self.control_barrier(np.squeeze(prev_obs_expanded), action_dist_mu_RL) #action_ = action_RL + u_bar_ action_dist_mu_ = action_dist_mu_RL + u_bar_ #Stochastic action action_ = np.random.normal(loc=action_dist_mu_, scale=np.exp(action_dist_logstd_)) #Store observation and action/distribution obs.append(prev_obs_expanded) action_bar.append(u_bar_) action_BAR.append(u_BAR_) action.append(action_) action_dist_mu.append(action_dist_mu_) action_dist_logstd.append(action_dist_logstd_) # Simulate dynamics after action next_obs, reward_, done_, _ = self.env.step(action_) #Get results done.append(done_) rewards.append(reward_) prev_obs = next_obs if done_: path = { "Observation": np.concatenate(obs), "Action": np.concatenate(action), "Action_mu": np.concatenate(action_dist_mu), "Action_bar": np.concatenate(action_bar), "Action_BAR": np.concatenate(action_BAR), "Action_logstd": np.concatenate(action_dist_logstd), "Done": np.asarray(done), "Reward": np.asarray(rewards) } paths.append(path) break timesteps += len(rewards) #print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps)) return paths #Simulate/Visualize latest policy def sim(self): observation = self.env.reset() total = 0 for t in range(600): #Render environment self.env.render() #Get action from NN policy obs_expanded = np.expand_dims(np.squeeze(observation), 0) #Get action distribution from policy network action_dist_mu, action_dist_logstd = self.sess.run( [self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs: obs_expanded}) #Sample action from gaussian distribution action_rl = 
np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd)) #Get compensatory barrier action u_BAR_ = self.bar_comp.get_action(obs_expanded) u_RL = action_rl + u_BAR_ #Compensate with barrier-based control u_bar = self.control_barrier(obs_expanded, u_RL) action = u_bar + u_RL observation, reward, done, info = self.env.step(action) total = total + reward if done: print("Accumulated Reward: {}".format(total)) break
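# `LINE_SEARCH` is used by `train` but not defined in this section. The
# comments describe a backtracking scheme: start from the full natural-gradient
# step and exponentially shrink it until the surrogate loss improves. The
# sketch below follows that description only; the shrink factor and the
# fallback to the old parameters are assumptions.
import numpy as np

def backtracking_line_search(surrogate, theta_prev, full_step,
                             num_backtracking, shrink=0.5):
    loss_before = surrogate(theta_prev)
    for k in range(num_backtracking):
        theta_new = theta_prev + (shrink ** k) * full_step
        if surrogate(theta_new) < loss_before:
            return theta_new          # first step fraction that improves the loss
    return theta_prev                 # no improvement: keep the old parameters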
class TRPO(): def __init__(self, args, env, sess): self.args = args self.sess = sess self.env = env #Set up observation space and action space self.observation_space = env.observation_space self.action_space = env.action_space print('Observation space', self.observation_space) print('Action space', self.action_space) #Determine dimensions of observation & action space self.observation_size = self.env.observation_space.shape[0] self.action_size = self.action_space.shape[0] # Build neural network model for observations/actions self.build_model() def build_model(self): self.obs = tf.placeholder(tf.float32, [None, self.observation_size]) self.action = tf.placeholder(tf.float32, [None, self.action_size]) self.advantage = tf.placeholder(tf.float32, [None]) #Mean of old action distribution self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size]) self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size]) #NN framework for action distribution self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs) #Construct distribution by repeating action_dis_logstd self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0],1)) #Probability of action under old policy vs. new policy self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action) self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action) policy_ratio = tf.exp(self.log_policy - self.log_old_policy) #Number of observations in batch batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32) ''' Equation (14) in paper Contribution of a single s_n : Expectation over a~q[ (new policy / q(is)) * advantage_old] ''' surr_single_state = -tf.reduce_mean(policy_ratio*self.advantage) #Define KL divergence and shannon entropy, averaged over a set of inputs (policies) kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd, self.action_dist_mu, self.action_dist_logstd) / batch_size ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size #Define 'loss' quantities to constrain or maximize self.losses = [surr_single_state, kl, ent] # Get trainable variables for the policy (NN weights) tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy') for i in tr_vrbs: print(i.op.name) # Maximize surrogate function over policy parameter 'theta' represented by neural network weights self.pg = FLAT_GRAD(surr_single_state, tr_vrbs) #KL divergence where first argument is fixed kl_first_fixed = GAUSS_KL_FIRST_FIX(self.action_dist_mu, self.action_dist_logstd) / batch_size #Gradient of KL divergence w.r.t. theta (NN policy weights) first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs) ''' REVIEW FROM HERE ONWARDS #?????????????????????????????????????????????????????????? 
''' self.flat_tangent = tf.placeholder(tf.float32,[None]) tangent = list() start = 0 for vrbs in tr_vrbs: variable_size = np.prod(vrbs.get_shape().as_list()) param = tf.reshape(self.flat_tangent[start:(start+variable_size)], vrbs.get_shape()) tangent.append(param) start += variable_size ''' Gradient of KL with tangent vector gradient_w_tangent : list of KL_prime*y for each variables ''' gradient_w_tangent = [tf.reduce_sum(kl_g*t) for (kl_g, t) in zip(first_kl_grads, tangent)] ''' From derivative of KL_prime*y : [dKL/dx1, dKL/dx2...]*y y -> Ay, A is n by n matrix but hard to implement(numerically solving (n*n)*(n*1)) so first multiply target 'y' to gradient and take derivation 'self.FVP' Returns : [d2KL/dx1dx1+d2KL/dx1dx2..., d2KL/dx1dx2+d2KL/dx2dx2..., ...]*y So get (second derivative of KL divergence)*y for each variable => y->JMJy (Fisher Vector Product) ''' self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs) #Get actual parameter value self.get_value = GetValue(self.sess, tr_vrbs, name='Policy') #Set parameter values self.set_value = SetValue(self.sess, tr_vrbs, name='Policy') #Estimate of the advantage function self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint) self.sess.run(tf.global_variables_initializer()) def train(self): batch_path = self.rollout() theta_prev = self.get_value() #Get advantage from gae (train value function NN) advantage_estimated = self.gae.get_advantage(batch_path) #Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size] #Those batches come from OLD policy before updating theta action_dist_mu = np.squeeze(np.concatenate([each_path["Action_mu"] for each_path in batch_path])) action_dist_logstd = np.squeeze(np.concatenate([each_path["Action_logstd"] for each_path in batch_path])) observation = np.squeeze(np.concatenate([each_path["Observation"] for each_path in batch_path])) action = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path])) #Obtain policy gradient of advantage function w.r.t. 
theta (g in paper) feed_dict = {self.obs:observation, self.action:np.expand_dims(action, axis=1), self.advantage:advantage_estimated, self.old_action_dist_mu:np.expand_dims(action_dist_mu, axis=1), self.old_action_dist_logstd:np.expand_dims(action_dist_logstd, axis=1)} #feed_dict = {self.obs:observation, self.action:action, self.advantage:advantage_estimated, self.old_action_dist_mu:action_dist_mu, self.old_action_dist_logstd:action_dist_logstd} policy_g = self.sess.run(self.pg, feed_dict=feed_dict) # Computing fisher vector product : FIM * (policy gradient) where FIM = Fisher Information Matrix def fisher_vector_product(gradient): feed_dict[self.flat_tangent] = gradient return self.sess.run(self.FVP, feed_dict=feed_dict) #Solve Ax = g, where A is FIM and g is gradient of policy network, to obtain search direction for theta search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g) #KL divergence approximated by 1/2*(delta_transpose)*FIM*delta #Appendix C in TRPO Paper kl_approximated = 0.5*search_direction.dot(fisher_vector_product(search_direction)) #Calculate theta update maximal_step_length = np.sqrt(self.args.kl_constraint / kl_approximated) full_step = maximal_step_length * search_direction def surrogate(theta): self.set_value(theta) return self.sess.run(self.losses[0], feed_dict=feed_dict) #Use line search to ensure improvement of surrogate objective and satisfaction of KL constraint #Start with maximal step length and exponentially shrink until objective improves new_theta = LINE_SEARCH(surrogate, theta_prev, full_step, self.args.num_backtracking, name='Surrogate loss') #Update without line search #new_theta = theta_prev + full_step #Update policy parameter theta self.set_value(new_theta, update_info=0) #Update value function neural network #Policy update is performed using old value function parameter self.gae.train() #After update, store values at log surrogate_after, kl_after, _ = self.sess.run(self.losses, feed_dict=feed_dict) logs = {"Surrogate loss":surrogate_after, "KL_DIV":kl_after} logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path]) logs["Num episode"] = len([path["Reward"] for path in batch_path]) logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path]) logs["Episode_Avg_Reward"] = logs["Total Sum"] / logs["Num episode"] logs["Final_Action"] = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path])) logs["Observation"] = np.squeeze(np.concatenate([each_path["Observation"] for each_path in batch_path])) logs["Reward"] = np.squeeze(np.concatenate([each_path["Reward"] for each_path in batch_path])) return logs #USE SOFTMAX RELU INSTEAD OF RELU, OUTPUT WEIGHTS/BIASES IN EASIER FORMAT #Set up NN to parameterize the control policy def build_policy(self, states, name='Policy'): print('Initializing Policy network') with tf.variable_scope(name, reuse=tf.AUTO_REUSE): h1 = LINEAR(states, self.args.hidden_size, name='h1') #h1_n1 = tf.nn.relu(h1) h1_n1 = tf.nn.softmax(h1) h2 = LINEAR(h1_n1, self.args.hidden_size, name='h2') #h2_n1 = tf.nn.relu(h2) h2_n1 = tf.nn.softmax(h2) h3 = LINEAR(h2_n1, self.action_size, name='h3') init = lambda shape, dtype, partition_info=None : 0.01*np.random.randn(*shape) action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size]) return h3, action_dist_logstd def act(self, obs): #Need to expand first dimension (batch axis), make [1, observation size] obs_expanded = np.expand_dims(np.squeeze(obs), 0) #obs_expanded = obs #Get action distribution from policy 
network action_dist_mu, action_dist_logstd = self.sess.run([self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs:obs_expanded}) #Sample action from gaussian distribution action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd)) return action, action_dist_mu, action_dist_logstd def rollout(self): #Initialize variables paths = list() timesteps = 0 self.num_epi = 0 #Iterate through the specified number of episodes while timesteps < self.args.timesteps_per_batch: self.num_epi += 1 #Reset the environment obs, action, rewards, done, action_dist_mu, action_dist_logstd = [], [], [], [], [], [] prev_obs = self.env.reset() #Simulate dynamics for specified time for i in range(self.args.max_path_length): prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0) #prev_obs_expanded = prev_obs #Agent takes actions from sampled action and action distribution parameters based on observation #All have shape of [1, action size] action_, action_dist_mu_, action_dist_logstd_ = self.act(prev_obs) #Store observation and action/distribution obs.append(prev_obs_expanded) action.append(action_) action_dist_mu.append(action_dist_mu_) action_dist_logstd.append(action_dist_logstd_) # Simulate dynamics after action next_obs, reward_, done_, _ = self.env.step(action_) #Get results done.append(done_) rewards.append(reward_) prev_obs = next_obs if done_: path = {"Observation":np.concatenate(obs), "Action":np.concatenate(action), "Action_mu":np.concatenate(action_dist_mu), "Action_logstd":np.concatenate(action_dist_logstd), "Done":np.asarray(done), "Reward":np.asarray(rewards)} paths.append(path) break timesteps += len(rewards) #print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps)) return paths def sim(self): observation = self.env.reset() total = 0 for t in range(600): #Render environment self.env.render() #Get action from NN policy obs_expanded = np.expand_dims(np.squeeze(observation), 0) #obs_expanded = obs #Get action distribution from policy network action_dist_mu, action_dist_logstd = self.sess.run([self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs:obs_expanded}) #Sample action from gaussian distribution action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd)) observation, reward, done, info = self.env.step(action) total = total + reward if done: print("Accumulated Reward: {}".format(total)) break
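# Every variant in this section relies on `GAE(...).get_advantage(batch_path)`,
# whose implementation is not shown. The sketch below is the standard
# generalized-advantage-estimation recursion
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),  A_t = delta_t + gamma * lambda * A_{t+1},
# stated under the assumption that this is what the class computes; value
# predictions are passed in directly here rather than coming from the value network.
import numpy as np

def gae_advantages(rewards, values, gamma, lam):
    # rewards: length-T array; values: length-(T+1) array, last entry bootstraps the tail.
    T = len(rewards)
    advantages = np.zeros(T)
    last_adv = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        last_adv = delta + gamma * lam * last_adv
        advantages[t] = last_adv
    return advantages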
class TRPO(): def __init__(self, args, env, sess, prior): self.num_epi = 0 self.args = args self.sess = sess self.env = env self.prior = prior self.observation_space = self.env.observation_space self.action_space = self.env.action_space print('Observation space', self.observation_space) print('Action space', self.action_space) # 'Box' observation_space and 'Box' action_space self.observation_size = self.env.observation_space.shape[0] # np.prod : return the product of array element over a given axis self.action_size = self.action_space.shape[0] # Build model and create variables self.build_model() def build_model(self): self.obs = tf.placeholder(tf.float32, [None, self.observation_size]) self.action = tf.placeholder(tf.float32, [None, self.action_size]) self.advantage = tf.placeholder(tf.float32, [None]) # Mean of old action distribution self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size]) self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size]) ''' Mean value for each action : each action has gaussian distribution with mean and standard deviation With continuous state and action space, use GAUSSIAN DISTRIBUTION, maps from the input features to the mean of Gaussian distribution for each action Seperate set of parameters specifies the log standard deviation of each action => The policy is defined by the normnal distribution (mean=NeuralNet(states), stddev= exp(r)) ''' self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs) # Make log standard shape from [1, action size] => [batch size, action size] # tf.tile(A, reps) : construct an tensor by repeating A given by 'reps' # Use tf.shape instead of tf.get_shape() when 'None' used in placeholder self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0], 1)) # outputs probability of taking 'self.action' # new distribution self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action) # old distribution self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action) # Take exponential to log policy distribution ''' Equation (14) in paper Contribution of a single s_n : Expectation over a~q[(new policy / q(is)) * advantace_old] sampling distribution q is normally old policy ''' batch_size = tf.shape(self.obs)[0] # print('Batch size %d' % batch_size) policy_ratio = tf.exp(self.log_policy - self.log_old_policy) surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage) # tf.shape returns dtype=int32, tensor conversion requested dtype float32 batch_size = tf.cast(batch_size, tf.float32) # Average KL divergence and shannon entropy, averaged over a set of inputs to function mu kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd, self.action_dist_mu, self.action_dist_logstd) / batch_size ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size self.losses = [surr_single_state, kl, ent] #tr_vrbs = tf.trainable_variables() tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy') for i in tr_vrbs: print(i.op.name) ''' Compute a search direction using a linear approx to objective and quadratic approx to constraint => The search direction is computed by approximately solving 'Ax=g' where A is FIM Quadratic approximation to KL divergence constraint ''' # Maximize surrogate function over policy parameter 'theta' self.pg = FLAT_GRAD(surr_single_state, tr_vrbs) # KL divergence where first argument is fixed # First argument would be old policy 
parameters, so keep it constant kl_first_fixed = GAUSS_KL_FIRST_FIX(self.action_dist_mu, self.action_dist_logstd) / batch_size # Gradient of KL divergence first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs) # Vectors we are going to multiply self.flat_tangent = tf.placeholder(tf.float32, [None]) tangent = list() start = 0 for vrbs in tr_vrbs: variable_size = np.prod(vrbs.get_shape().as_list()) param = tf.reshape(self.flat_tangent[start:(start+variable_size)], vrbs.get_shape()) tangent.append(param) start += variable_size ''' Gradient of KL with tangent vector gradient_w_tangent : list of KL_prime*y for each variables ''' gradient_w_tangent = [tf.reduce_sum(kl_g*t) for (kl_g, t) in zip(first_kl_grads, tangent)] ''' From derivative of KL_prime*y : [dKL/dx1, dKL/dx2...]*y y -> Ay, A is n by n matrix but hard to implement(numerically solving (n*n)*(n*1)) so first multiply target 'y' to gradient and take derivation 'self.FVP' Returns : [d2KL/dx1dx1+d2KL/dx1dx2..., d2KL/dx1dx2+d2KL/dx2dx2..., ...]*y So get (second derivative of KL divergence)*y for each variable => y->JMJy (Fisher Vector Product) ''' self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs) # Get actual paramenter value self.get_value = GetValue(self.sess, tr_vrbs, name='Policy') # To set parameter values self.set_value = SetValue(self.sess, tr_vrbs, name='Policy') # GAE self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint) self.sess.run(tf.global_variables_initializer()) def train(self): batch_path = self.rollout() theta_prev = self.get_value() # Get advantage from gae advantage_estimated = self.gae.get_advantage(batch_path) # Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size] # Those batches come from old policy before update theta action_dist_mu = np.squeeze(np.concatenate([each_path["Action_mu"] for each_path in batch_path])) action_dist_logstd = np.squeeze(np.concatenate([each_path["Action_logstd"] for each_path in batch_path])) observation = np.squeeze(np.concatenate([each_path["Observation"] for each_path in batch_path])) action = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path])) feed_dict = {self.obs : observation , self.action : np.expand_dims(np.squeeze(action),1), self.advantage : advantage_estimated, self.old_action_dist_mu : np.expand_dims(np.squeeze(action_dist_mu),1), self.old_action_dist_logstd : np.expand_dims(np.squeeze(action_dist_logstd),1)} # Computing fisher vector product : FIM * (policy gradient), y->Ay=JMJy def fisher_vector_product(gradient): feed_dict[self.flat_tangent] = gradient return self.sess.run(self.FVP, feed_dict=feed_dict) policy_g = self.sess.run(self.pg, feed_dict=feed_dict) ''' Linearize to objective function gives : objective_gradient * (theta-theta_old) = g.transpose * delta Quadratize to kl constraint : 1/2*(delta_transpose)*FIM*(delta) By Lagrangian => FIM*delta = gradient ''' # Solve Ax = g, where A is FIM and g is gradient of policy network parameter # Compute a search direction(delta) by conjugate gradient algorithm search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g) # KL divergence approximated by 1/2*(delta_transpose)*FIM*(delta) # FIM*(delta) can be computed by fisher_vector_product # a.dot(b) = a.transpose * b kl_approximated = 0.5*search_direction.dot(fisher_vector_product(search_direction)) # beta maximal_step_length = np.sqrt(self.args.kl_constraint / kl_approximated) # beta*s full_step = maximal_step_length * search_direction def 
surrogate(theta): self.set_value(theta) return self.sess.run(self.losses[0], feed_dict=feed_dict) # Last, we use a line search to ensure improvement of the surrogate objective and sttisfaction of the KL constraint by manually control valud of parameter # Start with the maximal step length and exponentially shrink until objective improves new_theta = LINE_SEARCH(surrogate, theta_prev, full_step, self.args.num_backtracking, name='Surrogate loss') # Update policy parameter theta self.set_value(new_theta, update_info=1) # Update value function parameter # Policy update is perfomed using the old value function parameter self.gae.train() # After update, store values at log surrogate_after, kl_after, _ = self.sess.run(self.losses, feed_dict=feed_dict) logs = {"Surrogate loss":surrogate_after, "KL_DIV":kl_after} logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path]) logs["Num episode"] = len([path["Reward"] for path in batch_path]) logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path]) logs["Diff Sum"] = sum([path["Reward_diff"] for path in batch_path]) logs["Episode_Avg_reward"] = logs["Total Sum"] / logs["Num episode"] logs["Episode_Avg_diff"] = logs["Diff Sum"] / logs["Num episode"] return logs # Make policy network given states def build_policy(self, states, name='Policy'): print('Initializing Policy network') with tf.variable_scope(name): h1 = LINEAR(states, self.args.hidden_size, name='h1') h1_nl = tf.nn.relu(h1) h2 = LINEAR(h1_nl, self.args.hidden_size, name='h2') h2_nl = tf.nn.relu(h2) h3 = LINEAR(h2_nl, self.action_size, name='h3') # tf.initializer has to be either Tensor object or 'callable' that takes two arguments (shape, dtype) init = lambda shape, dtype, partition_info=None : 0.01*np.random.randn(*shape) # [1, action size] since it has to be constant through batch axis, log standard deviation action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size]) return h3, action_dist_logstd def act(self, obs): # Need to expand first dimension(batch axis), make [1, observation size] obs_expanded = np.expand_dims(obs, 0) action_dist_mu, action_dist_logstd = self.sess.run([self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs:obs_expanded}) # Sample from gaussian distribution action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd)) # All shape would be [1, action size] # print(action) return action, action_dist_mu, action_dist_logstd def rollout(self): # Set tuning parameters to obtain adaptive regularization weight lambda_store = np.zeros(int(self.args.timesteps_per_batch)) lambda_max = 6 factor = 0.3 paths = list() timesteps = 0 counter = 0 #self.num_epi = 0 while timesteps < self.args.timesteps_per_batch: self.num_epi += 1 # print('%d episode starts' % self.num_epi) obs, action, rewards, done, action_dist_mu, action_dist_logstd, reward_diff = [], [], [], [], [], [], [] # Baseline reward using only control prior s0 = self.env.reset_inc() sp = np.copy(s0) reward_prior = 0. for i in range(self.args.max_path_length): a_prior = self.env.getPrior() sp, reward_p, done_p, _ = self.env.step(a_prior) reward_prior += reward_p if done_p: break prev_obs = self.env.reset() ep_reward = 0. 
for i in range(self.args.max_path_length): # Make 'batch size' axis prev_obs = np.squeeze(prev_obs) prev_obs_expanded = np.expand_dims(prev_obs, 0) # Obtain regularization weight using TD-error if (i > 0 and self.num_epi > 40): # Obtain TD-error base_v = self.gae.predict(old_obs[np.newaxis,:]) target_v = self.gae.predict(prev_obs[np.newaxis,:]) lambda_mix = lambda_max*(1 - np.exp(-factor*np.abs(reward_ + self.args.gamma*np.squeeze(target_v) - np.squeeze(base_v)))) else: self.lambda_actual = 5. lambda_mix = self.lambda_actual if counter < len(lambda_store): lambda_store[counter] = lambda_mix counter += 1 # Prior control a_prior = self.env.getPrior() #All has shape of [1, action size] action_, action_dist_mu_, action_dist_logstd_ = self.act(prev_obs) # Mix the actions (RL controller and control prior) act = action_/(1+self.lambda_actual) + (self.lambda_actual/(1+self.lambda_actual))*a_prior # Take action #next_obs, reward_, done_, _ = self.env.step(action_) old_obs = prev_obs next_obs, reward_, done_, _ = self.env.step(act) ep_reward += reward_ # Store observation obs.append(prev_obs_expanded) action.append(action_) action_dist_mu.append(action_dist_mu_) action_dist_logstd.append(action_dist_logstd_) done.append(done_) rewards.append(reward_) # print(prev_obs, action_, reward_, next_obs, done_) prev_obs = next_obs if done_: # Make dictionary about path, make each element has shape of [None, observation size/action size] path = {"Observation":np.concatenate(obs), "Action":np.concatenate(action), "Action_mu":np.concatenate(action_dist_mu), "Action_logstd":np.concatenate(action_dist_logstd), # [length,] "Done":np.asarray(done), "Reward":np.squeeze(np.asarray(rewards))[:,np.newaxis], "Reward_diff":np.squeeze(np.asarray(ep_reward - reward_prior))} paths.append(path) #print('%d episode finish at %d steps' % (self.num_epi, i+1)) #print(self.lambda_actual) break timesteps += len(rewards) # print('%d steps collected for batch' % timesteps) #print('%d episodes, %d steps is collected for batch' % (self.num_epi, timesteps)) self.lambda_actual = np.mean(lambda_store) print(self.lambda_actual) return paths
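# The rollout above blends the RL action with the control prior using a
# regularization weight lambda adapted from the TD error. Pulled out of the
# loop for clarity, the two rules are restated below as standalone helpers;
# the names are illustrative and not from the original code.
import numpy as np

def mix_actions(u_rl, u_prior, lam):
    # Convex combination: a = u_rl / (1 + lam) + (lam / (1 + lam)) * u_prior.
    return u_rl / (1.0 + lam) + (lam / (1.0 + lam)) * u_prior

def adaptive_lambda(td_error, lambda_max=6.0, factor=0.3):
    # Larger TD error -> less trust in the value estimate -> more weight on the
    # control prior (matches the rollout's lambda_mix formula).
    return lambda_max * (1.0 - np.exp(-factor * np.abs(td_error)))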
def build_model(self):
    self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
    self.action = tf.placeholder(tf.float32, [None, self.action_size])
    self.advantage = tf.placeholder(tf.float32, [None])
    # Mean of old action distribution
    self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size])
    self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size])
    '''
    Mean value for each action: each action has a Gaussian distribution with a mean and standard deviation
    With continuous state and action spaces, use a GAUSSIAN DISTRIBUTION that maps from the input features
    to the mean of the Gaussian distribution for each action
    A separate set of parameters specifies the log standard deviation of each action
    => The policy is defined by the normal distribution (mean=NeuralNet(states), stddev=exp(r))
    '''
    self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)
    # Make the log standard deviation shape go from [1, action size] => [batch size, action size]
    # tf.tile(A, reps): construct a tensor by repeating A as given by 'reps'
    # Use tf.shape instead of tf.get_shape() when 'None' is used in the placeholder
    self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0], 1))
    # Outputs the probability of taking 'self.action'
    # New distribution
    self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action)
    # Old distribution
    self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action)
    # Take the exponential of the log policy distribution
    '''
    Equation (14) in the paper
    Contribution of a single s_n: Expectation over a~q[(new policy / q(is)) * advantage_old]
    The sampling distribution q is normally the old policy
    '''
    batch_size = tf.shape(self.obs)[0]
    # print('Batch size %d' % batch_size)
    policy_ratio = tf.exp(self.log_policy - self.log_old_policy)
    surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage)
    # tf.shape returns dtype=int32, but the tensor conversion requests dtype float32
    batch_size = tf.cast(batch_size, tf.float32)
    # Average KL divergence and Shannon entropy, averaged over a set of inputs to function mu
    kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd,
                  self.action_dist_mu, self.action_dist_logstd) / batch_size
    ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size
    self.losses = [surr_single_state, kl, ent]
    # tr_vrbs = tf.trainable_variables()
    tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy')
    for i in tr_vrbs:
        print(i.op.name)
    '''
    Compute a search direction using a linear approximation to the objective and a quadratic
    approximation to the constraint
    => The search direction is computed by approximately solving 'Ax=g' where A is the FIM
    (a quadratic approximation to the KL divergence constraint)
    '''
    # Maximize the surrogate function over the policy parameter 'theta'
    self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)
    # KL divergence where the first argument is fixed
    # The first argument is the old policy parameters, so keep it constant
    kl_first_fixed = GAUSS_KL_FIRST_FIX(self.action_dist_mu, self.action_dist_logstd) / batch_size
    # Gradient of KL divergence
    first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)
    # Vectors we are going to multiply
    self.flat_tangent = tf.placeholder(tf.float32, [None])
    tangent = list()
    start = 0
    for vrbs in tr_vrbs:
        variable_size = np.prod(vrbs.get_shape().as_list())
        param = tf.reshape(self.flat_tangent[start:(start + variable_size)], vrbs.get_shape())
        tangent.append(param)
        start += variable_size
    '''
    Gradient of KL with the tangent vector
    gradient_w_tangent: list of KL_prime*y
    for each variable
    '''
    gradient_w_tangent = [tf.reduce_sum(kl_g * t) for (kl_g, t) in zip(first_kl_grads, tangent)]
    '''
    From the derivative of KL_prime*y: [dKL/dx1, dKL/dx2, ...]*y
    y -> Ay, where A is an n-by-n matrix, is hard to implement directly (numerically solving (n*n)*(n*1)),
    so first multiply the target 'y' into the gradient and then take the derivative
    'self.FVP' returns: [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y
    So we get (second derivative of KL divergence)*y for each variable => y -> JMJy (Fisher Vector Product)
    '''
    self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)
    # Get actual parameter values
    self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')
    # To set parameter values
    self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')
    # GAE
    self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint)
    self.sess.run(tf.global_variables_initializer())
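The training step that actually consumes self.FVP and self.pg is not shown in this excerpt. As a rough illustration of how a Fisher-vector product is typically used, the docstring above notes that the search direction comes from approximately solving Ax = g with A the FIM; a minimal conjugate-gradient sketch for that is given below. The function conjugate_gradient and its defaults are illustrative only and are not taken from this repository.

import numpy as np

def conjugate_gradient(fvp, g, cg_iters=10, residual_tol=1e-10):
    """Approximately solve A x = g given only a function fvp(v) = A v (A symmetric positive definite)."""
    x = np.zeros_like(g)
    r = g.copy()           # residual g - A x, with x = 0 initially
    p = r.copy()           # search direction
    r_dot_r = r.dot(r)
    for _ in range(cg_iters):
        Ap = fvp(p)
        alpha = r_dot_r / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x

# Usage example with a toy SPD matrix standing in for the FIM
A = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A.dot(v), g)
print(np.allclose(A.dot(x), g, atol=1e-6))   # True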
def main():
    # Get parsed arguments
    args = get_args()

    # Setup for logging
    output_dir = 'output/{}'.format(
        datetime.now(timezone('Asia/Hong_Kong')).strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3])
    create_dir(output_dir)
    LogHelper.setup(log_path='{}/training.log'.format(output_dir), level_str='INFO')
    _logger = logging.getLogger(__name__)

    # Save the configuration for logging purposes
    save_yaml_config(args, path='{}/config.yaml'.format(output_dir))

    # Reproducibility
    set_seed(args.seed)

    # Get dataset
    dataset = SyntheticDataset(args.n, args.d, args.graph_type, args.degree, args.sem_type,
                               args.noise_scale, args.dataset_type, args.x_dim)
    _logger.info('Finished generating dataset')

    model = GAE(args.n, args.d, args.x_dim, args.seed, args.num_encoder_layers, args.num_decoder_layers,
                args.hidden_size, args.latent_dim, args.l1_graph_penalty, args.use_float64)
    model.print_summary(print_func=model.logger.info)

    trainer = ALTrainer(args.init_rho, args.rho_thres, args.h_thres, args.rho_multiply, args.init_iter,
                        args.learning_rate, args.h_tol, args.early_stopping, args.early_stopping_thres)
    W_est = trainer.train(model, dataset.X, dataset.W, args.graph_thres, args.max_iter, args.iter_step, output_dir)
    _logger.info('Finished training model')

    # Save the raw recovered graph, ground truth and observational data after training
    np.save('{}/true_graph.npy'.format(output_dir), dataset.W)
    np.save('{}/observational_data.npy'.format(output_dir), dataset.X)
    np.save('{}/final_raw_recovered_graph.npy'.format(output_dir), W_est)

    # Plot raw recovered graph
    plot_recovered_graph(W_est, dataset.W, save_name='{}/raw_recovered_graph.png'.format(output_dir))

    _logger.info('Filter by constant threshold')
    W_est = W_est / np.max(np.abs(W_est))    # Normalize

    # Plot thresholded recovered graph
    W_est[np.abs(W_est) < args.graph_thres] = 0    # Thresholding
    plot_recovered_graph(W_est, dataset.W, save_name='{}/thresholded_recovered_graph.png'.format(output_dir))
    results_thresholded = count_accuracy(dataset.W, W_est)
    _logger.info('Results after thresholding by {}: {}'.format(args.graph_thres, results_thresholded))
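The post-processing at the end of main() is simply a normalize-then-threshold pass over the recovered weighted adjacency matrix. A minimal sketch with toy numbers (the matrix and threshold below are made up for illustration and stand in for W_est and args.graph_thres):

import numpy as np

# Toy recovered weight matrix; the 0.05 entry is spurious noise
W_est = np.array([[0.0, 2.0, 0.05],
                  [0.0, 0.0, -1.0],
                  [0.0, 0.0, 0.0]])
graph_thres = 0.3                              # illustrative threshold

W_norm = W_est / np.max(np.abs(W_est))         # scale entries into [-1, 1]
W_norm[np.abs(W_norm) < graph_thres] = 0       # zero out weak edges
print(W_norm)
# [[ 0.   1.   0. ]
#  [ 0.   0.  -0.5]
#  [ 0.   0.   0. ]]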