Example #1
def get_trainer(trial, dataloader):
    n_layers = trial.suggest_categorical('n_layer', [2, 3, 4])
    hidden_dims = []
    for i in range(n_layers):
        hidden_dim = int(
            trial.suggest_loguniform('hidden_dim_{}'.format(i), 4, 256))
        hidden_dims.append(hidden_dim)
    model = GAE(39, hidden_dims)
    lr = trial.suggest_loguniform('lr', 1e-6, 1e-2)
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    trainer = Trainer(model, optim, dataloader)
    return trainer
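Note: get_trainer only constructs a Trainer for a given Optuna trial. A minimal sketch of how it could be wired into a study is shown below; the objective wrapper, the n_trials value, and the assumption that Trainer.train() returns a validation loss to minimize are illustrative, not part of the original example.

import optuna

def objective(trial):
    # Hypothetical wrapper: assumes a dataloader is in scope and that
    # Trainer exposes a train() method returning the validation loss.
    trainer = get_trainer(trial, dataloader)
    return trainer.train()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params)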
Example #2
def main():
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    model = GAE(args.in_dim, args.hidden_dims)
    model.to(device)

    print('Loading data')
    with open(args.data_file, 'rb') as f:
        graphs = dill.load(f)
    print('Loaded {} molecules'.format(len(graphs)))
    train_graphs, val_graphs = train_test_split(graphs, test_size=10000)
    train_dataset = MolDataset(train_graphs)
    val_dataset = MolDataset(val_graphs)
    del train_graphs, val_graphs

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            collate_fn=collate)
    trainer = Trainer(model, args)
    train_losses, val_losses = [], []
    print('Training Start')
    for epoch in tqdm(range(args.n_epochs)):
        train_loss = 0
        model.train()
        for bg in tqdm(train_loader):
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            train_loss += trainer.iteration(bg)
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        trainer.save(epoch, args.save_dir)

        val_loss = 0
        model.eval()
        for bg in val_loader:
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            val_loss += trainer.iteration(bg, train=False)
        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        print('Epoch: {:02d} | Train Loss: {:.4f} | Validation Loss: {:.4f}'.
              format(epoch, train_loss, val_loss))
    plot(train_losses, val_losses)
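Note: the collate function passed to both DataLoaders is not shown in this example. A plausible minimal version, assuming the dataset yields bare DGLGraph objects and following the standard dgl.batch pattern, could be:

import dgl

def collate(samples):
    # Merge a list of DGLGraphs from MolDataset into one batched graph
    # so the model can process the whole minibatch in a single pass.
    return dgl.batch(samples)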
Example #3
def main():
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # TODO: train test split
    # load and preprocess dataset
    data = load_data(args)
    features = torch.FloatTensor(data.features)
    in_feats = features.shape[1]
    print(features.shape)
    model = GAE(in_feats, [32,16])
    model.train()
    optim = torch.optim.Adam(model.parameters(), lr=1e-2)
    loss_function = BCELoss

    g = DGLGraph(data.graph)
    g.ndata['h'] = features


    n_epochs = 500
    losses = []
    print('Training Start')
    for epoch in tqdm(range(n_epochs)):
        g.ndata['h'] = features
        # normalization
        degs = g.in_degrees().float()
        norm = torch.pow(degs, -0.5)
        norm[torch.isinf(norm)] = 0
        g.ndata['norm'] = norm.unsqueeze(1)
        adj = g.adjacency_matrix().to_dense()
        pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])

        adj_logits = model(g)

        loss = loss_function(adj_logits, adj, pos_weight=pos_weight)
        optim.zero_grad()
        loss.backward()
        optim.step()
        losses.append(loss.item())
        print('Epoch: {:02d} | Loss: {:.5f}'.format(epoch, loss.item()))

    plt.plot(losses)
    plt.xlabel('iteration')
    plt.ylabel('train loss')
    plt.grid()
    plt.show()
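Note: BCELoss here is a project-specific loss that accepts a pos_weight argument. A sketch of an equivalent using the standard PyTorch API (an assumption, since the original definition is not shown) is:

import torch.nn.functional as F

def BCELoss(adj_logits, adj, pos_weight):
    # Weighted binary cross-entropy on the reconstructed adjacency logits;
    # pos_weight compensates for the sparsity of edges in the graph.
    return F.binary_cross_entropy_with_logits(adj_logits, adj, pos_weight=pos_weight)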
Example #4
    def build_model(self):
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.advantage = tf.placeholder(tf.float32, [None])

        #Mean of old action distribution
        self.old_action_dist_mu = tf.placeholder(tf.float32,
                                                 [None, self.action_size])
        self.old_action_dist_logstd = tf.placeholder(tf.float32,
                                                     [None, self.action_size])

        #NN framework for action distribution
        self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)

        # Get trainable variables for the policy (NN weights)
        tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='Policy')
        for i in tr_vrbs:
            print(i.op.name)

        #Construct distribution by repeating action_dist_logstd
        self.action_dist_logstd = tf.tile(action_dist_logstd,
                                          (tf.shape(action_dist_logstd)[0], 1))

        #Probability of action under old policy vs. new policy
        self.log_policy = LOG_POLICY(self.action_dist_mu,
                                     self.action_dist_logstd, self.action)
        self.log_old_policy = LOG_POLICY(self.old_action_dist_mu,
                                         self.old_action_dist_logstd,
                                         self.action)
        policy_ratio = tf.exp(self.log_policy - self.log_old_policy)

        #Number of observations in batch
        batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32)
        '''
        Equation (14) in paper
        Contribution of a single s_n : Expectation over a~q[ (new policy / q(is)) * advantage_old]
        '''
        surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage)

        #Define KL divergence and Shannon entropy, averaged over a set of inputs (policies)
        kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd,
                      self.action_dist_mu,
                      self.action_dist_logstd) / batch_size
        ent = GAUSS_ENTROPY(self.action_dist_mu,
                            self.action_dist_logstd) / batch_size

        #Define 'loss' quantities to constrain or maximize
        self.losses = [surr_single_state, kl, ent]

        # Maximize surrogate function over policy parameter 'theta' represented by neural network weights
        self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)

        #KL divergence where first argument is fixed
        kl_first_fixed = GAUSS_KL_FIRST_FIX(
            self.action_dist_mu, self.action_dist_logstd) / batch_size

        #Gradient of KL divergence w.r.t. theta (NN policy weights)
        first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)

        self.flat_tangent = tf.placeholder(tf.float32, [None])
        tangent = list()
        start = 0
        for vrbs in tr_vrbs:
            variable_size = np.prod(vrbs.get_shape().as_list())
            param = tf.reshape(
                self.flat_tangent[start:(start + variable_size)],
                vrbs.get_shape())
            tangent.append(param)
            start += variable_size
        '''
            Gradient of KL with tangent vector
            gradient_w_tangent : list of KL_prime*y for each variable
        '''
        gradient_w_tangent = [
            tf.reduce_sum(kl_g * t)
            for (kl_g, t) in zip(first_kl_grads, tangent)
        ]
        '''
        From the derivative of KL_prime*y : [dKL/dx1, dKL/dx2, ...]*y
        Forming the full n-by-n matrix A and computing A*y directly is hard to implement
        (numerically solving an (n*n)*(n*1) product), so first multiply the tangent 'y'
        into the gradient and then take the derivative again.
        'self.FVP' returns [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y,
        i.e. (second derivative of the KL divergence)*y for each variable => y -> JMJy (Fisher vector product)
        '''
        self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)

        #Get actual parameter value
        self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')

        #Set parameter values
        self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')

        #Estimate of the advantage function
        self.gae = GAE(self.sess, self.observation_size, self.args.gamma,
                       self.args.lamda, self.args.vf_constraint)

        #Initialization of the barrier function compensator
        self.bar_comp = BARRIER(self.args, self.sess, self.observation_size,
                                self.action_size)

        #Variable initializers
        self.sess.run(tf.global_variables_initializer())
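Note: self.FVP is later fed to CONJUGATE_GRADIENT, which is defined outside this snippet. A minimal NumPy sketch of the kind of conjugate-gradient solver such a routine typically implements (iteration count and tolerance are assumptions) is:

import numpy as np

def conjugate_gradient(fvp, b, n_iters=10, tol=1e-10):
    # Solve A x = b where A is only available through the
    # Fisher-vector product fvp(v) = A @ v.
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x starts at 0)
    p = r.copy()          # search direction
    r_dot = r.dot(r)
    for _ in range(n_iters):
        Ap = fvp(p)
        alpha = r_dot / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x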
Example #5
class TRPO():
    def __init__(self, args, env, sess):
        self.args = args
        self.sess = sess
        self.env = env
        self.firstIter = 1
        self.torque_bound = 100

        #Determine dimensions of observation & action space
        self.observation_size = 15
        self.action_size = 1

        # Build neural network model for observations/actions
        self.build_model()

        # Build barrier function model
        cbf.build_barrier(self)

        # Build GP model
        dynamics_gp.build_GP_model(self)

    # Build RL policy improvement model based on TRPO
    def build_model(self):
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.advantage = tf.placeholder(tf.float32, [None])

        #Mean of old action distribution
        self.old_action_dist_mu = tf.placeholder(tf.float32,
                                                 [None, self.action_size])
        self.old_action_dist_logstd = tf.placeholder(tf.float32,
                                                     [None, self.action_size])

        #NN framework for action distribution
        self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)

        # Get trainable variables for the policy (NN weights)
        tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='Policy')
        for i in tr_vrbs:
            print(i.op.name)

        #Construct distribution by repeating action_dist_logstd
        self.action_dist_logstd = tf.tile(action_dist_logstd,
                                          (tf.shape(action_dist_logstd)[0], 1))

        #Probability of action under old policy vs. new policy
        self.log_policy = LOG_POLICY(self.action_dist_mu,
                                     self.action_dist_logstd, self.action)
        self.log_old_policy = LOG_POLICY(self.old_action_dist_mu,
                                         self.old_action_dist_logstd,
                                         self.action)
        policy_ratio = tf.exp(self.log_policy - self.log_old_policy)

        #Number of observations in batch
        batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32)
        '''
        Equation (14) in paper
        Contribution of a single s_n : Expectation over a~q[ (new policy / q(is)) * advantage_old]
        '''
        surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage)

        #Define KL divergence and Shannon entropy, averaged over a set of inputs (policies)
        kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd,
                      self.action_dist_mu,
                      self.action_dist_logstd) / batch_size
        ent = GAUSS_ENTROPY(self.action_dist_mu,
                            self.action_dist_logstd) / batch_size

        #Define 'loss' quantities to constrain or maximize
        self.losses = [surr_single_state, kl, ent]

        # Maximize surrogate function over policy parameter 'theta' represented by neural network weights
        self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)

        #KL divergence where first argument is fixed
        kl_first_fixed = GAUSS_KL_FIRST_FIX(
            self.action_dist_mu, self.action_dist_logstd) / batch_size

        #Gradient of KL divergence w.r.t. theta (NN policy weights)
        first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)

        self.flat_tangent = tf.placeholder(tf.float32, [None])
        tangent = list()
        start = 0
        for vrbs in tr_vrbs:
            variable_size = np.prod(vrbs.get_shape().as_list())
            param = tf.reshape(
                self.flat_tangent[start:(start + variable_size)],
                vrbs.get_shape())
            tangent.append(param)
            start += variable_size
        '''
            Gradient of KL with tangent vector
            gradient_w_tangent : list of KL_prime*y for each variable
        '''
        gradient_w_tangent = [
            tf.reduce_sum(kl_g * t)
            for (kl_g, t) in zip(first_kl_grads, tangent)
        ]
        '''
        From the derivative of KL_prime*y : [dKL/dx1, dKL/dx2, ...]*y
        Forming the full n-by-n matrix A and computing A*y directly is hard to implement
        (numerically solving an (n*n)*(n*1) product), so first multiply the tangent 'y'
        into the gradient and then take the derivative again.
        'self.FVP' returns [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y,
        i.e. (second derivative of the KL divergence)*y for each variable => y -> JMJy (Fisher vector product)
        '''
        self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)

        #Get actual parameter value
        self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')

        #Set parameter values
        self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')

        #Estimate of the advantage function
        self.gae = GAE(self.sess, self.observation_size, self.args.gamma,
                       self.args.lamda, self.args.vf_constraint)

        #Initialization of the barrier function compensator
        self.bar_comp = BARRIER(self.args, self.sess, self.observation_size,
                                self.action_size)

        #Variable initializers
        self.sess.run(tf.global_variables_initializer())

    #Train TRPO policy
    def train(self, iteration):
        batch_path = self.rollout()
        theta_prev = self.get_value()

        #Get advantage from gae (train value function NN)
        advantage_estimated = self.gae.get_advantage(batch_path)

        #Get barrier compensator from barrier_comp (train compensator NN)
        if (iteration < 10):
            self.bar_comp.get_training_rollouts(batch_path)
            barr_loss = self.bar_comp.train()
        else:
            barr_loss = 0.

        #Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size]
        #Those batches come from OLD policy before updating theta
        #action_dist_mu = np.squeeze(np.concatenate([each_path["Action_mu"] for each_path in batch_path]))
        action_dist_mu = np.squeeze(
            np.concatenate(
                [each_path["Action_RL_mu"] for each_path in batch_path]))
        action_dist_logstd = np.squeeze(
            np.concatenate(
                [each_path["Action_logstd"] for each_path in batch_path]))
        observation = np.squeeze(
            np.concatenate(
                [each_path["Observation"] for each_path in batch_path]))
        action = np.squeeze(
            np.concatenate(
                [each_path["Action_RL"] for each_path in batch_path]))
        #action = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path]))

        #Obtain policy gradient of advantage function w.r.t. theta (g in paper)
        feed_dict = {
            self.obs: observation,
            self.action: np.expand_dims(action, axis=1),
            self.advantage: advantage_estimated,
            self.old_action_dist_mu: np.expand_dims(action_dist_mu, axis=1),
            self.old_action_dist_logstd: np.expand_dims(action_dist_logstd,
                                                        axis=1)
        }
        #feed_dict = {self.obs:observation, self.action:action, self.advantage:advantage_estimated, self.old_action_dist_mu:action_dist_mu, self.old_action_dist_logstd:action_dist_logstd}
        policy_g = self.sess.run(self.pg, feed_dict=feed_dict)

        # Computing fisher vector product : FIM * (policy gradient) where FIM = Fisher Information Matrix
        def fisher_vector_product(gradient):
            feed_dict[self.flat_tangent] = gradient
            return self.sess.run(self.FVP, feed_dict=feed_dict)

        #Solve Ax = g, where A is FIM and g is gradient of policy network, to obtain search direction for theta
        search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g)

        #KL divergence approximated by 1/2*(delta_transpose)*FIM*delta
        #Appendix C in TRPO Paper
        kl_approximated = 0.5 * search_direction.dot(
            fisher_vector_product(search_direction))

        #Calculate theta update
        maximal_step_length = np.sqrt(self.args.kl_constraint /
                                      kl_approximated)
        full_step = maximal_step_length * search_direction

        #Reverse gradient direction
        #full_step = -maximal_step_length * search_direction

        def surrogate(theta):
            self.set_value(theta)
            return self.sess.run(self.losses[0], feed_dict=feed_dict)

        #Use line search to ensure improvement of surrogate objective and satisfaction of KL constraint
        #Start with maximal step length and exponentially shrink until objective improves
        new_theta = LINE_SEARCH(surrogate,
                                theta_prev,
                                full_step,
                                self.args.num_backtracking,
                                name='Surrogate loss')

        #Update without line search
        #new_theta = theta_prev + full_step

        #Update policy parameter theta
        self.set_value(new_theta, update_info=0)

        #Update value function neural network
        #Policy update is performed using old value function parameter
        self.gae.train()

        #After update, store values at log
        surrogate_after, kl_after, _ = self.sess.run(self.losses,
                                                     feed_dict=feed_dict)
        logs = {"Surrogate loss": surrogate_after, "KL_DIV": kl_after}
        logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path])
        logs["Num episode"] = len([path["Reward"] for path in batch_path])
        logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path])
        logs["Episode_Avg_Reward"] = logs["Total Sum"] / logs["Num episode"]
        logs["Compensator_Fit"] = barr_loss
        logs["Final_Action"] = np.squeeze(
            np.concatenate([each_path["Action"] for each_path in batch_path]))
        logs["Action_bar"] = np.squeeze(
            np.concatenate(
                [each_path["Action_bar"] for each_path in batch_path]))
        logs["Action_BAR"] = np.squeeze(
            np.concatenate(
                [each_path["Action_BAR"] for each_path in batch_path]))
        logs["Observation"] = np.squeeze(
            np.concatenate(
                [each_path["Observation"] for each_path in batch_path]))
        logs["Reward"] = np.squeeze(
            np.concatenate([each_path["Reward"] for each_path in batch_path]))
        logs["Done"] = np.squeeze(
            np.concatenate([each_path["Done"] for each_path in batch_path]))
        return logs

    #Set up NN to parameterize the control policy
    def build_policy(self, states, name='Policy'):
        print('Initializing Policy network')
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            h1 = LINEAR(states, self.args.hidden_size, name='h1')
            h1_n1 = tf.nn.sigmoid(h1)
            h2 = LINEAR(h1_n1, self.args.hidden_size, name='h2')
            h2_n1 = tf.nn.sigmoid(h2)
            h3 = LINEAR(h2_n1, self.action_size, name='h3')

            #Initialize action std_deviation
            #init = lambda shape, dtype, partition_info=None : 0.01*np.random.randn(*shape)
            #action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size])

            #Initialize action log std_deviation to a constant (fixed initial exploration noise)
            action_dist_logstd = tf.get_variable(
                'logstd',
                initializer=tf.constant_initializer(0),
                shape=[1, self.action_size])

        return h3, action_dist_logstd

    #Get action from the current observation (sampled based on NN policy)
    def act(self, obs):
        #Need to expand first dimension (batch axis), make [1, observation size]
        obs_expanded = np.expand_dims(np.squeeze(obs), 0)
        #obs_expanded = obs
        #Get action distribution from policy network
        action_dist_mu, action_dist_logstd = self.sess.run(
            [self.action_dist_mu, self.action_dist_logstd],
            feed_dict={self.obs: obs_expanded})
        #Sample action from gaussian distribution
        action = np.random.normal(loc=action_dist_mu,
                                  scale=np.exp(action_dist_logstd))
        return action, action_dist_mu, action_dist_logstd

    #Simulate dynamics for a given rollout
    def rollout(self):
        #Initialize variables
        paths = list()
        timesteps = 0
        self.num_epi = 0

        #Utilize GP from previous iteration while training current iteration
        if (self.firstIter == 1):
            pass
        else:
            self.GP_model_prev = self.GP_model.copy()
            dynamics_gp.build_GP_model(self)

        #Iterate through the specified number of episodes
        while timesteps < self.args.timesteps_per_batch:
            self.num_epi += 1

            #Reset the environment
            obs, action, rewards, done, action_dist_mu, action_dist_logstd, action_bar, action_BAR, action_RL_mu_, action_RL_ = [], [], [], [], [], [], [], [], [], []
            prev_obs = self.env.reset()
            obs = np.expand_dims(np.squeeze(prev_obs), 0)

            #Simulate dynamics for specified time
            for i in range(self.args.max_path_length):
                #self.env.render()
                prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0)
                #prev_obs_expanded = prev_obs
                #Agent takes actions from sampled action and action distribution parameters based on observation
                #All have shape of [1, action size]
                action_rl, action_dist_mu_rl, action_dist_logstd_ = self.act(
                    prev_obs)

                #Utilize compensation barrier function
                u_BAR_ = self.bar_comp.get_action(prev_obs)
                action_RL = action_rl + u_BAR_
                action_dist_mu_RL = action_dist_mu_rl + u_BAR_

                t = 0.05 * i
                # Get GP dynamics
                if (self.firstIter == 1):
                    [f, g, x, std
                     ] = dynamics_gp.get_GP_dynamics(self, prev_obs_expanded,
                                                     action_RL, t)
                else:
                    [f, g, x, std] = dynamics_gp.get_GP_dynamics_prev(
                        self, prev_obs_expanded, action_RL, t)

                #Utilize safety barrier function
                u_bar_ = cbf.control_barrier(self,
                                             np.squeeze(prev_obs_expanded),
                                             action_dist_mu_RL, f, g, x, std)
                #action_ = action_RL + u_bar_
                action_dist_mu_ = action_dist_mu_RL + u_bar_

                #Stochastic action
                action_ = np.random.normal(loc=action_dist_mu_,
                                           scale=np.exp(action_dist_logstd_))

                #Store observation and action/distribution
                obs = np.append(obs, prev_obs_expanded, axis=0)
                action_RL_mu_.append(action_dist_mu_rl)
                action_RL_.append(action_rl)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)
                action_dist_mu.append(action_dist_mu_)
                action_dist_logstd.append(action_dist_logstd_)

                # Simulate dynamics after action
                next_obs, reward_, done_ = self.env.step(action_)
                reward_ = np.squeeze(reward_)
                #next_obs, reward_, done_, _ = self.env.step(action_)

                #Get results
                done.append(done_)
                rewards.append(reward_)
                prev_obs = next_obs

                if i == self.args.max_path_length - 1:
                    obs = obs[1:self.args.max_path_length + 1, :]
                    path = {
                        "Observation": obs,
                        "Action": np.concatenate(action),
                        "Action_RL_mu": np.concatenate(action_RL_mu_),
                        "Action_RL": np.concatenate(action_RL_),
                        "Action_mu": np.concatenate(action_dist_mu),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Action_logstd": np.concatenate(action_dist_logstd),
                        "Done": np.asarray(done),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break

            #For timing purposes, only update GP dynamics for certain number of timesteps
            if (timesteps < 500):
                dynamics_gp.update_GP_dynamics(self, path)
            timesteps += len(rewards)
        #print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps))
        self.firstIter = 0
        return paths
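Note: self.gae.get_advantage(batch_path) comes from an external GAE class. A minimal sketch of the generalized advantage estimation it is named after (the value estimates, gamma, and lam arguments are assumed inputs, and termination masking is omitted for brevity) is:

import numpy as np

def gae_advantage(rewards, values, gamma=0.99, lam=0.95):
    # values has one extra entry for the state after the last reward.
    advantages = np.zeros(len(rewards))
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        advantages[t] = last_adv = delta + gamma * lam * last_adv
    return advantages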
Example #6
class TRPO():
    def __init__(self, args, env, sess):
        self.args = args
        self.sess = sess
        self.env = env
        self.torque_bound = 8

        #Set up observation space and action space
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        print('Observation space', self.observation_space)
        print('Action space', self.action_space)

        #Determine dimensions of observation & action space
        self.observation_size = self.env.observation_space.shape[0]
        self.action_size = self.action_space.shape[0]

        # Build neural network model for observations/actions
        self.build_model()

        # Build barrier function model
        self.build_barrier()

    #Build barrier function model
    def build_barrier(self):
        N = self.action_size
        #self.P = matrix(np.eye(N), tc='d')
        self.P = matrix(np.diag([1., 10000000.]), tc='d')
        self.q = matrix(np.zeros(N + 1))
        self.H1 = np.array([1, 0.001])
        self.H2 = np.array([1, -0.001])
        self.H3 = np.array([-1, 0.001])
        self.H4 = np.array([-1, -0.001])
        self.F = 1

    # Build RL policy improvement model based on TRPO
    def build_model(self):
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.advantage = tf.placeholder(tf.float32, [None])

        #Mean of old action distribution
        self.old_action_dist_mu = tf.placeholder(tf.float32,
                                                 [None, self.action_size])
        self.old_action_dist_logstd = tf.placeholder(tf.float32,
                                                     [None, self.action_size])

        #NN framework for action distribution
        self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)

        # Get trainable variables for the policy (NN weights)
        tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='Policy')
        for i in tr_vrbs:
            print(i.op.name)

        #Construct distribution by repeating action_dist_logstd
        self.action_dist_logstd = tf.tile(action_dist_logstd,
                                          (tf.shape(action_dist_logstd)[0], 1))

        #Probability of action under old policy vs. new policy
        self.log_policy = LOG_POLICY(self.action_dist_mu,
                                     self.action_dist_logstd, self.action)
        self.log_old_policy = LOG_POLICY(self.old_action_dist_mu,
                                         self.old_action_dist_logstd,
                                         self.action)
        policy_ratio = tf.exp(self.log_policy - self.log_old_policy)

        #Number of observations in batch
        batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32)
        '''
        Equation (14) in paper
        Contribution of a single s_n : Expectation over a~q[ (new policy / q(is)) * advantage_old]
        '''
        surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage)

        #Define KL divergence and Shannon entropy, averaged over a set of inputs (policies)
        kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd,
                      self.action_dist_mu,
                      self.action_dist_logstd) / batch_size
        ent = GAUSS_ENTROPY(self.action_dist_mu,
                            self.action_dist_logstd) / batch_size

        #Define 'loss' quantities to constrain or maximize
        self.losses = [surr_single_state, kl, ent]

        # Maximize surrogate function over policy parameter 'theta' represented by neural network weights
        self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)

        #KL divergence where first argument is fixed
        kl_first_fixed = GAUSS_KL_FIRST_FIX(
            self.action_dist_mu, self.action_dist_logstd) / batch_size

        #Gradient of KL divergence w.r.t. theta (NN policy weights)
        first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)
        '''
        REVIEW FROM HERE ONWARDS
        #??????????????????????????????????????????????????????????
        '''
        self.flat_tangent = tf.placeholder(tf.float32, [None])
        tangent = list()
        start = 0
        for vrbs in tr_vrbs:
            variable_size = np.prod(vrbs.get_shape().as_list())
            param = tf.reshape(
                self.flat_tangent[start:(start + variable_size)],
                vrbs.get_shape())
            tangent.append(param)
            start += variable_size
        '''
            Gradient of KL with tangent vector
            gradient_w_tangent : list of KL_prime*y for each variable
        '''
        gradient_w_tangent = [
            tf.reduce_sum(kl_g * t)
            for (kl_g, t) in zip(first_kl_grads, tangent)
        ]
        '''
        From the derivative of KL_prime*y : [dKL/dx1, dKL/dx2, ...]*y
        Forming the full n-by-n matrix A and computing A*y directly is hard to implement
        (numerically solving an (n*n)*(n*1) product), so first multiply the tangent 'y'
        into the gradient and then take the derivative again.
        'self.FVP' returns [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y,
        i.e. (second derivative of the KL divergence)*y for each variable => y -> JMJy (Fisher vector product)
        '''
        self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)

        #Get actual parameter value
        self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')

        #Set parameter values
        self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')

        #Estimate of the advantage function
        self.gae = GAE(self.sess, self.observation_size, self.args.gamma,
                       self.args.lamda, self.args.vf_constraint)

        #Initialization of the barrier function compensator
        self.bar_comp = BARRIER(self.args, self.sess, self.observation_size,
                                self.action_size)

        #Variable initializers
        self.sess.run(tf.global_variables_initializer())

    #Train TRPO policy
    def train(self):
        batch_path = self.rollout()
        theta_prev = self.get_value()

        #Get advantage from gae (train value function NN)
        advantage_estimated = self.gae.get_advantage(batch_path)

        #Get barrier compensator from barrier_comp (train compensator NN)
        self.bar_comp.get_training_rollouts(batch_path)
        barr_loss = self.bar_comp.train()

        #Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size]
        #Those batches come from OLD policy before updating theta
        action_dist_mu = np.squeeze(
            np.concatenate(
                [each_path["Action_mu"] for each_path in batch_path]))
        action_dist_logstd = np.squeeze(
            np.concatenate(
                [each_path["Action_logstd"] for each_path in batch_path]))
        observation = np.squeeze(
            np.concatenate(
                [each_path["Observation"] for each_path in batch_path]))
        action = np.squeeze(
            np.concatenate([each_path["Action"] for each_path in batch_path]))

        #Obtain policy gradient of advantage function w.r.t. theta (g in paper)
        feed_dict = {
            self.obs: observation,
            self.action: np.expand_dims(action, axis=1),
            self.advantage: advantage_estimated,
            self.old_action_dist_mu: np.expand_dims(action_dist_mu, axis=1),
            self.old_action_dist_logstd: np.expand_dims(action_dist_logstd,
                                                        axis=1)
        }
        #feed_dict = {self.obs:observation, self.action:action, self.advantage:advantage_estimated, self.old_action_dist_mu:action_dist_mu, self.old_action_dist_logstd:action_dist_logstd}
        policy_g = self.sess.run(self.pg, feed_dict=feed_dict)

        # Computing fisher vector product : FIM * (policy gradient) where FIM = Fisher Information Matrix
        def fisher_vector_product(gradient):
            feed_dict[self.flat_tangent] = gradient
            return self.sess.run(self.FVP, feed_dict=feed_dict)

        #Solve Ax = g, where A is FIM and g is gradient of policy network, to obtain search direction for theta
        search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g)

        #KL divergence approximated by 1/2*(delta_transpose)*FIM*delta
        #Appendix C in TRPO Paper
        kl_approximated = 0.5 * search_direction.dot(
            fisher_vector_product(search_direction))

        #Calculate theta update
        maximal_step_length = np.sqrt(self.args.kl_constraint /
                                      kl_approximated)
        full_step = maximal_step_length * search_direction

        def surrogate(theta):
            self.set_value(theta)
            return self.sess.run(self.losses[0], feed_dict=feed_dict)

        #Use line search to ensure improvement of surrogate objective and satisfaction of KL constraint
        #Start with maximal step length and exponentially shrink until objective improves
        new_theta = LINE_SEARCH(surrogate,
                                theta_prev,
                                full_step,
                                self.args.num_backtracking,
                                name='Surrogate loss')

        #Update without line search
        #new_theta = theta_prev + full_step

        #Update policy parameter theta
        self.set_value(new_theta, update_info=0)

        #Update value function neural network
        #Policy update is performed using old value function parameter
        self.gae.train()

        #After update, store values at log
        surrogate_after, kl_after, _ = self.sess.run(self.losses,
                                                     feed_dict=feed_dict)
        logs = {"Surrogate loss": surrogate_after, "KL_DIV": kl_after}
        logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path])
        logs["Num episode"] = len([path["Reward"] for path in batch_path])
        logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path])
        logs["Episode Avg. Reward"] = logs["Total Sum"] / logs["Num episode"]
        logs["Compensator_Fit"] = barr_loss
        logs["Final_Action"] = np.squeeze(
            np.concatenate([each_path["Action"] for each_path in batch_path]))
        logs["Action_bar"] = np.squeeze(
            np.concatenate(
                [each_path["Action_bar"] for each_path in batch_path]))
        logs["Action_BAR"] = np.squeeze(
            np.concatenate(
                [each_path["Action_BAR"] for each_path in batch_path]))
        logs["Observation"] = np.squeeze(
            np.concatenate(
                [each_path["Observation"] for each_path in batch_path]))
        logs["Reward"] = np.squeeze(
            np.concatenate([each_path["Reward"] for each_path in batch_path]))
        return logs

    #Set up NN to parameterize the control policy
    def build_policy(self, states, name='Policy'):
        print('Initializing Policy network')
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            h1 = LINEAR(states, self.args.hidden_size, name='h1')
            h1_n1 = tf.nn.relu(h1)
            h2 = LINEAR(h1_n1, self.args.hidden_size, name='h2')
            h2_n1 = tf.nn.relu(h2)
            h3 = LINEAR(h2_n1, self.action_size, name='h3')

            #Initialize action std_deviation
            #init = lambda shape, dtype, partition_info=None : 0.01*np.random.randn(*shape)
            #action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size])

            #Initialize action log std_deviation to a constant (fixed initial exploration noise)
            action_dist_logstd = tf.get_variable(
                'logstd',
                initializer=tf.constant_initializer(-1.5),
                shape=[1, self.action_size])

        return h3, action_dist_logstd

    #Get action from the current observation (sampled based on NN policy)
    def act(self, obs):
        #Need to expand first dimension (batch axis), make [1, observation size]
        obs_expanded = np.expand_dims(np.squeeze(obs), 0)
        #obs_expanded = obs
        #Get action distribution from policy network
        action_dist_mu, action_dist_logstd = self.sess.run(
            [self.action_dist_mu, self.action_dist_logstd],
            feed_dict={self.obs: obs_expanded})
        #Sample action from gaussian distribution
        action = np.random.normal(loc=action_dist_mu,
                                  scale=np.exp(action_dist_logstd))
        return action, action_dist_mu, action_dist_logstd

    #Get compensatory action based on satisfaction of barrier function
    def control_barrier(self, obs, u_rl):
        #Define gamma for the barrier function
        gamma_b = 0.5

        #Get the dynamics of the system from the current time step with the RL action
        def get_dynamics(obs, u_rl):
            dt = 0.05
            G = 10
            m = 1
            l = 1
            obs = np.squeeze(obs)
            theta = np.arctan2(obs[1], obs[0])
            theta_dot = obs[2]
            f = np.array([
                -3 * G / (2 * l) * np.sin(theta + np.pi) * dt**2 +
                theta_dot * dt + theta + 3 / (m * l**2) * u_rl * dt**2,
                theta_dot - 3 * G / (2 * l) * np.sin(theta + np.pi) * dt + 3 /
                (m * l**2) * u_rl * dt
            ])
            g = np.array([3 / (m * l**2) * dt**2, 3 / (m * l**2) * dt])
            x = np.array([theta, theta_dot])
            return [np.squeeze(f), np.squeeze(g), np.squeeze(x)]

        [f, g, x] = get_dynamics(obs, u_rl)

        #Set up Quadratic Program to satisfy the Control Barrier Function
        G = np.array([[
            np.dot(self.H1, g),
            np.dot(self.H2, g),
            np.dot(self.H3, g),
            np.dot(self.H4, g), 1., -1.
        ], [1, 1, 1, 1, 0, 0]])
        G = np.transpose(G)
        h = np.array([
            gamma_b * self.F - np.dot(self.H1, f) +
            (1 - gamma_b) * np.dot(self.H1, x), gamma_b * self.F -
            np.dot(self.H2, f) + (1 - gamma_b) * np.dot(self.H2, x),
            gamma_b * self.F - np.dot(self.H3, f) +
            (1 - gamma_b) * np.dot(self.H3, x), gamma_b * self.F -
            np.dot(self.H4, f) + (1 - gamma_b) * np.dot(self.H4, x),
            self.torque_bound - u_rl, self.torque_bound + u_rl
        ])

        #Convert numpy arrays to cvx matrices to set up QP
        G = matrix(G, tc='d')
        h = matrix(h, tc='d')

        solvers.options['show_progress'] = False
        sol = solvers.qp(self.P, self.q, G, h)
        u_bar = sol['x']

        if (np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) - 0.001 >=
                self.torque_bound):
            u_bar[0] = self.torque_bound - u_rl
            print("Error in QP")
        elif (np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) + 0.001 <=
              -self.torque_bound):
            u_bar[0] = -self.torque_bound - u_rl
            print("Error in QP")
        else:
            pass

        return np.expand_dims(np.array(u_bar[0]), 0)

    #Simulate dynamics for a given rollout
    def rollout(self):
        #Initialize variables
        paths = list()
        timesteps = 0
        self.num_epi = 0
        #Iterate through the specified number of episodes
        while timesteps < self.args.timesteps_per_batch:
            self.num_epi += 1

            #Reset the environment
            obs, action, rewards, done, action_dist_mu, action_dist_logstd, action_bar, action_BAR = [], [], [], [], [], [], [], []
            prev_obs = self.env.reset()

            #Simulate dynamics for specified time
            for i in range(self.args.max_path_length):
                #self.env.render()
                prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0)
                #prev_obs_expanded = prev_obs
                #Agent takes actions from sampled action and action distribution parameters based on observation
                #All have shape of [1, action size]
                action_rl, action_dist_mu_rl, action_dist_logstd_ = self.act(
                    prev_obs)

                #Utilize compensation barrier function
                u_BAR_ = self.bar_comp.get_action(prev_obs)
                action_RL = action_rl + u_BAR_
                action_dist_mu_RL = action_dist_mu_rl + u_BAR_

                #Utilize safety barrier function
                u_bar_ = self.control_barrier(np.squeeze(prev_obs_expanded),
                                              action_dist_mu_RL)
                #action_ = action_RL + u_bar_
                action_dist_mu_ = action_dist_mu_RL + u_bar_

                #Stochastic action
                action_ = np.random.normal(loc=action_dist_mu_,
                                           scale=np.exp(action_dist_logstd_))

                #Store observation and action/distribution
                obs.append(prev_obs_expanded)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)
                action_dist_mu.append(action_dist_mu_)
                action_dist_logstd.append(action_dist_logstd_)

                # Simulate dynamics after action
                next_obs, reward_, done_, _ = self.env.step(action_)

                #Get results
                done.append(done_)
                rewards.append(reward_)
                prev_obs = next_obs

                if done_:
                    path = {
                        "Observation": np.concatenate(obs),
                        "Action": np.concatenate(action),
                        "Action_mu": np.concatenate(action_dist_mu),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Action_logstd": np.concatenate(action_dist_logstd),
                        "Done": np.asarray(done),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break

            timesteps += len(rewards)
        #print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps))
        return paths

    #Simulate/Visualize latest policy
    def sim(self):
        observation = self.env.reset()
        total = 0
        for t in range(600):
            #Render environment
            self.env.render()

            #Get action from NN policy
            obs_expanded = np.expand_dims(np.squeeze(observation), 0)

            #Get action distribution from policy network
            action_dist_mu, action_dist_logstd = self.sess.run(
                [self.action_dist_mu, self.action_dist_logstd],
                feed_dict={self.obs: obs_expanded})

            #Sample action from gaussian distribution
            action_rl = np.random.normal(loc=action_dist_mu,
                                         scale=np.exp(action_dist_logstd))

            #Get compensatory barrier action
            u_BAR_ = self.bar_comp.get_action(obs_expanded)
            u_RL = action_rl + u_BAR_

            #Compensate with barrier-based control
            u_bar = self.control_barrier(obs_expanded, u_RL)
            action = u_bar + u_RL

            observation, reward, done, info = self.env.step(action)
            total = total + reward
            if done:
                print("Accumulated Reward: {}".format(total))
                break
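Note: LINE_SEARCH is imported from elsewhere. A sketch of the backtracking line search described in the comments above (start from the full step and exponentially shrink it until the surrogate loss improves; the exact parameter handling is an assumption) might look like:

def line_search(surrogate, theta_prev, full_step, num_backtracking, name=''):
    # Shrink the step by half each iteration until the surrogate loss improves.
    loss_prev = surrogate(theta_prev)
    for k in range(num_backtracking):
        step_frac = 0.5 ** k
        theta_new = theta_prev + step_frac * full_step
        if surrogate(theta_new) < loss_prev:
            return theta_new
    # No improvement found: keep the old parameters.
    return theta_prev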
Example #7
class TRPO():
    def __init__(self, args, env, sess):
        self.args = args
        self.sess = sess
        self.env = env

        #Set up observation space and action space
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        print('Observation space', self.observation_space)
        print('Action space', self.action_space)

        #Determine dimensions of observation & action space
        self.observation_size = self.env.observation_space.shape[0]
        self.action_size = self.action_space.shape[0]

        # Build neural network model for observations/actions
        self.build_model()

    def build_model(self):
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.advantage = tf.placeholder(tf.float32, [None])

        #Mean of old action distribution
        self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size])
        self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size])

        #NN framework for action distribution
        self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)

        #Construct distribution by repeating action_dist_logstd
        self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0],1))

        #Probability of action under old policy vs. new policy
        self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action)
        self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action)
        policy_ratio = tf.exp(self.log_policy - self.log_old_policy)
        
        #Number of observations in batch
        batch_size = tf.cast(tf.shape(self.obs)[0], tf.float32)
        
        '''
        Equation (14) in paper
        Contribution of a single s_n : Expectation over a~q[ (new policy / q(is)) * advantage_old]
        '''
        surr_single_state = -tf.reduce_mean(policy_ratio*self.advantage)
        
        
        #Define KL divergence and Shannon entropy, averaged over a set of inputs (policies)
        kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd, self.action_dist_mu, self.action_dist_logstd) / batch_size
        ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size
        
        
        #Define 'loss' quantities to constrain or maximize
        self.losses = [surr_single_state, kl, ent]
        
        # Get trainable variables for the policy (NN weights)
        tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy')
        for i in tr_vrbs:
            print(i.op.name)
        
        # Maximize surrogate function over policy parameter 'theta' represented by neural network weights
        self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)
        
        #KL divergence where first argument is fixed
        kl_first_fixed = GAUSS_KL_FIRST_FIX(self.action_dist_mu, self.action_dist_logstd) / batch_size
        
        #Gradient of KL divergence w.r.t. theta (NN policy weights)
        first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)
        
        '''
        REVIEW FROM HERE ONWARDS
        #??????????????????????????????????????????????????????????
        '''
        self.flat_tangent = tf.placeholder(tf.float32,[None])
        tangent = list()
        start = 0
        for vrbs in tr_vrbs:
            variable_size = np.prod(vrbs.get_shape().as_list())
            param = tf.reshape(self.flat_tangent[start:(start+variable_size)], vrbs.get_shape())
            tangent.append(param)
            start += variable_size
        '''
            Gradient of KL with tangent vector
            gradient_w_tangent : list of KL_prime*y for each variable
        '''
        gradient_w_tangent = [tf.reduce_sum(kl_g*t) for (kl_g, t) in zip(first_kl_grads, tangent)]
        
        '''
        From the derivative of KL_prime*y : [dKL/dx1, dKL/dx2, ...]*y
        Forming the full n-by-n matrix A and computing A*y directly is hard to implement
        (numerically solving an (n*n)*(n*1) product), so first multiply the tangent 'y'
        into the gradient and then take the derivative again.
        'self.FVP' returns [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y,
        i.e. (second derivative of the KL divergence)*y for each variable => y -> JMJy (Fisher vector product)
        '''
        self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)
        
        #Get actual parameter value
        self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')
        
        #Set parameter values
        self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')
        
        #Estimate of the advantage function 
        self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint)
        
        self.sess.run(tf.global_variables_initializer())
        
        
    def train(self):
        batch_path = self.rollout()
        theta_prev = self.get_value()
        
        #Get advantage from gae (train value function NN)
        advantage_estimated = self.gae.get_advantage(batch_path)
        
        #Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size]
        #Those batches come from OLD policy before updating theta
        action_dist_mu = np.squeeze(np.concatenate([each_path["Action_mu"] for each_path in batch_path]))
        action_dist_logstd = np.squeeze(np.concatenate([each_path["Action_logstd"] for each_path in batch_path]))
        observation = np.squeeze(np.concatenate([each_path["Observation"] for each_path in batch_path]))
        action = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path]))
        
        #Obtain policy gradient of advantage function w.r.t. theta (g in paper)
        feed_dict = {self.obs:observation, self.action:np.expand_dims(action, axis=1), self.advantage:advantage_estimated, self.old_action_dist_mu:np.expand_dims(action_dist_mu, axis=1), self.old_action_dist_logstd:np.expand_dims(action_dist_logstd, axis=1)}
        #feed_dict = {self.obs:observation, self.action:action, self.advantage:advantage_estimated, self.old_action_dist_mu:action_dist_mu, self.old_action_dist_logstd:action_dist_logstd}
        policy_g = self.sess.run(self.pg, feed_dict=feed_dict)
        
        # Computing fisher vector product : FIM * (policy gradient) where FIM = Fisher Information Matrix
        def fisher_vector_product(gradient):
            feed_dict[self.flat_tangent] = gradient
            return self.sess.run(self.FVP, feed_dict=feed_dict)
            
        #Solve Ax = g, where A is FIM and g is gradient of policy network, to obtain search direction for theta
        search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g)
        
        #KL divergence approximated by 1/2*(delta_transpose)*FIM*delta
        #Appendix C in TRPO Paper
        kl_approximated = 0.5*search_direction.dot(fisher_vector_product(search_direction))
        
        #Calculate theta update
        maximal_step_length = np.sqrt(self.args.kl_constraint / kl_approximated)
        full_step = maximal_step_length * search_direction
        
        def surrogate(theta):
            self.set_value(theta)
            return self.sess.run(self.losses[0], feed_dict=feed_dict)
            
        #Use line search to ensure improvement of surrogate objective and satisfaction of KL constraint
        #Start with maximal step length and exponentially shrink until objective improves
        new_theta = LINE_SEARCH(surrogate, theta_prev, full_step, self.args.num_backtracking, name='Surrogate loss')
        
        #Update without line search
        #new_theta = theta_prev + full_step
        
        #Update policy parameter theta
        self.set_value(new_theta, update_info=0)
        
        #Update value function neural network
        #Policy update is performed using old value function parameter
        self.gae.train()
        
        #After update, store values at log
        surrogate_after, kl_after, _ = self.sess.run(self.losses, feed_dict=feed_dict)
        logs = {"Surrogate loss":surrogate_after, "KL_DIV":kl_after}
        logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path])
        logs["Num episode"] = len([path["Reward"] for path in batch_path])
        logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path])
        logs["Episode_Avg_Reward"] = logs["Total Sum"] / logs["Num episode"]
        logs["Final_Action"] = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path]))
        logs["Observation"] = np.squeeze(np.concatenate([each_path["Observation"] for each_path in batch_path]))
        logs["Reward"] = np.squeeze(np.concatenate([each_path["Reward"] for each_path in batch_path]))
        return logs
        

    #USE SOFTMAX RELU INSTEAD OF RELU, OUTPUT WEIGHTS/BIASES IN EASIER FORMAT   
    #Set up NN to parameterize the control policy
    def build_policy(self, states, name='Policy'):
        print('Initializing Policy network')
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            h1 = LINEAR(states, self.args.hidden_size, name='h1')
            #h1_n1 = tf.nn.relu(h1)
            h1_n1 = tf.nn.softmax(h1)
            h2 = LINEAR(h1_n1, self.args.hidden_size, name='h2')
            #h2_n1 = tf.nn.relu(h2)
            h2_n1 = tf.nn.softmax(h2)
            h3 = LINEAR(h2_n1, self.action_size, name='h3')

            init = lambda shape, dtype, partition_info=None : 0.01*np.random.randn(*shape)
            action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size])
            
        return h3, action_dist_logstd
    
    def act(self, obs):
        #Need to expand first dimension (batch axis), make [1, observation size]
        obs_expanded = np.expand_dims(np.squeeze(obs), 0)
        #obs_expanded = obs
        #Get action distribution from policy network
        action_dist_mu, action_dist_logstd = self.sess.run([self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs:obs_expanded})
        #Sample action from gaussian distribution
        action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd))
        return action, action_dist_mu, action_dist_logstd
    
    def rollout(self):
        #Initialize variables
        paths = list()
        timesteps = 0
        self.num_epi = 0
        #Iterate through the specified number of episodes
        while timesteps < self.args.timesteps_per_batch:
            self.num_epi += 1
            
            #Reset the environment
            obs, action, rewards, done, action_dist_mu, action_dist_logstd = [], [], [], [], [], []
            prev_obs = self.env.reset()
            
            #Simulate dynamics for specified time
            for i in range(self.args.max_path_length):
                prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0)
                #prev_obs_expanded = prev_obs
                #Agent takes actions from sampled action and action distribution parameters based on observation
                #All have shape of [1, action size]
                action_, action_dist_mu_, action_dist_logstd_ = self.act(prev_obs)
                
                #Store observation and action/distribution
                obs.append(prev_obs_expanded)
                action.append(action_)
                action_dist_mu.append(action_dist_mu_)
                action_dist_logstd.append(action_dist_logstd_)
                
                # Simulate dynamics after action
                next_obs, reward_, done_, _ = self.env.step(action_)
                
                #Get results
                done.append(done_)
                rewards.append(reward_)
                prev_obs = next_obs
                
                if done_:
                    path = {"Observation":np.concatenate(obs),
                            "Action":np.concatenate(action),
                            "Action_mu":np.concatenate(action_dist_mu),
                            "Action_logstd":np.concatenate(action_dist_logstd),
                            "Done":np.asarray(done),
                            "Reward":np.asarray(rewards)}
                    paths.append(path)
                    break
                
            timesteps += len(rewards)
        #print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps))
        return paths
                
    def sim(self):
        observation = self.env.reset()
        total = 0
        for t in range(600):
            #Render environment
            self.env.render()
            
            #Get action from NN policy
            obs_expanded = np.expand_dims(np.squeeze(observation), 0)
            #obs_expanded = obs
            #Get action distribution from policy network
            action_dist_mu, action_dist_logstd = self.sess.run([self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs:obs_expanded})
            #Sample action from gaussian distribution
            action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd))
            
            observation, reward, done, info = self.env.step(action)
            total = total + reward
            if done:
                print("Accumulated Reward: {}".format(total))
                break                
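# --- Hedged sketch (not part of the original example) -----------------------
# The policy code above samples actions from a diagonal Gaussian and scores
# them with an undefined LOG_POLICY helper. A minimal NumPy sketch of the
# log-density such a helper presumably computes is shown below; the function
# name and signature are assumptions, not the authors' implementation.
import numpy as np

def gaussian_log_density(mu, logstd, action):
    # log N(action | mu, exp(logstd)^2), summed over the action dimension
    var = np.exp(2.0 * logstd)
    return np.sum(-0.5 * np.log(2.0 * np.pi) - logstd
                  - 0.5 * (action - mu) ** 2 / var, axis=-1)

# Sampling as in act(), then scoring the sample under the same distribution
mu = np.zeros((1, 2))
logstd = np.full((1, 2), -0.5)
sample = np.random.normal(loc=mu, scale=np.exp(logstd))
print(gaussian_log_density(mu, logstd, sample))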
Example #8
0
class TRPO():
    def __init__(self, args, env, sess, prior):
        self.num_epi = 0
        self.args = args
        self.sess = sess
        self.env = env
        self.prior = prior                
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        print('Observation space', self.observation_space)
        print('Action space', self.action_space)
        # 'Box' observation_space and 'Box' action_space
        self.observation_size = self.env.observation_space.shape[0]
        # np.prod: returns the product of array elements over a given axis
        self.action_size = self.action_space.shape[0]

        # Build model and create variables
        self.build_model()

    def build_model(self):
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.advantage = tf.placeholder(tf.float32, [None])
        # Mean of old action distribution
        self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size])
        self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size])
        '''
        Mean value for each action : each action has a Gaussian distribution with a mean and a standard deviation
        With continuous state and action spaces, use a GAUSSIAN DISTRIBUTION that maps from the input features to the mean of the Gaussian distribution for each action
        A separate set of parameters specifies the log standard deviation of each action
        => The policy is defined by the normal distribution (mean=NeuralNet(states), stddev=exp(logstd))
        '''
        self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)
        # Make log standard shape from [1, action size] => [batch size, action size]
        # tf.tile(A, reps) : constructs a tensor by repeating A as given by 'reps'
        # Use tf.shape instead of tf.get_shape() when 'None' used in placeholder
        self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0], 1))

        # outputs probability of taking 'self.action'
        # new distribution  
        self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action)
        # old distribution
        self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action)
        
        # Take exponential to log policy distribution
        '''
        Equation (14) in paper
        Contribution of a single s_n : expectation over a ~ q of [(new policy / q) * advantage_old]  (importance sampling)
        The sampling distribution q is normally the old policy
        '''
        batch_size = tf.shape(self.obs)[0]
        # print('Batch size %d' % batch_size)
        policy_ratio = tf.exp(self.log_policy - self.log_old_policy)
        surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage)
        # tf.shape returns dtype=int32, tensor conversion requested dtype float32
        batch_size = tf.cast(batch_size, tf.float32)
        # Average KL divergence and Shannon entropy, averaged over the set of inputs to the function mu
        kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd, self.action_dist_mu, self.action_dist_logstd) / batch_size
        ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size
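        # Assumption about the undefined helpers: for diagonal Gaussians,
        # GAUSS_KL presumably sums  log(s2/s1) + (s1^2 + (mu1 - mu2)^2)/(2*s2^2) - 0.5
        # over the action dimension, and GAUSS_ENTROPY sums 0.5*log(2*pi*e*s^2),
        # with s = exp(logstd).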

        self.losses = [surr_single_state, kl, ent]
        #tr_vrbs = tf.trainable_variables()
        tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy')
        for i in tr_vrbs:
            print(i.op.name)

        '''
            Compute a search direction using a linear approx to objective and quadratic approx to constraint
            => The search direction is computed by approximately solving 'Ax=g' where A is FIM
                Quadratic approximation to KL divergence constraint
        '''
        # Maximize surrogate function over policy parameter 'theta'
        self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)
        # KL divergence where first argument is fixed
        # First argument would be old policy parameters, so keep it constant
        kl_first_fixed = GAUSS_KL_FIRST_FIX(self.action_dist_mu, self.action_dist_logstd) / batch_size
        # Gradient of KL divergence
        first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)
        # Vectors we are going to multiply
        self.flat_tangent = tf.placeholder(tf.float32, [None])
        tangent = list()
        start = 0
        for vrbs in tr_vrbs:
            variable_size = np.prod(vrbs.get_shape().as_list())
            param = tf.reshape(self.flat_tangent[start:(start+variable_size)], vrbs.get_shape())
            tangent.append(param)
            start += variable_size
        '''
            Gradient of KL with tangent vector
            gradient_w_tangent : list of KL_prime*y for each variable
        '''
        gradient_w_tangent = [tf.reduce_sum(kl_g*t) for (kl_g, t) in zip(first_kl_grads, tangent)]
        '''
            From the derivative of KL_prime*y : [dKL/dx1, dKL/dx2, ...]*y
                y -> Ay, where A is an n-by-n matrix that is hard to form explicitly (numerically solving (n*n)*(n*1)),
                so first multiply the target 'y' into the gradient and then take the derivative
            'self.FVP' returns : [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y
            i.e. (second derivative of KL divergence)*y for each variable => y -> JMJy (Fisher vector product)
        '''
        self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)
        # Get actual parameter values
        self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')
        # To set parameter values
        self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')
        # GAE
        self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint)
    
        self.sess.run(tf.global_variables_initializer())        

    def train(self):
        batch_path = self.rollout()
        theta_prev = self.get_value()
        # Get advantage from gae
        advantage_estimated = self.gae.get_advantage(batch_path)

        # Put all paths in batch in a numpy array to feed to network as [batch size, action/observation size]
        # These batches come from the old policy, before theta is updated
        action_dist_mu = np.squeeze(np.concatenate([each_path["Action_mu"] for each_path in batch_path]))
        action_dist_logstd = np.squeeze(np.concatenate([each_path["Action_logstd"] for each_path in batch_path]))
        observation = np.squeeze(np.concatenate([each_path["Observation"] for each_path in batch_path]))
        action = np.squeeze(np.concatenate([each_path["Action"] for each_path in batch_path]))
        
        feed_dict = {self.obs : observation , self.action : np.expand_dims(np.squeeze(action),1), self.advantage : advantage_estimated, self.old_action_dist_mu : np.expand_dims(np.squeeze(action_dist_mu),1), self.old_action_dist_logstd : np.expand_dims(np.squeeze(action_dist_logstd),1)}
        # Computing fisher vector product : FIM * (policy gradient), y->Ay=JMJy
        def fisher_vector_product(gradient):
            feed_dict[self.flat_tangent] = gradient 
            return self.sess.run(self.FVP, feed_dict=feed_dict)

        policy_g = self.sess.run(self.pg, feed_dict=feed_dict)
        '''
            Linearizing the objective function gives : objective_gradient * (theta - theta_old) = g.transpose * delta
            Quadratic approximation of the KL constraint : 1/2 * (delta_transpose) * FIM * (delta)
            By the Lagrangian => FIM * delta = gradient
        '''
        # Solve Ax = g, where A is FIM and g is gradient of policy network parameter
        # Compute a search direction(delta) by conjugate gradient algorithm
        search_direction = CONJUGATE_GRADIENT(fisher_vector_product, -policy_g)

        # KL divergence approximated by 1/2*(delta_transpose)*FIM*(delta)
        # FIM*(delta) can be computed by fisher_vector_product
        # a.dot(b) = a.transpose * b
        kl_approximated = 0.5*search_direction.dot(fisher_vector_product(search_direction))
        # beta
        maximal_step_length = np.sqrt(self.args.kl_constraint / kl_approximated)
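        # Since kl_approximated = 0.5 * s^T A s, this is the usual TRPO step size
        # beta = sqrt(2 * delta / s^T A s) with delta = kl_constraint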
        # beta*s
        full_step = maximal_step_length * search_direction

        def surrogate(theta):
            self.set_value(theta)
            return self.sess.run(self.losses[0], feed_dict=feed_dict)

        # Last, use a line search to ensure improvement of the surrogate objective and satisfaction of the KL constraint by manually controlling the parameter value
        # Start with the maximal step length and exponentially shrink until objective improves
        new_theta = LINE_SEARCH(surrogate, theta_prev, full_step, self.args.num_backtracking, name='Surrogate loss')
        # Update policy parameter theta 
        self.set_value(new_theta, update_info=1)

        # Update value function parameter
        # The policy update is performed using the old value function parameters
        self.gae.train()

        # After the update, store values in the log
        surrogate_after, kl_after, _ = self.sess.run(self.losses, feed_dict=feed_dict)  
        logs = {"Surrogate loss":surrogate_after, "KL_DIV":kl_after}
        logs["Total Step"] = sum([len(path["Reward"]) for path in batch_path])
        logs["Num episode"] = len([path["Reward"] for path in batch_path])
        logs["Total Sum"] = sum([sum(path["Reward"]) for path in batch_path])
        logs["Diff Sum"] = sum([path["Reward_diff"] for path in batch_path])
        logs["Episode_Avg_reward"] = logs["Total Sum"] / logs["Num episode"]
        logs["Episode_Avg_diff"] = logs["Diff Sum"] / logs["Num episode"]
        return logs


    # Make policy network given states
    def build_policy(self, states, name='Policy'):
        print('Initializing Policy network')
        with tf.variable_scope(name):
            h1 = LINEAR(states, self.args.hidden_size, name='h1')
            h1_nl = tf.nn.relu(h1)
            h2 = LINEAR(h1_nl, self.args.hidden_size, name='h2')
            h2_nl = tf.nn.relu(h2)
            h3 = LINEAR(h2_nl, self.action_size, name='h3')
            # The tf initializer has to be either a Tensor or a callable that takes (shape, dtype) arguments
            init = lambda shape, dtype, partition_info=None : 0.01*np.random.randn(*shape)
            # [1, action size] since it has to be constant through batch axis, log standard deviation
            action_dist_logstd = tf.get_variable('logstd', initializer=init, shape=[1, self.action_size])
        
        return h3, action_dist_logstd
        
    def act(self, obs):
        # Need to expand the first dimension (batch axis) to make [1, observation size]
        obs_expanded = np.expand_dims(obs, 0)
        action_dist_mu, action_dist_logstd = self.sess.run([self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs:obs_expanded})
        # Sample from gaussian distribution
        action = np.random.normal(loc=action_dist_mu, scale=np.exp(action_dist_logstd))
        # All shape would be [1, action size]
#       print(action)
        return action, action_dist_mu, action_dist_logstd

    def rollout(self):
        # Set tuning parameters to obtain adaptive regularization weight
        lambda_store = np.zeros(int(self.args.timesteps_per_batch))
        lambda_max = 6
        factor = 0.3
        
        paths = list()
        timesteps = 0
        counter = 0
        #self.num_epi = 0
        while timesteps < self.args.timesteps_per_batch:
            self.num_epi += 1
            # print('%d episode starts' % self.num_epi)
            obs, action, rewards, done, action_dist_mu, action_dist_logstd, reward_diff = [], [], [], [], [], [], []

            # Baseline reward using only control prior
            s0 = self.env.reset_inc()
            sp = np.copy(s0)
            reward_prior = 0.
            for i in range(self.args.max_path_length):
                a_prior = self.env.getPrior()
                sp, reward_p, done_p, _ = self.env.step(a_prior)
                reward_prior += reward_p
                if done_p:
                    break

                
            prev_obs = self.env.reset()
            ep_reward = 0.
            for i in range(self.args.max_path_length):
                # Make 'batch size' axis
                prev_obs = np.squeeze(prev_obs)
                prev_obs_expanded = np.expand_dims(prev_obs, 0)

                # Obtain regularization weight using TD-error
                if (i > 0 and self.num_epi > 40):
                    # Obtain TD-error
                    base_v = self.gae.predict(old_obs[np.newaxis,:])
                    target_v = self.gae.predict(prev_obs[np.newaxis,:])
                    lambda_mix = lambda_max*(1 - np.exp(-factor*np.abs(reward_ + self.args.gamma*np.squeeze(target_v) - np.squeeze(base_v))))
                else:
                    self.lambda_actual = 5.
                    lambda_mix = self.lambda_actual
                if counter < len(lambda_store):
                    lambda_store[counter] = lambda_mix
                    counter += 1

                
                # Prior control
                a_prior = self.env.getPrior()

                # All have shape [1, action size]
                action_, action_dist_mu_, action_dist_logstd_ = self.act(prev_obs)
                                
                # Mix the actions (RL controller and control prior)
                act = action_/(1+self.lambda_actual) + (self.lambda_actual/(1+self.lambda_actual))*a_prior
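                # e.g. with lambda_actual = 5 the executed action is action_/6 + (5/6)*a_prior,
                # i.e. the larger the regularization weight, the more the control prior dominates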
                                
                # Take action
                #next_obs, reward_, done_, _ = self.env.step(action_)
                old_obs = prev_obs
                next_obs, reward_, done_, _ = self.env.step(act)
                ep_reward += reward_
                # Store observation                                                               
                obs.append(prev_obs_expanded)
                action.append(action_)
                action_dist_mu.append(action_dist_mu_)
                action_dist_logstd.append(action_dist_logstd_)
                done.append(done_)
                rewards.append(reward_)
                # print(prev_obs, action_, reward_, next_obs, done_)
                prev_obs = next_obs
                if done_:
                    # Build the path dictionary; each element has shape [None, observation size/action size]
                    path = {"Observation":np.concatenate(obs),
                    "Action":np.concatenate(action),
                    "Action_mu":np.concatenate(action_dist_mu),
                    "Action_logstd":np.concatenate(action_dist_logstd),
                    # [length,]
                    "Done":np.asarray(done),
                    "Reward":np.squeeze(np.asarray(rewards))[:,np.newaxis],
                    "Reward_diff":np.squeeze(np.asarray(ep_reward - reward_prior))}
                    paths.append(path)
                    #print('%d episode finish at %d steps' % (self.num_epi, i+1))
                    #print(self.lambda_actual)
                    break
            timesteps += len(rewards)
        # print('%d steps collected for batch' % timesteps)
        #print('%d episodes, %d steps is collected for batch' % (self.num_epi, timesteps))
        self.lambda_actual = np.mean(lambda_store)
        print(self.lambda_actual)
        return paths
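# --- Hedged sketch (not part of the original example) -----------------------
# train() relies on CONJUGATE_GRADIENT and LINE_SEARCH helpers that are not
# shown in this example. The NumPy sketches below illustrate the standard
# algorithms they presumably implement: conjugate gradient solves A x = g
# using only Fisher-vector products, and a backtracking line search shrinks
# the step until the (minimized) surrogate improves. Names, signatures and
# defaults are assumptions, not the authors' implementation.
import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    # Solve A x = g where A is only available through the product x -> fvp(x)
    x = np.zeros_like(g)
    r = g.copy()
    p = g.copy()
    r_dot = r.dot(r)
    for _ in range(iters):
        Ap = fvp(p)
        alpha = r_dot / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x

def line_search(f, theta_prev, full_step, num_backtracking=10):
    # Exponentially shrink the step until the objective f actually improves;
    # fall back to the previous parameters otherwise
    f_prev = f(theta_prev)
    for k in range(num_backtracking):
        theta_new = theta_prev + (0.5 ** k) * full_step
        if f(theta_new) < f_prev:
            return theta_new
    return theta_prev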
Example #9
0
    def build_model(self):
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.advantage = tf.placeholder(tf.float32, [None])
        # Mean of old action distribution
        self.old_action_dist_mu = tf.placeholder(tf.float32, [None, self.action_size])
        self.old_action_dist_logstd = tf.placeholder(tf.float32, [None, self.action_size])
        '''
        Mean value for each action : each action has a Gaussian distribution with a mean and a standard deviation
        With continuous state and action spaces, use a GAUSSIAN DISTRIBUTION that maps from the input features to the mean of the Gaussian distribution for each action
        A separate set of parameters specifies the log standard deviation of each action
        => The policy is defined by the normal distribution (mean=NeuralNet(states), stddev=exp(logstd))
        '''
        self.action_dist_mu, action_dist_logstd = self.build_policy(self.obs)
        # Make log standard shape from [1, action size] => [batch size, action size]
        # tf.tile(A, reps) : constructs a tensor by repeating A as given by 'reps'
        # Use tf.shape instead of tf.get_shape() when 'None' used in placeholder
        self.action_dist_logstd = tf.tile(action_dist_logstd, (tf.shape(action_dist_logstd)[0], 1))

        # outputs probability of taking 'self.action'
        # new distribution  
        self.log_policy = LOG_POLICY(self.action_dist_mu, self.action_dist_logstd, self.action)
        # old distribution
        self.log_old_policy = LOG_POLICY(self.old_action_dist_mu, self.old_action_dist_logstd, self.action)
        
        # Take exponential to log policy distribution
        '''
        Equation (14) in paper
        Contribution of a single s_n : expectation over a ~ q of [(new policy / q) * advantage_old]  (importance sampling)
        The sampling distribution q is normally the old policy
        '''
        batch_size = tf.shape(self.obs)[0]
        # print('Batch size %d' % batch_size)
        policy_ratio = tf.exp(self.log_policy - self.log_old_policy)
        surr_single_state = -tf.reduce_mean(policy_ratio * self.advantage)
        # tf.shape returns dtype=int32, tensor conversion requested dtype float32
        batch_size = tf.cast(batch_size, tf.float32)
        # Average KL divergence and Shannon entropy, averaged over the set of inputs to the function mu
        kl = GAUSS_KL(self.old_action_dist_mu, self.old_action_dist_logstd, self.action_dist_mu, self.action_dist_logstd) / batch_size
        ent = GAUSS_ENTROPY(self.action_dist_mu, self.action_dist_logstd) / batch_size

        self.losses = [surr_single_state, kl, ent]
        #tr_vrbs = tf.trainable_variables()
        tr_vrbs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Policy')
        for i in tr_vrbs:
            print(i.op.name)

        '''
            Compute a search direction using a linear approx to objective and quadratic approx to constraint
            => The search direction is computed by approximately solving 'Ax=g' where A is FIM
                Quadratic approximation to KL divergence constraint
        '''
        # Maximize surrogate function over policy parameter 'theta'
        self.pg = FLAT_GRAD(surr_single_state, tr_vrbs)
        # KL divergence where first argument is fixed
        # First argument would be old policy parameters, so keep it constant
        kl_first_fixed = GAUSS_KL_FIRST_FIX(self.action_dist_mu, self.action_dist_logstd) / batch_size
        # Gradient of KL divergence
        first_kl_grads = tf.gradients(kl_first_fixed, tr_vrbs)
        # Vectors we are going to multiply
        self.flat_tangent = tf.placeholder(tf.float32, [None])
        tangent = list()
        start = 0
        for vrbs in tr_vrbs:
            variable_size = np.prod(vrbs.get_shape().as_list())
            param = tf.reshape(self.flat_tangent[start:(start+variable_size)], vrbs.get_shape())
            tangent.append(param)
            start += variable_size
        '''
            Gradient of KL with tangent vector
            gradient_w_tangent : list of KL_prime*y for each variable
        '''
        gradient_w_tangent = [tf.reduce_sum(kl_g*t) for (kl_g, t) in zip(first_kl_grads, tangent)]
        '''
            From the derivative of KL_prime*y : [dKL/dx1, dKL/dx2, ...]*y
                y -> Ay, where A is an n-by-n matrix that is hard to form explicitly (numerically solving (n*n)*(n*1)),
                so first multiply the target 'y' into the gradient and then take the derivative
            'self.FVP' returns : [d2KL/dx1dx1 + d2KL/dx1dx2 + ..., d2KL/dx1dx2 + d2KL/dx2dx2 + ..., ...]*y
            i.e. (second derivative of KL divergence)*y for each variable => y -> JMJy (Fisher vector product)
        '''
        self.FVP = FLAT_GRAD(gradient_w_tangent, tr_vrbs)
        # Get actual parameter values
        self.get_value = GetValue(self.sess, tr_vrbs, name='Policy')
        # To set parameter values
        self.set_value = SetValue(self.sess, tr_vrbs, name='Policy')
        # GAE
        self.gae = GAE(self.sess, self.observation_size, self.args.gamma, self.args.lamda, self.args.vf_constraint)
    
        self.sess.run(tf.global_variables_initializer())        
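# --- Hedged sketch (not part of the original example) -----------------------
# build_model() above reshapes the flat placeholder 'flat_tangent' back into
# one tensor per trainable variable. A small NumPy illustration of that
# bookkeeping (the variable shapes here are made up for the example):
import numpy as np

shapes = [(3, 4), (4,), (4, 2)]                       # pretend variable shapes
flat = np.arange(sum(int(np.prod(s)) for s in shapes), dtype=np.float32)

tangent, start = [], 0
for shape in shapes:
    size = int(np.prod(shape))
    tangent.append(flat[start:start + size].reshape(shape))
    start += size

print([t.shape for t in tangent])                     # [(3, 4), (4,), (4, 2)]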
def main():
    # Get arguments parsed
    args = get_args()

    # Setup for logging
    output_dir = 'output/{}'.format(
        datetime.now(
            timezone('Asia/Hong_Kong')).strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3])
    create_dir(output_dir)
    LogHelper.setup(log_path='{}/training.log'.format(output_dir),
                    level_str='INFO')
    _logger = logging.getLogger(__name__)

    # Save the configuration for logging purpose
    save_yaml_config(args, path='{}/config.yaml'.format(output_dir))

    # Reproducibility
    set_seed(args.seed)

    # Get dataset
    dataset = SyntheticDataset(args.n, args.d, args.graph_type, args.degree,
                               args.sem_type, args.noise_scale,
                               args.dataset_type, args.x_dim)
    _logger.info('Finished generating dataset')

    model = GAE(args.n, args.d, args.x_dim, args.seed, args.num_encoder_layers,
                args.num_decoder_layers, args.hidden_size, args.latent_dim,
                args.l1_graph_penalty, args.use_float64)
    model.print_summary(print_func=model.logger.info)

    trainer = ALTrainer(args.init_rho, args.rho_thres, args.h_thres,
                        args.rho_multiply, args.init_iter, args.learning_rate,
                        args.h_tol, args.early_stopping,
                        args.early_stopping_thres)
    W_est = trainer.train(model, dataset.X, dataset.W, args.graph_thres,
                          args.max_iter, args.iter_step, output_dir)
    _logger.info('Finished training model')

    # Save raw recovered graph, ground truth and observational data after training
    np.save('{}/true_graph.npy'.format(output_dir), dataset.W)
    np.save('{}/observational_data.npy'.format(output_dir), dataset.X)
    np.save('{}/final_raw_recovered_graph.npy'.format(output_dir), W_est)

    # Plot raw recovered graph
    plot_recovered_graph(
        W_est,
        dataset.W,
        save_name='{}/raw_recovered_graph.png'.format(output_dir))

    _logger.info('Filter by constant threshold')
    W_est = W_est / np.max(np.abs(W_est))  # Normalize

    # Plot thresholded recovered graph
    W_est[np.abs(W_est) < args.graph_thres] = 0  # Thresholding
    plot_recovered_graph(
        W_est,
        dataset.W,
        save_name='{}/thresholded_recovered_graph.png'.format(output_dir))
    results_thresholded = count_accuracy(dataset.W, W_est)
    _logger.info('Results after thresholding by {}: {}'.format(
        args.graph_thres, results_thresholded))
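# --- Hedged sketch (not part of the original example) -----------------------
# The post-processing in main() normalizes the recovered weight matrix by its
# largest absolute entry and then zeroes every entry below graph_thres. A tiny
# NumPy illustration with made-up numbers:
import numpy as np

W_est = np.array([[0.0, 2.0], [-0.1, 0.8]])
graph_thres = 0.3

W_est = W_est / np.max(np.abs(W_est))                 # normalize to [-1, 1]
W_est[np.abs(W_est) < graph_thres] = 0                # drop weak edges
print(W_est)                                          # [[0.  1. ] [0.  0.4]]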