Example #1
    def __init__(self,
                 state_size,
                 action_size,
                 batch_size=128,
                 gamma=0.99,
                 mean_lambda=1e-3,
                 std_lambda=1e-3,
                 z_lambda=0.0):

        # store dimensions and hyperparameters
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        # experience replay buffer (BUFFERSIZE is defined elsewhere)
        self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

        # regularization coefficients (lambdas)
        self.mean_lambda = mean_lambda
        self.std_lambda = std_lambda
        self.z_lambda = z_lambda

        # value network and its target copy
        self.current_value = Value(state_size).to(device)
        self.target_value = Value(state_size).to(device)

        # soft Q-network and stochastic policy network
        self.softQ = soft_Q(state_size, action_size)
        self.policy = Policy(state_size, action_size)

        # one Adam optimizer per network
        self.value_optimizer = optim.Adam(self.current_value.parameters(),
                                          lr=3e-4)
        self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)
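
The constructor keeps both a current and a target value network but does not show how the target is kept in sync. A common choice is a Polyak (soft) update after each learning step. The sketch below is not taken from this example's repository; the method name soft_update and the tau value are assumptions, and it is written as a method of the same (unshown) agent class.

    def soft_update(self, tau=5e-3):
        # Polyak averaging: target <- tau * current + (1 - tau) * target
        for target_param, param in zip(self.target_value.parameters(),
                                       self.current_value.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)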
Example #2
def trpo(args):
    env = gym.make(args.env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    env.seed(args.seed)
    torch.manual_seed(args.seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    
    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    
    reward_record = []
    global_steps = 0

    for i_episode in range(args.num_episode):
        memory = Memory()
        
        # sample data: single path method
        num_steps = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)
            
            reward_sum = 0
            for t in range(args.max_step_per_episode):
                action = select_single_action(policy_net, state)
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)
                
                mask = 0 if done else 1
                
                memory.push(state, action, mask, next_state, reward)
                
                if done:
                    break
                    
                state = next_state
                
            num_steps += (t + 1)
            global_steps += (t + 1)
            reward_record.append({'steps': global_steps, 'reward': reward_sum})

        batch = memory.sample()
        batch_size = len(memory)
        
        # update params
        rewards = Tensor(batch.reward)
        masks = Tensor(batch.mask)
        actions = Tensor(batch.action)
        states = Tensor(batch.state)
        values = value_net(states)
        
        returns = Tensor(batch_size)
        deltas = Tensor(batch_size)
        advantages = Tensor(batch_size)

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(batch_size)):
            returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values[i]
            # ref: https://arxiv.org/pdf/1506.02438.pdf (generalized advantage estimation)
            # notation follows the PPO paper
            advantages[i] = deltas[i] + args.gamma * args.lamda * prev_advantage * masks[i]

            prev_return = returns[i]
            prev_value = values[i]
            prev_advantage = advantages[i]
        advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)
            
        # optimize value network
        loss_func_args = (value_net, states, returns)
        old_loss, _ = get_value_loss(value_net.get_flat_params(), *loss_func_args)
        flat_params, opt_loss, opt_info = sciopt.fmin_l_bfgs_b(get_value_loss, 
            value_net.get_flat_params(), args=loss_func_args, maxiter=args.value_opt_max_iter)
        value_net.set_flat_params(flat_params)
        print('ValueNet optimization: old loss = {}, new loss = {}'.format(old_loss, opt_loss))

        # optimize policy network
        # 1. find the search direction for the policy parameters using conjugate gradient (CG)
        #       the direction can be found analytically: s = -A^{-1} g,
        #       where A is the Fisher information matrix (FIM) of the action probability distribution
        #       and g is the gradient of the surrogate loss \dfrac{\pi_\theta (a|s)}{q(a|s)} Q(s, a)
        #       (a sketch of the conjugate_gradient helper is given after this function)
        policy_net.set_old_loss(states, actions)
        loss = policy_net.get_loss(states, actions, advantages)
        g = torch.autograd.grad(loss, policy_net.parameters())
        flat_g = torch.cat([grad.view(-1) for grad in g]).data
        Av = lambda v: policy_net.kl_hessian_times_vector(states, v)
        step_dir = conjugate_gradient(Av, - flat_g, nsteps=args.cg_nsteps)

        # 2. find maximum stepsize along the search direction
        #       the problem: min g * x  s.t. 1/2 * x^T * A * x <= delta
        #       can be solved analytically with x = beta * s
        #       where beta = sqrt(2 delta / s^T A s)
        # s^T A s, used below to get the maximum step length beta = sqrt(2 * delta / s^T A s)
        sAs = (step_dir * Av(step_dir)).sum(0)
        beta = torch.sqrt(2 * args.max_kl / sAs)
        full_step = (beta * step_dir).data.numpy()

        # 3. do a line search along the found direction, with maximum change = full_step
        #       the maximum change is restricted by the KL divergence constraint;
        #       backtracking line search (a sketch of line_search is given after this function)
        get_policy_loss = lambda x: policy_net.get_loss(states, actions, advantages)
        old_loss = get_policy_loss(None)
        success, new_params = line_search(policy_net, get_policy_loss, full_step, flat_g)
        policy_net.set_flat_params(new_params)
        new_loss = get_policy_loss(None)
        print('PolicyNet optimization: old loss = {}, new loss = {}'.format(old_loss, new_loss))

        if i_episode % args.log_num_episode == 0:
            print('Finished episode: {} Reward: {}'.format(i_episode, reward_record[-1]['reward']))
            print('-----------------')
    
    policy_net.save_model_policy()
    value_net.save_model_value()
    return reward_record
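
The TRPO routine above relies on a conjugate_gradient helper that is not shown in this example. A standard matrix-free CG solver for A x = b, where Av(v) returns the Fisher-vector product A v (as the lambda Av above does), looks roughly like the sketch below; only the name and the (Av, b, nsteps) call pattern come from the example, and the residual tolerance is an assumption.

import torch

def conjugate_gradient(Av, b, nsteps, residual_tol=1e-10):
    # Solve A x = b using only matrix-vector products Av(v).
    x = torch.zeros_like(b)
    r = b.clone()          # residual b - A x (x starts at zero)
    p = b.clone()          # current search direction
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Ap = Av(p)
        alpha = rdotr / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x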
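
line_search is also referenced but not defined here. A typical TRPO backtracking line search shrinks the proposed step until the surrogate loss improves by an acceptable fraction of its first-order prediction. The sketch below only mirrors the call site (policy_net, loss function, full_step, flat_g) and the (success, new_params) return value; the backtracking schedule, accept_ratio, and the assumption that get_flat_params / set_flat_params exchange NumPy arrays are mine, not the example's.

import numpy as np

def line_search(policy_net, get_loss, full_step, flat_g,
                max_backtracks=10, accept_ratio=0.1):
    # Backtracking: try full_step, full_step/2, full_step/4, ... and accept the
    # first candidate whose actual improvement is a reasonable fraction of the
    # first-order predicted improvement -g^T (fraction * full_step).
    old_params = policy_net.get_flat_params()
    old_loss = float(get_loss(None))
    expected_improve = -float(np.dot(flat_g.numpy(), full_step))
    for fraction in 0.5 ** np.arange(max_backtracks):
        new_params = old_params + fraction * full_step
        policy_net.set_flat_params(new_params)
        actual_improve = old_loss - float(get_loss(None))
        if expected_improve > 0 and actual_improve / (fraction * expected_improve) > accept_ratio:
            return True, new_params
    # no acceptable step found: restore the original parameters
    policy_net.set_flat_params(old_params)
    return False, old_params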
Example #3
def main(args):
    policy = Cvae()
    policy_optimizer = optim.Adam(policy.parameters(), lr=args.lr_cvae)
    value_network = Value()
    value_optimizer = optim.Adam(value_network.parameters(), lr=args.lr_value)
    mse_loss = nn.MSELoss()

    env = gym.make('Acrobot-v1')

    time_str, trained_model = cvae_policy_train(env,
                                                policy,
                                                value_network,
                                                policy_optimizer,
                                                value_optimizer,
                                                mse_loss,
                                                args)

    # Test the trained model using argmax.
    env = gym.make('Acrobot-v1')
    if args.record:
        # Store results from different runs of the model separately.
        results_directory = ''.join(['/tmp', args.directory_name, '/test/', time_str, '_discounting_',
                                     str(args.gamma), '_update_frequency_', str(args.update_frequency),
                                     '_value_update_times_', str(args.value_update_times)])
        env = gym.wrappers.Monitor(env, results_directory)

    # discounted returns collected over the test episodes
    test_returns = []
    if not args.cuda:
        plt.ion()

    for i in range(args.test_time):
        state_ = env.reset()
        done = False
        cumulative_return = 0

        for timestep in range(0, 500):
            if not done:
                if not args.cuda:
                    env.render()
                state_ = th.from_numpy(state_.reshape(1, -1))
                state = Variable(state_, requires_grad=False).type(Tensor)
                padding = Variable(th.zeros(1, 3), requires_grad=False).type(Tensor)
                state_padded = th.cat([state, padding], 1)
                _, _, p = trained_model.forward(state_padded)
                action = th.max(p, 1)[1].data[0]
                next_state_, reward_, done, info_ = env.step(action)
                cumulative_return += (args.gamma ** timestep) * reward_
                state_ = next_state_

        test_returns.append(cumulative_return)

        print('====> Cumulative return: {}'.format(cumulative_return))

        plt.clf()
        plt.figure(1)
        plt.xlabel('episodes')
        plt.ylabel('cumulative returns')
        plt.plot(test_returns)
        plt.show()
        plt.savefig(''.join(['cvae/test/', time_str, '_discounting_',
                             str(args.gamma), '_update_frequency_', str(args.update_frequency),
                             '_value_update_times_', str(args.value_update_times)]) + '.png')

    if not args.cuda:
        plt.ioff()
        plt.close()

    env.close()