def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.env = env        
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau       

        # initialize networks 
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device)
        
        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param)
            
        # initialize optimizers 
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = Buffer(buffer_maxlen)
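
The loop above only performs a one-time hard copy of the value-network weights into the target value network; `tau` is then normally used for Polyak (soft) target updates during training. A minimal sketch of both operations, with illustrative helper names that are not part of this code base:

# Minimal sketch (not from this repo): hard copy at init, Polyak averaging afterwards.
import torch
import torch.nn as nn

def hard_update(target, source):
    # target <- source (used once, right after construction)
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target (used every learning step)
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)

value_net = nn.Linear(4, 1)
target_value_net = nn.Linear(4, 1)
hard_update(target_value_net, value_net)
soft_update(target_value_net, value_net, tau=5e-3)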
    def __init__(self, env_id, action_space, action_bound):

        self.env_id = env_id

        self.action_space = action_space

        self.action_bound = action_bound

        self.env = gym.make(self.env_id)

        self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)

        self.policy = GaussianPolicy(action_space=self.action_space,
                                     action_bound=self.action_bound)

        self.dualqnet = DualQNetwork()

        self.target_dualqnet = DualQNetwork()

        self.log_alpha = tf.Variable(0.)  # log_alpha = 0, so alpha = exp(log_alpha) = 1 initially

        self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)

        self.target_entropy = -0.5 * self.action_space

        self.global_steps = 0

        self._initialize_weights()
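
This agent keeps the temperature as `log_alpha` and tunes it toward `target_entropy`. A minimal sketch (not taken from this repository) of one common form of that update, using dummy log-probabilities:

# Minimal sketch: turning log_alpha into the temperature and pushing it toward
# target_entropy; the logp values below are dummies, not real policy outputs.
import tensorflow as tf

log_alpha = tf.Variable(0.0)                    # alpha = exp(log_alpha) = 1 initially
alpha_optimizer = tf.keras.optimizers.Adam(3e-4)
target_entropy = -0.5 * 2                       # e.g. action_space (action dim) = 2

logp = tf.constant([[-1.3], [-0.7]])            # dummy log pi(a|s) from the policy
with tf.GradientTape() as tape:
    alpha = tf.exp(log_alpha)
    alpha_loss = -tf.reduce_mean(alpha * tf.stop_gradient(logp + target_entropy))
grads = tape.gradient(alpha_loss, [log_alpha])
alpha_optimizer.apply_gradients(zip(grads, [log_alpha]))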
Example #3
    def __init__(self, env, param=None):
        super(PPO, self).__init__(env, param=param)
        self.name = 'PPO'
        self.critic = ValueFunction(self.param.value , self.device)
        self.actor = GaussianPolicy(self.param.policy, self.device)
        self.steps = 0
        self.episode_steps = 0

        if self.param.LR_SCHEDULE:
            schedule = lambda epoch: 1 - epoch/(self.param.evaluation['total_timesteps'] // self.param.BATCH_SIZE)
        else:
            schedule = lambda epoch: 1
        self.actor_scheduler = optim.lr_scheduler.LambdaLR(self.actor.optimizer, schedule)
        self.critic_scheduler = optim.lr_scheduler.LambdaLR(self.critic.optimizer, schedule)
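
With `LR_SCHEDULE` enabled, the lambda multiplies the base learning rate by `1 - epoch / total_updates`, i.e. a linear decay to zero over the planned number of updates. A small self-contained check of that behaviour (the numbers are illustrative):

# Minimal sketch: LambdaLR multiplies the base LR by schedule(epoch); with the
# linear schedule above the LR decays from the base value to ~0 over total_updates steps.
import torch
import torch.optim as optim

total_updates = 10                                   # stands in for total_timesteps // BATCH_SIZE
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adam(params, lr=3e-4)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 1 - epoch / total_updates)

for _ in range(3):
    optimizer.step()                                 # optimizer.step() before scheduler.step()
    scheduler.step()
print(optimizer.param_groups[0]['lr'])               # 3e-4 * (1 - 3/10)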
    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr,
                 buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy_net = GaussianPolicy(self.obs_dim,
                                         self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = Buffer(buffer_maxlen)
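
`action_range` is stored but the rescaling step is not shown in this snippet. A common pattern, stated here as an assumption rather than this repository's implementation, maps the tanh-squashed action from [-1, 1] into the environment's [low, high] box:

# Minimal sketch (assumption: actions come out of tanh in [-1, 1]):
# affine rescaling into the environment's [low, high] box.
import numpy as np

def rescale_action(action, low, high):
    # maps [-1, 1] -> [low, high] component-wise
    return low + 0.5 * (action + 1.0) * (high - low)

low, high = np.array([-2.0, -1.0]), np.array([2.0, 1.0])
print(rescale_action(np.array([0.0, 1.0]), low, high))   # -> [0. 1.]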
Example #5
def get_policy(args, env):
    N = env.observation_space.shape[0]
    M = env.action_space.shape[0]
    if args.init_policy == 'optimal':
        K = env.optimal_controller()
        mean_network = nn.Linear(*K.shape[::-1], bias=False)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'linear':
        K = np.random.randn(M, N)
        mean_network = nn.Linear(*K.shape[::-1], bias=False)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'linear_bias':
        K = np.random.randn(M, N)
        mean_network = nn.Linear(*K.shape[::-1], bias=True)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'mlp':
        mean_network = get_mlp((N, ) + tuple(args.hidden_sizes) + (M, ),
                               gate=nn.Tanh)
    else:
        raise Exception('unsupported policy type')
    return GaussianPolicy(N,
                          M,
                          mean_network,
                          learn_std=not args.fix_std,
                          gate_output=args.gate_output)
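
`nn.Linear(*K.shape[::-1], bias=False)` turns an M x N gain matrix K into a layer from R^N to R^M whose weight is exactly K, so the policy mean is K @ s. A quick numerical check (using `torch.tensor` in place of the repo's `tensor` helper):

# Minimal sketch: an M x N gain matrix K as a bias-free nn.Linear, so mean(s) = K @ s.
import numpy as np
import torch
import torch.nn as nn

M, N = 2, 3
K = np.random.randn(M, N)
mean_network = nn.Linear(*K.shape[::-1], bias=False)      # Linear(in_features=N, out_features=M)
mean_network.weight.data = torch.tensor(K, dtype=torch.float32)

s = np.random.randn(N)
with torch.no_grad():
    out = mean_network(torch.tensor(s, dtype=torch.float32)).numpy()
print(np.allclose(out, K @ s, atol=1e-5))                  # True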
Example #6
    def __init__(self, params):

        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']

        nn_params = params['nn_params']
        nn_params['nn_policy']['l1'][0] = state_size
        nn_params['nn_policy']['l3'][1] = action_size
        nn_params['nn_value_function']['l1'][0] = state_size

        self.__policy = GaussianPolicy(nn_params['nn_policy']).to(device)
        self.__value_fn = ValueFunction(
            nn_params['nn_value_function']).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__memory = Buffer(buf_params)

        self.gamma = params['gamma']
        self.learning_rate_policy = params['learning_rate_policy']
        self.learning_rate_value_fn = params['learning_rate_value_fn']
        self.tau = params['tau']

        self.updates_num = params['updates_num']
        self.ppo_epochs = params['ppo_epochs']
        self.baseline_epochs = params['baseline_epochs']
        self.ppo_eps = params['ppo_epsilon']

        self.__optimiser_policy = optim.Adam(self.__policy.parameters(),
                                             self.learning_rate_policy,
                                             weight_decay=1e-5)
        self.__optimiser_value_fn = optim.Adam(self.__value_fn.parameters(),
                                               self.learning_rate_value_fn)
        # other parameters
        self.agent_loss = 0.0
Example #7
    def __init__(self, env, param=None):
        super(TRPO, self).__init__(env, param=param)
        self.name = "TRPO"
        self.critic = ValueFunction(self.param.value, self.device)
        self.actor = GaussianPolicy(self.param.policy, self.device)
        self.steps = 0
Example #8
class AgentPPO:
    def __init__(self, params):

        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']

        nn_params = params['nn_params']
        nn_params['nn_policy']['l1'][0] = state_size
        nn_params['nn_policy']['l3'][1] = action_size
        nn_params['nn_value_function']['l1'][0] = state_size

        self.__policy = GaussianPolicy(nn_params['nn_policy']).to(device)
        self.__value_fn = ValueFunction(
            nn_params['nn_value_function']).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__memory = Buffer(buf_params)

        self.gamma = params['gamma']
        self.learning_rate_policy = params['learning_rate_policy']
        self.learning_rate_value_fn = params['learning_rate_value_fn']
        self.tau = params['tau']

        self.updates_num = params['updates_num']
        self.ppo_epochs = params['ppo_epochs']
        self.baseline_epochs = params['baseline_epochs']
        self.ppo_eps = params['ppo_epsilon']

        self.__optimiser_policy = optim.Adam(self.__policy.parameters(),
                                             self.learning_rate_policy,
                                             weight_decay=1e-5)
        self.__optimiser_value_fn = optim.Adam(self.__value_fn.parameters(),
                                               self.learning_rate_value_fn)
        # other parameters
        self.agent_loss = 0.0

    # Set methods
    def set_learning_rate(self, lr_policy, lr_value_fn):
        self.learning_rate_policy = lr_policy
        self.learning_rate_value_fn = lr_value_fn
        for param_group in self.__optimiser_policy.param_groups:
            param_group['lr'] = lr_policy
        for param_group in self.__optimiser_value_fn.param_groups:
            param_group['lr'] = lr_value_fn

    # Get methods
    def get_actor(self):
        return self.__policy

    def get_critic(self):
        return self.__value_fn

    # Other methods
    def step(self, state, action, reward, next_state, done, log_probs):
        self.__memory.add(state, action, reward, next_state, done, log_probs)
        if self.__memory.is_full():
            experience = self.__memory.get_data()
            self.__update(experience)

    def choose_action(self, state, mode='train'):
        if mode == 'train':
            # state should be transformed to a tensor
            state = torch.from_numpy(np.array(state)).float().to(device)
            self.__policy.eval()
            with torch.no_grad():
                actions, log_probs, mean, std = self.__policy.sample_action(
                    state)
            self.__policy.train()
            return (list(actions.cpu().numpy().squeeze()),
                    log_probs.cpu().numpy(),
                    mean.cpu().numpy(), std.cpu().numpy())
        elif mode == 'test':
            pass
        else:
            print("Invalid mode value")

    def reset(self, sigma):
        pass

    def __update(self, experience, batch_size=256):

        states, actions, rewards, next_states, dones, log_probs_old = list(
            experience)

        T = rewards.shape[0]
        last_return = torch.zeros(rewards.shape[1]).float().to(device)
        returns = torch.zeros(rewards.shape).float().to(device)

        for t in reversed(range(T)):
            last_return = rewards[t] + last_return * self.gamma * (1 -
                                                                   dones[t])
            returns[t] = last_return

        states = states.view(-1, self.__state_size)
        actions = actions.view(-1, self.__action_size)
        returns = returns.view(-1, 1)
        dones = dones.view(-1, 1)
        log_probs_old = log_probs_old.view(-1, 1)

        updates_num = states.shape[0] // batch_size
        # Critic update
        for _ in range(self.baseline_epochs):
            for _ in range(updates_num):

                idx = np.random.randint(0, states.shape[0], batch_size)
                states_batch = states[idx]
                returns_batch = returns[idx]

                self.__optimiser_value_fn.zero_grad()
                value_pred = self.__value_fn(states_batch).view(-1, 1)
                loss_fn = nn.MSELoss()
                value_loss = loss_fn(value_pred, returns_batch)
                value_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.__value_fn.parameters(),
                                               10)
                self.__optimiser_value_fn.step()

        # Policy update

        for _ in range(self.ppo_epochs):
            for _ in range(updates_num):

                idx = np.random.randint(0, states.shape[0], batch_size)
                states_batch = states[idx]
                actions_batch = actions[idx]
                returns_batch = returns[idx]
                log_probs_old_batch = log_probs_old[idx]

                advantages = (returns_batch -
                              self.__value_fn(states_batch).detach()).view(
                                  -1, 1)

                advantages = (advantages -
                              advantages.mean()) / advantages.std()

                self.__optimiser_policy.zero_grad()
                log_probs_batch = self.__policy.evaluate_actions(
                    states_batch, actions_batch).view(-1, 1)
                ratio = (log_probs_batch - log_probs_old_batch).exp()

                # clip the probability ratio (not ratio * advantages), per the PPO objective
                clipped_fn = torch.clamp(ratio, 1.0 - self.ppo_eps,
                                         1.0 + self.ppo_eps) * advantages
                surrogate_fn = torch.min(ratio * advantages, clipped_fn)

                # entropy = F.kl_div(log_probs_batch.view(-1, 1), log_probs_old_batch.view(-1, 1))

                policy_loss = -surrogate_fn.mean()

                policy_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.__policy.parameters(), 10)
                self.__optimiser_policy.step()
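
The surrogate above is the standard PPO clipped objective: the probability ratio is clipped to [1 - eps, 1 + eps] before being multiplied by the advantage, and the minimum with the unclipped term caps how far one update can move the policy. A toy illustration on scalar values:

# Minimal sketch: the PPO clipped surrogate L = min(r * A, clip(r, 1-eps, 1+eps) * A)
# on toy numbers; clipping the ratio (not ratio * advantage) is what caps the update.
import torch

eps = 0.2
ratio = torch.tensor([0.5, 1.0, 1.5])        # pi_new / pi_old
adv = torch.tensor([1.0, -2.0, 3.0])

clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * adv
surrogate = torch.min(ratio * adv, clipped)
print(surrogate)                              # tensor([ 0.5000, -2.0000,  3.6000])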
Example #9
def train(args):

    # Initialize data type
    dtype = torch.float32
    torch.set_default_dtype(dtype)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Initialize environment
    env = gym.make(args.env_id)
    envname = env.spec.id
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Initialize random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize neural nets
    policy = GaussianPolicy(obs_dim, act_dim, args.hidden_size, args.activation, args.logstd)
    value_net = Value(obs_dim, args.hidden_size, args.activation)
    cvalue_net = Value(obs_dim, args.hidden_size, args.activation)
    policy.to(device)
    value_net.to(device)
    cvalue_net.to(device)

    # Initialize optimizer
    pi_optimizer = torch.optim.Adam(policy.parameters(), args.pi_lr)
    vf_optimizer = torch.optim.Adam(value_net.parameters(), args.vf_lr)
    cvf_optimizer = torch.optim.Adam(cvalue_net.parameters(), args.cvf_lr)

    # Initialize learning rate scheduler
    lr_lambda = lambda it: max(1.0 - it / args.max_iter_num, 0)
    pi_scheduler = torch.optim.lr_scheduler.LambdaLR(pi_optimizer, lr_lambda=lr_lambda)
    vf_scheduler = torch.optim.lr_scheduler.LambdaLR(vf_optimizer, lr_lambda=lr_lambda)
    cvf_scheduler = torch.optim.lr_scheduler.LambdaLR(cvf_optimizer, lr_lambda=lr_lambda)

    # Store hyperparameters for log
    hyperparams = vars(args)

    # Initialize RunningStat for state normalization, score queue, logger
    running_stat = RunningStats(clip=5)
    score_queue = deque(maxlen=100)
    cscore_queue = deque(maxlen=100)
    logger = Logger(hyperparams)

    # Get constraint bounds
    cost_lim = get_threshold(envname, constraint=args.constraint)

    # Initialize and train FOCOPS agent
    agent = FOCOPS(env, policy, value_net, cvalue_net,
                   pi_optimizer, vf_optimizer, cvf_optimizer,
                   args.num_epochs, args.mb_size,
                   args.c_gamma, args.lam, args.delta, args.eta,
                   args.nu, args.nu_lr, args.nu_max, cost_lim,
                   args.l2_reg, score_queue, cscore_queue, logger)

    start_time = time.time()

    for iter in range(args.max_iter_num):

        # Update iteration for model
        agent.logger.save_model('iter', iter)

        # Collect trajectories
        data_generator = DataGenerator(obs_dim, act_dim, args.batch_size, args.max_eps_len)
        rollout = data_generator.run_traj(env, agent.policy, agent.value_net, agent.cvalue_net,
                                          running_stat, agent.score_queue, agent.cscore_queue,
                                          args.gamma, args.c_gamma, args.gae_lam, args.c_gae_lam,
                                          dtype, device, args.constraint)

        # Update FOCOPS parameters
        agent.update_params(rollout, dtype, device)

        # Update learning rates
        pi_scheduler.step()
        vf_scheduler.step()
        cvf_scheduler.step()

        # Update time and running stat
        agent.logger.update('time', time.time() - start_time)
        agent.logger.update('running_stat', running_stat)

        # Save and print values
        agent.logger.dump()
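
`RunningStats` itself is not shown here. As a rough sketch of what such a state normalizer with `clip=5` typically does (a Welford-style running mean/variance, an assumption rather than this repo's code):

# Minimal Welford-style sketch (an assumption about what RunningStats does, not its actual code):
# track a running mean/variance and return a clipped, standardized observation.
import numpy as np

class RunningStatsSketch:
    def __init__(self, clip=5.0):
        self.n, self.mean, self.m2, self.clip = 0, 0.0, 0.0, clip

    def __call__(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean = self.mean + delta / self.n
        self.m2 = self.m2 + delta * (x - self.mean)
        std = np.sqrt(self.m2 / self.n) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)

rs = RunningStatsSketch(clip=5)
for obs in np.random.randn(100, 3):
    normed = rs(obs)
print(normed.shape)   # (3,)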
Example #10
class TRPO(BaseRL, OnPolicy):
    def __init__(self, env, param=None):
        super(TRPO, self).__init__(env, param=param)
        self.name = "TRPO"
        self.critic = ValueFunction(self.param.value, self.device)
        self.actor = GaussianPolicy(self.param.policy, self.device)
        self.steps = 0

    def act(self, state, deterministic=False):
        self.steps += 1
        with torch.no_grad():
            if self.steps < self.param.DELAYED_START:
                action = self.env.action_space.sample()
            else:
                self.actor.eval()
                action = self.actor(torch.from_numpy(state).float().to(
                    self.device),
                                    deterministic=deterministic).cpu().numpy()
            next_state, reward, done, _ = self.env.step(action)
            if not deterministic:
                done_bool = float(done)  # if self.episode_steps < self.env._max_episode_steps else 0
                self.critic.eval()
                value, next_value = self.critic(
                    torch.from_numpy(np.stack([state, next_state
                                               ])).float().to(self.device))
                # value = self.critic(torch.from_numpy(state).float().to(self.device))
                # next_value = self.critic(torch.from_numpy(next_state).float().to(self.device))

                log_pi = self.actor.log_prob(
                    torch.from_numpy(state).float().to(self.device),
                    torch.from_numpy(action).float().to(self.device))
                self.memory.store(state, action, reward, next_state, done_bool,
                                  value, next_value, log_pi)
                if done:
                    self.memory.process_episode(
                        maximum_entropy=self.param.MAX_ENTROPY)
        return next_state, reward, done

    @OnPolicy.loop
    def learn(self):
        rollouts = self.onPolicyData
        returns = rollouts['returns_gae']
        if self.param.ADVANTAGE_NORMALIZATION:
            rollouts['advantages'] = (rollouts['advantages'] -
                                      rollouts['advantages'].mean()) / (
                                          rollouts['advantages'].std() + 1e-5)
        for _ in range(self.param.EPOCHS):
            # Compute Advantages
            for _ in range(self.param.VALUE_EPOCHS):
                # Update Critic
                values = self.critic(rollouts['states'])
                critic_loss = F.mse_loss(values, returns)
                self.critic.optimize(critic_loss)
            # Update Actor
            old_log_probs = self.actor.log_prob(rollouts['states'],
                                                rollouts['actions'])
            pg = self.policy_gradient(rollouts)
            npg = self.natural_gradient(pg, rollouts)
            parameters, pg_norm = self.linesearch(npg, pg, rollouts)
            self.optimize_actor(parameters)
            log_probs = self.actor.log_prob(rollouts['states'],
                                            rollouts['actions'])

        metrics = dict()
        with torch.no_grad():
            metrics['explained_variance'] = (
                1 -
                (rollouts['returns_mc'] - rollouts['values']).pow(2).sum() /
                (rollouts['returns_mc'] -
                 rollouts['returns_mc'].mean()).pow(2).sum()).item()
            metrics['entropy'] = self.actor.entropy(
                rollouts['states']).mean().item()
            metrics['kl'] = (old_log_probs - log_probs).mean()
            metrics['pg_norm'] = pg_norm
        return metrics

    ################################################################
    ########################## Utilities ###########################
    ################################################################
    def optimize_actor(self, new_parameters):
        vector_to_parameters(new_parameters, self.actor.parameters())

    def policy_gradient(self, rollouts):
        log_probs = self.actor.log_prob(rollouts['states'],
                                        rollouts['actions'])
        pg_objective = (log_probs * rollouts['advantages']).mean()
        pg_objective -= self.param.ENTROPY_COEFFICIENT * rollouts[
            'log_probs'].mean()
        return parameters_to_vector(
            torch.autograd.grad(pg_objective, self.actor.parameters()))

    def natural_gradient(self, pg, rollouts):
        def Hx(x):
            ''' Computes the Hessian-vector product for the KL-divergence '''
            d_kl = self.get_kl(self.actor, rollouts)
            grads = torch.autograd.grad(d_kl,
                                        self.actor.parameters(),
                                        create_graph=True)
            grads = parameters_to_vector(grads)
            Jx = torch.sum(grads * x)
            Hx = torch.autograd.grad(Jx, self.actor.parameters())
            Hx = parameters_to_vector(Hx)
            return Hx + self.param.CG_DAMPING * x

        stepdir = self.conjugate_gradient(Hx, pg, self.param.NUM_CG_ITER)
        stepsize = (2 * self.param.DELTA) / torch.dot(stepdir, Hx(stepdir))
        return torch.sqrt(stepsize) * stepdir

    def gae(self, rollouts):
        ''' Generalized Advantage Estimation '''
        states = torch.cat((rollouts.state, rollouts.next_state[-1:]))
        with torch.no_grad():
            values = self.critic(states).numpy()
        rewards = rollouts.reward.numpy()
        deltas = rewards + self.param.GAMMA * values[1:] - values[:-1]
        # rllab-style magic discounting
        returns = scipy.signal.lfilter([1], [1, float(-self.param.GAMMA)],
                                       rewards[::-1],
                                       axis=0).astype('float32')
        advantages = scipy.signal.lfilter(
            [1], [1, float(-self.param.GAMMA * self.param.LAMBDA)],
            deltas[::-1],
            axis=0).astype('float32')
        return torch.flip(torch.from_numpy(advantages),
                          dims=[0]), torch.flip(torch.from_numpy(returns),
                                                dims=[0])

    def get_kl(self, model, rollouts):
        ''' Computes the KL-divergence between the current policy and the model passed '''
        with torch.no_grad():
            p_old = self.actor.policy(rollouts['states'])
        p_new = model.policy(rollouts['states'])
        d_kl = kl_divergence(p_old, p_new).sum(dim=-1, keepdim=True).mean()
        return d_kl

    def conjugate_gradient(self, A, b, n):
        x = torch.zeros_like(b)
        r = b.clone()
        p = r.clone()
        rs = torch.dot(r, r)
        for i in range(n):
            if callable(A):
                Ap = A(p)
            else:
                Ap = torch.matmul(A, p)
            alpha = rs / torch.dot(p, Ap)
            x += alpha * p
            r -= alpha * Ap
            rs_next = torch.dot(r, r)
            beta = rs_next / rs
            p = r + beta * p
            rs = rs_next
            if rs < 1e-10:
                break
        return x

    def linesearch(self, npg, pg, rollouts):
        params_curr = parameters_to_vector(self.actor.parameters())
        for k in range(self.param.NUM_BACKTRACK):
            params_new = params_curr + self.param.ALPHA**k * npg
            model_new = deepcopy(self.actor)
            vector_to_parameters(params_new, model_new.parameters())
            param_diff = params_new - params_curr
            surr_loss = torch.dot(pg, param_diff)
            kl_div = self.get_kl(model_new, rollouts)
            if surr_loss >= 0 and kl_div <= self.param.DELTA:
                params_curr = params_new
                break
        return params_curr, (self.param.ALPHA**k * npg).norm()
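
`conjugate_gradient` solves A x = b for a symmetric positive-definite A given either as a matrix or as a matrix-vector function (here the damped Fisher-vector product `Hx`). The routine, reproduced standalone and checked against a small SPD system:

# Minimal sketch: the same conjugate-gradient routine, checked against a direct residual test
# on a small symmetric positive-definite system.
import torch

def conjugate_gradient(A, b, n):
    x = torch.zeros_like(b)
    r = b.clone()
    p = r.clone()
    rs = torch.dot(r, r)
    for _ in range(n):
        Ap = A(p) if callable(A) else torch.matmul(A, p)
        alpha = rs / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_next = torch.dot(r, r)
        p = r + (rs_next / rs) * p
        rs = rs_next
        if rs < 1e-10:
            break
    return x

M = torch.randn(5, 5)
A = M @ M.t() + 5 * torch.eye(5)             # SPD matrix
b = torch.randn(5)
x = conjugate_gradient(A, b, 25)
print(torch.allclose(A @ x, b, atol=1e-3))   # True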
Example #11
def compare_cost(args):
    set_seed(args.seed)
    env = LQR(
        #N=20,
        #M=12,
        init_scale=1.0,
        max_steps=args.H,  # 10, 20
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=0.0,
    )
    K = env.optimal_controller()
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1],
                            mean_network,
                            learn_std=False,
                            gate_output=False)

    # mc
    mc_costs = []  # individual
    mc_means = []  # cumulative
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        _, _, rewards, _, _ = rollout(env, policy, noises)
        mc_costs.append(-rewards.sum())
        mc_means.append(np.mean(mc_costs))

    # rqmc
    rqmc_costs = []
    rqmc_means = []
    rqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M,
                                  'trajwise')
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        _, _, rewards, _, _ = rollout(env, policy, rqmc_noises[i])
        rqmc_costs.append(-rewards.sum())
        rqmc_means.append(np.mean(rqmc_costs))

    # array rqmc
    arqmc_costs_dict = {}
    arqmc_means_dict = {}
    arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'ssj')
    #arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'array')

    for sorter in args.sorter:
        arqmc_costs = []
        arqmc_means = []
        sort_f = get_sorter(sorter, env)

        data = ArrayRQMCSampler(env, args.n_trajs,
                                sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            rewards = np.asarray(traj['rewards'])
            arqmc_costs.append(-rewards.sum())
            arqmc_means.append(np.mean(arqmc_costs))
        arqmc_costs_dict[sorter] = arqmc_costs
        arqmc_means_dict[sorter] = arqmc_means

    expected_cost = env.expected_cost(K, np.diag(np.ones(env.M)))

    mc_errors = np.abs(mc_means - expected_cost)
    rqmc_errors = np.abs(rqmc_means - expected_cost)
    arqmc_errors_dict = {
        sorter: np.abs(arqmc_means - expected_cost)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    logger.info('mc: {}, rqmc: {} '.format(mc_errors[-1], rqmc_errors[-1]) + \
        ' '.join(['arqmc ({}): {}'.format(sorter, arqmc_errors[-1]) for sorter, arqmc_errors in arqmc_errors_dict.items()]))
    info = {
        **vars(args), 'mc_costs': mc_costs,
        'rqmc_costs': rqmc_costs,
        'arqmc_costs': arqmc_costs
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors,
                     rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict,
                     info=info), f)
    if args.show_fig:
        data = pd.concat([
            pd.DataFrame({
                'name': 'mc',
                'x': np.arange(len(mc_errors)),
                'error': mc_errors,
            }),
            pd.DataFrame({
                'name': 'rqmc',
                'x': np.arange(len(rqmc_errors)),
                'error': rqmc_errors,
            }),
            pd.concat([
                pd.DataFrame({
                    'name': 'arqmc_{}'.format(sorter),
                    'x': np.arange(len(arqmc_errors)),
                    'error': arqmc_errors,
                }) for sorter, arqmc_errors in arqmc_errors_dict.items()
            ]),
        ])
        plot = sns.lineplot(x='x', y='error', hue='name', data=data)
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info
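
`mc_means` and `rqmc_means` are prefix means of the per-trajectory costs, recomputed with `np.mean` inside the loop (quadratic overall). An equivalent vectorized form:

# Minimal sketch: the running (prefix) mean of per-trajectory costs computed in one shot,
# equivalent to appending np.mean(costs_so_far) inside the loop.
import numpy as np

costs = np.random.randn(1000)
running_means = np.cumsum(costs) / np.arange(1, len(costs) + 1)
print(np.allclose(running_means[-1], costs.mean()))   # True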
Example #12
def compare_grad(args):
    set_seed(args.seed)
    env = LQR(
        N=args.xu_dim[0],
        M=args.xu_dim[1],
        lims=100,
        init_scale=1.0,
        max_steps=args.H,
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=args.noise,
    )
    #K = env.optimal_controller()
    K = np.random.randn(env.M, env.N)
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1],
                            mean_network,
                            learn_std=False,
                            gate_output=False)
    out_set = set()  # estimators whose rollouts terminated before the full horizon H

    Sigma_a = np.diag(np.ones(env.M))
    mc_grads = []
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        states, actions, rewards, _, _ = rollout(env, policy, noises)
        if len(states) < args.H:
            out_set.add('mc')
            break
        mc_grads.append(
            get_gaussian_policy_gradient(states, actions, rewards, policy,
                                         variance_reduced_loss))
    mc_grads = np.asarray(mc_grads)
    mc_means = np.cumsum(mc_grads, axis=0) / np.arange(
        1,
        len(mc_grads) + 1)[:, np.newaxis, np.newaxis]

    rqmc_grads = []
    #loc = torch.zeros(env.max_steps * env.M)
    #scale = torch.ones(env.max_steps * env.M)
    #rqmc_noises = Normal_RQMC(loc, scale).sample(torch.Size([args.n_trajs])).data.numpy()
    rqmc_noises = uniform2normal(
        random_shift(
            ssj_uniform(
                args.n_trajs,
                args.H * env.M,
            ).reshape(args.n_trajs, args.H, env.M),
            0,
        ))
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        states, actions, rewards, _, _ = rollout(
            env, policy, rqmc_noises[i].reshape(env.max_steps, env.M))
        if len(states) < args.H:
            out_set.add('rqmc')
            break
        rqmc_grads.append(
            get_gaussian_policy_gradient(states, actions, rewards, policy,
                                         variance_reduced_loss))
    rqmc_grads = np.asarray(rqmc_grads)
    rqmc_means = np.cumsum(rqmc_grads, axis=0) / np.arange(
        1,
        len(rqmc_grads) + 1)[:, np.newaxis, np.newaxis]

    arqmc_means_dict = {}
    #arqmc_noises = get_rqmc_noises(args.n_trajs, args.H, env.M, 'array')
    uniform_noises = ssj_uniform(args.n_trajs, env.M)  # n_trajs , action_dim
    arqmc_noises = uniform2normal(
        random_shift(np.expand_dims(uniform_noises, 1).repeat(args.H, 1),
                     0))  # n_trajs, horizon, action_dim
    for sorter in args.sorter:
        arqmc_grads = []
        sort_f = get_sorter(sorter, env, K)
        data = ArrayRQMCSampler(env, args.n_trajs,
                                sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            states, actions, rewards = np.asarray(traj['states']), np.asarray(
                traj['actions']), np.asarray(traj['rewards'])
            if len(states) < args.H:
                out_set.add('arqmc_{}'.format(sorter))
                break
            arqmc_grads.append(
                get_gaussian_policy_gradient(states, actions, rewards, policy,
                                             variance_reduced_loss))
        arqmc_grads = np.asarray(arqmc_grads)
        arqmc_means = np.cumsum(arqmc_grads, axis=0) / np.arange(
            1,
            len(arqmc_grads) + 1)[:, np.newaxis, np.newaxis]
        arqmc_means_dict[sorter] = arqmc_means

    expected_grad = env.expected_policy_gradient(K, Sigma_a)

    mc_errors = [np.nan] if 'mc' in out_set else ((
        mc_means - expected_grad)**2).reshape(mc_means.shape[0], -1).mean(
            1)  # why the sign is reversed?
    rqmc_errors = [np.nan] if 'rqmc' in out_set else (
        (rqmc_means -
         expected_grad)**2).reshape(rqmc_means.shape[0], -1).mean(1)
    arqmc_errors_dict = {
        sorter: [np.nan] if 'arqmc_{}'.format(sorter) in out_set else
        ((arqmc_means -
          expected_grad)**2).reshape(arqmc_means.shape[0], -1).mean(1)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    info = {
        **vars(args),
        'out': out_set,
        'expected_grad': expected_grad,
        'means': {
            'mc': mc_means,
            'rqmc': rqmc_means,
            **arqmc_means_dict,
        },
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors,
                     rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict,
                     info=info), f)
    if args.show_fig:
        mc_data = pd.DataFrame({
            'name': 'mc',
            'x': np.arange(len(mc_errors)),
            'error': mc_errors,
        })
        rqmc_data = pd.DataFrame({
            'name': 'rqmc',
            'x': np.arange(len(rqmc_errors)),
            'error': rqmc_errors,
        })
        arqmc_data = pd.concat([
            pd.DataFrame({
                'name': 'arqmc_{}'.format(sorter),
                'x': np.arange(len(arqmc_errors)),
                'error': arqmc_errors,
            }) for sorter, arqmc_errors in arqmc_errors_dict.items()
        ])
        plot = sns.lineplot(x='x',
                            y='error',
                            hue='name',
                            data=pd.concat([mc_data, rqmc_data, arqmc_data]))
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info
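
`ssj_uniform`, `random_shift` and `uniform2normal` are not shown; presumably they produce a low-discrepancy uniform point set, randomize it with a shift modulo 1, and map it to Gaussian noise through the inverse normal CDF. A sketch of that transformation under those assumptions:

# Minimal sketch (assumption about what random_shift / uniform2normal do):
# randomly shift uniform points modulo 1, then map them to Gaussians via the inverse CDF.
import numpy as np
from scipy import stats

u = np.random.rand(8, 4)                         # stand-in for an SSJ/Sobol uniform point set
shift = np.random.rand(4)
u_shifted = np.mod(u + shift, 1.0)               # randomization keeps each point U(0, 1)
z = stats.norm.ppf(u_shifted)                    # inverse-CDF transform to N(0, 1) noise
print(z.shape)                                   # (8, 4)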
class SACAgent:
    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr,
                 buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy_net = GaussianPolicy(self.obs_dim,
                                         self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = Buffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        _, _, next_zs, next_log_pi = self.policy_net.sample(next_states)
        next_actions = torch.tanh(next_zs)
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update q networks
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy network and target q networks
        _, _, new_zs, log_pi = self.policy_net.sample(states)
        new_actions = torch.tanh(new_zs)
        min_q = torch.min(self.q_net1.forward(states, new_actions),
                          self.q_net2.forward(states, new_actions))
        policy_loss = (self.alpha * log_pi - min_q).mean()
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # target networks
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

        # update temperature
        alpha_loss = (self.log_alpha *
                      (-log_pi - self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()
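
A hedged usage sketch of how this agent is typically driven; the environment id, the replay-buffer `push` method and the step counts are assumptions for illustration, not taken from this code base:

# Minimal usage sketch (assumptions: a Buffer.push(...) method, a Gym-style env with the
# classic 4-tuple step API, and hyperparameters chosen for illustration only).
import gym

env = gym.make("Pendulum-v1")
agent = SACAgent(env, gamma=0.99, tau=5e-3, alpha=0.2,
                 q_lr=3e-4, policy_lr=3e-4, a_lr=3e-4, buffer_maxlen=1_000_000)

state = env.reset()
for step in range(10_000):
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    agent.replay_buffer.push(state, action, reward, next_state, done)   # assumed API
    if step > 1_000:
        agent.update(batch_size=256)
    state = env.reset() if done else next_state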
class SAC(object):
    """Soft Actor-Critic algorithm

    [1] Haarnoja et al. (2018), "Soft Actor-Critic: Off-Policy Maximum Entropy Deep
        Reinforcement Learning with a Stochastic Actor"
    """
    def __init__(
        self,
        env,
        policy=None,

        # Learning models
        nets_hidden_sizes=(64, 64),
        nets_nonlinear_op='relu',
        use_q2=True,
        explicit_vf=False,

        # RL algorithm behavior
        total_episodes=10,
        train_steps=100,
        eval_rollouts=10,
        max_horizon=100,
        fixed_horizon=True,

        # Target models update
        soft_target_tau=5e-3,
        target_update_interval=1,

        # Replay Buffer
        replay_buffer_size=1e6,
        batch_size=64,
        discount=0.99,

        # Optimization
        optimization_steps=1,
        optimizer='adam',
        optimizer_kwargs=None,
        policy_lr=3e-4,
        qf_lr=3e-4,
        policy_weight_decay=1.e-5,
        q_weight_decay=1.e-5,

        # Entropy
        entropy_scale=1.,
        auto_alpha=True,
        max_alpha=10,
        min_alpha=0.01,
        tgt_entro=None,

        # Others
        norm_input_pol=False,
        norm_input_vfs=False,
        seed=610,
        render=False,
        gpu_id=-1,
    ):
        """Soft Actor-Critic algorithm.
        Args:
            env (gym.Env):  OpenAI-Gym-like environment with multigoal option.
            policy (torch.nn.module): A pytorch stochastic Gaussian Policy
            nets_hidden_sizes (list or tuple of int): Number of units in hidden layers for all the networks.
            use_q2 (bool): Use two parameterized Q-functions.
            explicit_vf (bool):
            total_episodes (int):
            train_steps (int):
            eval_rollouts (int):
            max_horizon (int):
            fixed_horizon (bool):
            soft_target_tau (float):
            target_update_interval (int):
            replay_buffer_size (int):
            batch_size (int):
            discount (float):
            optimization_steps (int):
            optimizer (str):
            optimizer_kwargs (dict):
            policy_lr (float):
            qf_lr (float):
            policy_weight_decay (float):
            q_weight_decay (float):
            entropy_scale (float):
            auto_alpha (int):
            max_alpha (float):
            min_alpha (float):
            tgt_entro (float):
            norm_input_pol (bool):
            norm_input_vfs (bool):
            seed (int):
            render (bool):
            gpu_id (int):
        """
        self.seed = seed
        np.random.seed(seed)
        torch.cuda.manual_seed(seed)
        torch.manual_seed(seed)

        self.env = env
        self.env.seed(seed)

        # Algorithm hyperparameters
        self.obs_dim = np.prod(env.observation_space.shape).item()
        self.action_dim = np.prod(env.action_space.shape).item()
        self.total_episodes = total_episodes
        self.train_steps = train_steps
        self.eval_rollouts = eval_rollouts
        self.max_horizon = max_horizon
        self.fixed_horizon = fixed_horizon
        self.render = render

        self.discount = discount

        self.soft_target_tau = soft_target_tau
        self.target_update_interval = target_update_interval

        self.norm_input_pol = norm_input_pol
        self.norm_input_vfs = norm_input_vfs

        # Policy Network
        if policy is None:
            self.policy = GaussianPolicy(
                self.obs_dim,
                self.action_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_pol,
            )
        else:
            self.policy = policy

        # Value Function Networks
        self.qf1 = QFunction(
            self.obs_dim,
            self.action_dim,
            nets_hidden_sizes,
            non_linear=nets_nonlinear_op,
            final_non_linear='linear',
            batch_norm=False,
            input_normalization=norm_input_vfs,
        )
        if use_q2:
            self.qf2 = QFunction(
                self.obs_dim,
                self.action_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
        else:
            self.qf2 = None

        if explicit_vf:
            self.vf = VFunction(
                self.obs_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
            self.target_vf = VFunction(
                self.obs_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
            self.target_vf.load_state_dict(self.vf.state_dict())
            self.target_vf.eval()
            self.target_qf1 = None
            self.target_qf2 = None
        else:
            self.vf = None
            self.target_vf = None
            self.target_qf1 = QFunction(
                self.obs_dim,
                self.action_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
            self.target_qf1.load_state_dict(self.qf1.state_dict())
            self.target_qf1.eval()
            if use_q2:
                self.target_qf2 = QFunction(
                    self.obs_dim,
                    self.action_dim,
                    nets_hidden_sizes,
                    non_linear=nets_nonlinear_op,
                    final_non_linear='linear',
                    batch_norm=False,
                    input_normalization=norm_input_vfs,
                )
                self.target_qf2.load_state_dict(self.qf2.state_dict())
                self.target_qf2.eval()
            else:
                self.target_qf2 = None

        # Replay Buffer
        self.replay_buffer = ReplayBuffer(
            max_size=int(replay_buffer_size),
            obs_dim=self.obs_dim,
            action_dim=self.action_dim,
        )
        self.batch_size = batch_size

        # Move models to GPU
        self.torch_device = \
            torch.device("cuda:" + str(gpu_id) if gpu_id >= 0 else "cpu")

        for model in self.trainable_models + self.non_trainable_models:
            model.to(device=self.torch_device)

        # Ensure non trainable models have fixed parameters
        for model in self.non_trainable_models:
            model.eval()
            # # TODO: Should we also set its parameters to requires_grad=False?
            # for param in model.parameters():
            #     param.requires_grad = False

        # ###### #
        # Alphas #
        # ###### #
        self.entropy_scale = torch.tensor(entropy_scale,
                                          device=self.torch_device)
        if tgt_entro is None:
            tgt_entro = -self.action_dim
        self.tgt_entro = torch.tensor(tgt_entro, device=self.torch_device)
        self._auto_alpha = auto_alpha
        self.max_alpha = max_alpha
        self.min_alpha = min_alpha
        self.log_alpha = torch.zeros(1,
                                     device=self.torch_device,
                                     requires_grad=True)

        # ########## #
        # Optimizers #
        # ########## #
        self.optimization_steps = optimization_steps
        if optimizer.lower() == 'adam':
            optimizer_class = torch.optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = torch.optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')

        # Values optimizer
        qvals_params = self.qf1.parameters()
        if self.qf2 is not None:
            qvals_params = chain(qvals_params, self.qf2.parameters())
        self.qvalues_optimizer = optimizer_class(qvals_params,
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)
        if self.vf is not None:
            self.vvalues_optimizer = optimizer_class(
                self.vf.parameters(),
                lr=qf_lr,
                weight_decay=q_weight_decay,
                **optimizer_kwargs)
        else:
            self.vvalues_optimizer = None

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self.policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Alpha optimizers
        self._alphas_optimizer = optimizer_class([self.log_alpha],
                                                 lr=policy_lr,
                                                 **optimizer_kwargs)

        # Internal variables
        self.num_train_interactions = 0
        self.num_train_steps = 0
        self.num_eval_interactions = 0
        self.num_episodes = 0

        # Log variables
        self.logging_qvalues_error = 0
        self.logging_vvalues_error = 0
        self.logging_policies_error = 0
        self.logging_entropy = torch.zeros(self.batch_size)
        self.logging_mean = torch.zeros((self.batch_size, self.action_dim))
        self.logging_std = torch.zeros((self.batch_size, self.action_dim))
        self.logging_eval_rewards = torch.zeros(self.eval_rollouts)
        self.logging_eval_returns = torch.zeros(self.eval_rollouts)

    @property
    def trainable_models(self):
        models = [self.policy, self.qf1]
        if self.qf2 is not None:
            models.append(self.qf2)

        if self.vf is not None:
            models.append(self.vf)

        return models

    @property
    def non_trainable_models(self):
        models = [self.target_qf1]
        if self.target_qf2 is not None:
            models.append(self.target_qf2)
        if self.target_vf is not None:
            models.append(self.target_vf)
        return models

    def train(self, init_episode=0):

        if init_episode == 0:
            # Eval and log
            self.eval()
            self.log(write_table_header=True)

        gt.reset()
        gt.set_def_unique(False)

        expected_accum_rewards = np.zeros(self.total_episodes)

        episodes_iter = range(init_episode, self.total_episodes)
        if not logger.get_log_stdout():
            # Fancy iterable bar
            episodes_iter = tqdm.tqdm(episodes_iter)

        for it in gt.timed_for(episodes_iter, save_itrs=True):
            # Put models in training mode
            for model in self.trainable_models:
                model.train()

            obs = self.env.reset()
            rollout_steps = 0
            for step in range(self.train_steps):
                if self.render:
                    self.env.render()
                interaction_info = interaction(
                    self.env,
                    self.policy,
                    obs,
                    device=self.torch_device,
                    deterministic=False,
                )
                self.num_train_interactions += 1
                rollout_steps += 1
                gt.stamp('sample')

                # Add data to replay_buffer
                self.replay_buffer.add_sample(**interaction_info)

                # Only train when there are enough samples from buffer
                if self.replay_buffer.available_samples() > self.batch_size:
                    for ii in range(self.optimization_steps):
                        self.learn()
                gt.stamp('train')

                # Reset environment if it is done
                if interaction_info['termination'] \
                        or rollout_steps > self.max_horizon:
                    obs = self.env.reset()
                    rollout_steps = 0
                else:
                    obs = interaction_info['next_obs']

            # Evaluate current policy to check performance
            expected_accum_rewards[it] = self.eval()

            self.log()

            self.num_episodes += 1

        return expected_accum_rewards

    def eval(self):
        """Evaluate deterministically the Gaussian policy.

        Returns:
            np.array: Expected accumulated reward

        """
        # Put models in evaluation mode
        for model in self.trainable_models:
            model.eval()

        for rr in range(self.eval_rollouts):
            rollout_info = rollout(
                self.env,
                self.policy,
                max_horizon=self.max_horizon,
                fixed_horizon=self.fixed_horizon,
                render=self.render,
                return_info_dict=True,
                device=self.torch_device,
                deterministic=True,
            )

            self.logging_eval_rewards[rr] = torch.tensor(
                rollout_info['reward']).mean()
            self.logging_eval_returns[rr] = torch.tensor(
                rollout_info['reward']).sum()

            self.num_eval_interactions += 1

        gt.stamp('eval')

        return self.logging_eval_returns.mean().item()

    def learn(self):
        """Improve the Gaussian policy with the Soft Actor-Critic algorithm.

        Returns:
            None

        """
        # Get batch from the replay buffer
        batch = self.replay_buffer.random_batch(self.batch_size,
                                                device=self.torch_device)
        # Get common data from batch
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        rewards = batch['rewards']
        terminations = batch['terminations']

        policy_prior_log_prob = 0.0  # Uniform prior  # TODO: Normal prior

        # Alphas
        alpha = self.entropy_scale * self.log_alpha.exp()

        # Actions for batch observation
        new_actions, policy_info = self.policy(obs,
                                               deterministic=False,
                                               return_log_prob=True)
        new_log_pi = policy_info['log_prob']
        new_mean = policy_info['mean']
        new_std = policy_info['std']

        # Actions for batch next_observation
        with torch.no_grad():
            next_actions, policy_info = self.policy(next_obs,
                                                    deterministic=False,
                                                    return_log_prob=True)
            next_log_pi = policy_info['log_prob']

        # ###################### #
        # Policy Evaluation Step #
        # ###################### #

        if self.target_vf is None:
            with torch.no_grad():
                # Estimate from target Q-value(s)
                # Q1_target(s', a')
                next_q1 = self.target_qf1(next_obs, next_actions)

                if self.target_qf2 is not None:
                    # Q2_target(s', a')
                    next_q2 = self.target_qf2(next_obs, next_actions)

                    # Minimum Unintentional Double-Q
                    next_q = torch.min(next_q1, next_q2)
                else:
                    next_q = next_q1

                # Vtarget(s')
                next_v = next_q - alpha * next_log_pi
        else:
            with torch.no_grad():
                # Vtarget(s')
                next_v = self.target_vf(next_obs)

        # Calculate Bellman Backup for Q-values
        q_backup = rewards + (1. - terminations) * self.discount * next_v

        # Prediction Q(s,a)
        q1_pred = self.qf1(obs, actions)
        # Critic loss: Mean Squared Bellman Error (MSBE)
        qf1_loss = \
            0.5 * torch.mean((q1_pred - q_backup) ** 2, dim=0).squeeze(-1)

        if self.qf2 is not None:
            q2_pred = self.qf2(obs, actions)
            # Critic loss: Mean Squared Bellman Error (MSBE)
            qf2_loss = \
                0.5 * torch.mean((q2_pred - q_backup)**2, dim=0).squeeze(-1)
        else:
            qf2_loss = 0

        self.qvalues_optimizer.zero_grad()
        qvalues_loss = qf1_loss + qf2_loss
        qvalues_loss.backward()
        self.qvalues_optimizer.step()

        # ####################### #
        # Policy Improvement Step #
        # ####################### #

        # TODO: Decide whether to use the minimum of q1 and q2; using new_q1 for now
        new_q1 = self.qf1(obs, new_actions)
        new_q = new_q1

        # Policy KL loss: - (E_a[Q(s, a) + H(.)])
        policy_kl_loss = -torch.mean(
            new_q - alpha * new_log_pi + policy_prior_log_prob, dim=0)
        policy_regu_loss = 0  # TODO: Could include regularization of the mean and std
        policy_loss = torch.sum(policy_kl_loss + policy_regu_loss)

        # Update both Intentional and Unintentional Policies at the same time
        self._policy_optimizer.zero_grad()
        policy_loss.backward()
        self._policy_optimizer.step()

        # ################################# #
        # (Optional) V-fcn improvement step #
        # ################################# #
        if self.vf is not None:
            v_pred = self.vf(obs)
            # Calculate Bellman Backup for Q-values
            v_backup = new_q - alpha * new_log_pi + policy_prior_log_prob
            v_backup.detach_()

            # Critic loss: Mean Squared Bellman Error (MSBE)
            vf_loss = \
                0.5 * torch.mean((v_pred - v_backup)**2, dim=0).squeeze(-1)
            self.vvalues_optimizer.zero_grad()
            vvalues_loss = vf_loss
            vvalues_loss.backward()
            self.vvalues_optimizer.step()

        # ####################### #
        # Entropy Adjustment Step #
        # ####################### #
        if self._auto_alpha:
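            # Temperature loss: -E[log_alpha * (log_pi(a|s) + target_entropy)].
            # Gradient descent on log_alpha raises alpha when the policy's
            # entropy (-log_pi) falls below the target and lowers it otherwise.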
            # NOTE: The original formulation uses alpha here, not log_alpha
            alphas_loss = -(
                self.log_alpha *
                (new_log_pi.squeeze(-1) + self.tgt_entro).mean(dim=0).detach())
            hiu_alphas_loss = alphas_loss.sum()
            self._alphas_optimizer.zero_grad()
            hiu_alphas_loss.backward()
            self._alphas_optimizer.step()
            self.log_alpha.data.clamp_(min=math.log(self.min_alpha),
                                       max=math.log(self.max_alpha))

        # ########################### #
        # Target Networks Update Step #
        # ########################### #
        if self.num_train_steps % self.target_update_interval == 0:
            if self.target_vf is None:
                soft_param_update_from_to(source=self.qf1,
                                          target=self.target_qf1,
                                          tau=self.soft_target_tau)
                if self.target_qf2 is not None:
                    soft_param_update_from_to(source=self.qf2,
                                              target=self.target_qf2,
                                              tau=self.soft_target_tau)
            else:
                soft_param_update_from_to(source=self.vf,
                                          target=self.target_vf,
                                          tau=self.soft_target_tau)
        # Always hard_update of input normalizer (if active)
        if self.norm_input_vfs:
            if self.target_vf is None:
                hard_buffer_update_from_to(
                    source=self.qf1,
                    target=self.target_qf1,
                )
                if self.target_qf2 is not None:
                    hard_buffer_update_from_to(
                        source=self.qf2,
                        target=self.target_qf2,
                    )
            else:
                hard_buffer_update_from_to(
                    source=self.vf,
                    target=self.target_vf,
                )

        # Increase internal counter
        self.num_train_steps += 1

        # ######## #
        # Log data #
        # ######## #
        self.logging_policies_error = policy_loss.item()
        self.logging_qvalues_error = qvalues_loss.item()
        self.logging_vvalues_error = vvalues_loss.item() \
            if self.target_vf is not None else 0.
        self.logging_entropy.data.copy_(-new_log_pi.squeeze(dim=-1).data)
        self.logging_mean.data.copy_(new_mean.data)
        self.logging_std.data.copy_(new_std.data)

    def save_training_state(self):
        """Save models

        Returns:
            None

        """
        models_dict = {
            'policy': self.policy,
            'qf1': self.qf1,
            'qf2': self.qf2,
            'target_qf1': self.target_qf1,
            'target_qf2': self.target_qf2,
            'vf': self.vf,
        }
        replaceable_models_dict = {
            'replay_buffer': self.replay_buffer,
        }
        logger.save_torch_models(self.num_episodes, models_dict,
                                 replaceable_models_dict)

    def load_training_state(self):
        pass

    def log(self, write_table_header=False):
        logger.log("Logging data in directory: %s" % logger.get_snapshot_dir())

        logger.record_tabular("Episode", self.num_episodes)

        logger.record_tabular("Accumulated Training Steps",
                              self.num_train_interactions)

        logger.record_tabular("Policy Error", self.logging_policies_error)
        logger.record_tabular("Q-Value Error", self.logging_qvalues_error)
        logger.record_tabular("V-Value Error", self.logging_vvalues_error)

        logger.record_tabular("Alpha", np_ify(self.log_alpha.exp()).item())
        logger.record_tabular("Entropy",
                              np_ify(self.logging_entropy.mean(dim=(0, ))))

        act_mean = np_ify(self.logging_mean.mean(dim=(0, )))
        act_std = np_ify(self.logging_std.mean(dim=(0, )))
        for aa in range(self.action_dim):
            logger.record_tabular("Mean Action %02d" % aa, act_mean[aa])
            logger.record_tabular("Std Action %02d" % aa, act_std[aa])

        # Evaluation Stats to plot
        logger.record_tabular("Test Rewards Mean",
                              np_ify(self.logging_eval_rewards.mean()))
        logger.record_tabular("Test Rewards Std",
                              np_ify(self.logging_eval_rewards.std()))
        logger.record_tabular("Test Returns Mean",
                              np_ify(self.logging_eval_returns.mean()))
        logger.record_tabular("Test Returns Std",
                              np_ify(self.logging_eval_returns.std()))

        # Add the previous times to the logger
        times_itrs = gt.get_times().stamps.itrs
        train_time = times_itrs.get('train', [0])[-1]
        sample_time = times_itrs.get('sample', [0])[-1]
        eval_time = times_itrs.get('eval', [0])[-1]
        epoch_time = train_time + sample_time + eval_time
        total_time = gt.get_times().total
        logger.record_tabular('Train Time (s)', train_time)
        logger.record_tabular('(Previous) Eval Time (s)', eval_time)
        logger.record_tabular('Sample Time (s)', sample_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)

        # Dump the logger data
        logger.dump_tabular(with_prefix=False,
                            with_timestamp=False,
                            write_header=write_table_header)
        # Save pytorch models
        self.save_training_state()
        logger.log("----")
Example #15
class PPO(BaseRL, OnPolicy):
    def __init__(self, env, param=None):
        super(PPO, self).__init__(env, param=param)
        self.name = 'PPO'
        self.critic = ValueFunction(self.param.value, self.device)
        self.actor = GaussianPolicy(self.param.policy, self.device)
        self.steps = 0
        self.episode_steps = 0

        if self.param.LR_SCHEDULE:
            schedule = lambda epoch: 1 - epoch/(self.param.evaluation['total_timesteps'] // self.param.BATCH_SIZE)
        else:
            schedule = lambda epoch: 1
        self.actor_scheduler = optim.lr_scheduler.LambdaLR(self.actor.optimizer, schedule)
        self.critic_scheduler = optim.lr_scheduler.LambdaLR(self.critic.optimizer, schedule)

    def act(self, state, deterministic=False):
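        # Step the environment once with a policy action (or a random action
        # before DELAYED_START); in training mode, also store the transition,
        # value estimates and log-probability in the on-policy memory.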
        self.steps += 1
        with torch.no_grad():
            s = torch.from_numpy(state).float().to(self.device)
            if self.steps < self.param.DELAYED_START:
                action = self.env.action_space.sample()
            else:
                self.actor.eval()
                action = self.actor(s, deterministic=deterministic).cpu().numpy() 
            a = torch.from_numpy(action).float().to(self.device)
            next_state, reward, done, _ = self.env.step(action)
           
            if not deterministic:
                done_bool = float(done)
                self.critic.eval()
                s_ = np.stack([state, next_state])
                s_ = torch.from_numpy(s_).float().to(self.device)
                value, next_value = self.critic(s_)
                log_pi = self.actor.log_prob(s, a)
                self.memory.store(state, action, reward, next_state, done_bool, value, next_value, log_pi)
                if done:
                    self.memory.process_episode(maximum_entropy=self.param.MAX_ENTROPY) 
        return next_state, reward, done
    
    @OnPolicy.loop
    def learn(self):
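        # One PPO update over the collected rollout: optional advantage
        # normalization, several epochs of clipped-surrogate minibatch SGD with
        # an entropy bonus, KL-based early stopping, then LR-scheduler steps.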
        pg_norm = 0
        rollouts = self.onPolicyData
        if self.param.ADVANTAGE_NORMALIZATION:
            rollouts['advantages'] = (rollouts['advantages'] - rollouts['advantages'].mean()) / (rollouts['advantages'].std() + 1e-5) 

        for _ in range(self.param.EPOCHS):
            generator = self.data_generator(rollouts)
            for mini_batch in generator:
                s, a, returns, old_values, old_log_probs, advantages = mini_batch
                # Critic Step
                self.critic.train()
                values = self.critic(s)
                if self.param.CLIPPED_VALUE:
                    critic_loss = self.clipped_value_loss(old_values, values, returns)
                else:
                    critic_loss = F.mse_loss(values, returns)
                self.critic.optimize(critic_loss)
                # Actor Step
                self.actor.train()
                log_probs = self.actor.log_prob(s,a)
                kl_div = (old_log_probs-log_probs).mean()   
                # Early Stopping            
                if self.param.EARLY_STOPPING and kl_div > 2 * self.param.MAX_KL_DIV:
                    # print('Early stopping at epoch {} due to reaching max kl.'.format(i))
                    break
                actor_loss = self.clipped_policy_objective(old_log_probs, log_probs, advantages)
                actor_loss -= self.param.ENTROPY_COEFFICIENT * log_probs.mean()
                actor_loss += self.param.CUTOFF_COEFFICIENT * (kl_div > 2 * self.param.MAX_KL_DIV) * (kl_div - self.param.MAX_KL_DIV)**2
                pg_norm += self.actor.optimize(actor_loss)
        self.critic_scheduler.step()
        self.actor_scheduler.step()
        metrics = dict()
        with torch.no_grad():
            metrics['explained_variance'] = (1 - (rollouts['returns_mc'] - rollouts['values']).pow(2).sum()/(rollouts['returns_mc']-rollouts['returns_mc'].mean() + 1e-5).pow(2).sum()).item()
            metrics['entropy'] = self.actor.entropy(rollouts['states']).mean().item()
            metrics['kl'] = kl_div.item()
            metrics['pg_norm'] = pg_norm
        return metrics

    ################################################################
    ########################## Utilities ###########################
    ################################################################

    def clipped_policy_objective(self, old_log_pi, log_pi, adv):
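        # Clipped surrogate objective: -E[min(r * A, clip(r, 1-CLIP, 1+CLIP) * A)]
        # with probability ratio r = exp(log_pi - old_log_pi); negated so it can
        # be minimized.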
        ratio = torch.exp(log_pi - old_log_pi)
        loss = ratio * adv
        clipped_loss = torch.clamp(ratio, 1 - self.param.CLIP,  1 + self.param.CLIP) * adv
        return -torch.min(loss, clipped_loss).mean()

    def clipped_value_loss(self, old_val, val, ret):
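        # PPO2-style clipped value loss: take the worse of the unclipped and
        # clipped squared errors, limiting how far the value estimate can move
        # away from old_val in a single update.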
        loss = (val - ret).pow(2)
        clipped_loss = ((old_val + torch.clamp(val - old_val, -self.param.CLIP, self.param.CLIP)) - ret).pow(2)
        return torch.max(loss, clipped_loss).mean()

    def data_generator(self, rollouts):
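        """Yield (states, actions, returns, values, log_probs, advantages).

        With NUM_MINI_BATCHES > 0 the rollout is split into shuffled
        mini-batches; otherwise the whole batch is yielded once.
        """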
        if self.param.NUM_MINI_BATCHES > 0:
            mini_batch_size = self.param.BATCH_SIZE // self.param.NUM_MINI_BATCHES
            random_sampler = SubsetRandomSampler(range(self.param.BATCH_SIZE))
            batch_sampler = BatchSampler(random_sampler, mini_batch_size, drop_last=True)
            for indices in batch_sampler:
                s = rollouts['states'][indices]
                a = rollouts['actions'][indices]
                ret = rollouts['returns_gae'][indices]
                val = rollouts['values'][indices]
                pi = rollouts['log_probs'][indices]
                adv = rollouts['advantages'][indices]
                yield s, a, ret, val, pi, adv
        else:
            s = rollouts['states']
            a = rollouts['actions']
            ret = rollouts['returns_gae']
            val = rollouts['values']
            pi = rollouts['log_probs']
            adv = rollouts['advantages']
            yield s, a, ret, val, pi, adv
    def __init__(
        self,
        env,
        policy=None,

        # Learning models
        nets_hidden_sizes=(64, 64),
        nets_nonlinear_op='relu',
        use_q2=True,
        explicit_vf=False,

        # RL algorithm behavior
        total_episodes=10,
        train_steps=100,
        eval_rollouts=10,
        max_horizon=100,
        fixed_horizon=True,

        # Target models update
        soft_target_tau=5e-3,
        target_update_interval=1,

        # Replay Buffer
        replay_buffer_size=1e6,
        batch_size=64,
        discount=0.99,

        # Optimization
        optimization_steps=1,
        optimizer='adam',
        optimizer_kwargs=None,
        policy_lr=3e-4,
        qf_lr=3e-4,
        policy_weight_decay=1.e-5,
        q_weight_decay=1.e-5,

        # Entropy
        entropy_scale=1.,
        auto_alpha=True,
        max_alpha=10,
        min_alpha=0.01,
        tgt_entro=None,

        # Others
        norm_input_pol=False,
        norm_input_vfs=False,
        seed=610,
        render=False,
        gpu_id=-1,
    ):
        """Soft Actor-Critic algorithm.
        Args:
            env (gym.Env):  OpenAI-Gym-like environment with multigoal option.
            policy (torch.nn.module): A pytorch stochastic Gaussian Policy
            nets_hidden_sizes (list or tuple of int): Number of units in hidden layers for all the networks.
            use_q2 (bool): Use two parameterized Q-functions.
            explicit_vf (bool):
            total_episodes (int):
            train_steps (int):
            eval_rollouts (int):
            max_horizon (int):
            fixed_horizon (bool):
            soft_target_tau (float):
            target_update_interval (int):
            replay_buffer_size (int):
            batch_size (int):
            discount (float):
            optimization_steps (int):
            optimizer (str):
            optimizer_kwargs (dict):
            policy_lr (float):
            qf_lr (float):
            policy_weight_decay (float):
            q_weight_decay (float):
            entropy_scale (float):
            auto_alpha (int):
            max_alpha (float):
            min_alpha (float):
            tgt_entro (float):
            norm_input_pol (bool):
            norm_input_vfs (bool):
            seed (int):
            render (bool):
            gpu_id (int):
        """
        self.seed = seed
        np.random.seed(seed)
        torch.cuda.manual_seed(seed)
        torch.manual_seed(seed)

        self.env = env
        self.env.seed(seed)

        # Algorithm hyperparameters
        self.obs_dim = np.prod(env.observation_space.shape).item()
        self.action_dim = np.prod(env.action_space.shape).item()
        self.total_episodes = total_episodes
        self.train_steps = train_steps
        self.eval_rollouts = eval_rollouts
        self.max_horizon = max_horizon
        self.fixed_horizon = fixed_horizon
        self.render = render

        self.discount = discount

        self.soft_target_tau = soft_target_tau
        self.target_update_interval = target_update_interval

        self.norm_input_pol = norm_input_pol
        self.norm_input_vfs = norm_input_vfs

        # Policy Network
        if policy is None:
            self.policy = GaussianPolicy(
                self.obs_dim,
                self.action_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_pol,
            )
        else:
            self.policy = policy

        # Value Function Networks
        self.qf1 = QFunction(
            self.obs_dim,
            self.action_dim,
            nets_hidden_sizes,
            non_linear=nets_nonlinear_op,
            final_non_linear='linear',
            batch_norm=False,
            input_normalization=norm_input_vfs,
        )
        if use_q2:
            self.qf2 = QFunction(
                self.obs_dim,
                self.action_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
        else:
            self.qf2 = None

        if explicit_vf:
            self.vf = VFunction(
                self.obs_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
            self.target_vf = VFunction(
                self.obs_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
            self.target_vf.load_state_dict(self.vf.state_dict())
            self.target_vf.eval()
            self.target_qf1 = None
            self.target_qf2 = None
        else:
            self.vf = None
            self.target_vf = None
            self.target_qf1 = QFunction(
                self.obs_dim,
                self.action_dim,
                nets_hidden_sizes,
                non_linear=nets_nonlinear_op,
                final_non_linear='linear',
                batch_norm=False,
                input_normalization=norm_input_vfs,
            )
            self.target_qf1.load_state_dict(self.qf1.state_dict())
            self.target_qf1.eval()
            if use_q2:
                self.target_qf2 = QFunction(
                    self.obs_dim,
                    self.action_dim,
                    nets_hidden_sizes,
                    non_linear=nets_nonlinear_op,
                    final_non_linear='linear',
                    batch_norm=False,
                    input_normalization=norm_input_vfs,
                )
                self.target_qf2.load_state_dict(self.qf2.state_dict())
                self.target_qf2.eval()
            else:
                self.target_qf2 = None

        # Replay Buffer
        self.replay_buffer = ReplayBuffer(
            max_size=int(replay_buffer_size),
            obs_dim=self.obs_dim,
            action_dim=self.action_dim,
        )
        self.batch_size = batch_size

        # Move models to GPU
        self.torch_device = \
            torch.device("cuda:" + str(gpu_id) if gpu_id >= 0 else "cpu")

        for model in self.trainable_models + self.non_trainable_models:
            model.to(device=self.torch_device)

        # Ensure non trainable models have fixed parameters
        for model in self.non_trainable_models:
            model.eval()
            # # TODO: Should we also set its parameters to requires_grad=False?
            # for param in model.parameters():
            #     param.requires_grad = False

        # ###### #
        # Alphas #
        # ###### #
        self.entropy_scale = torch.tensor(entropy_scale,
                                          device=self.torch_device)
        if tgt_entro is None:
            tgt_entro = -self.action_dim
        self.tgt_entro = torch.tensor(tgt_entro, device=self.torch_device)
        self._auto_alpha = auto_alpha
        self.max_alpha = max_alpha
        self.min_alpha = min_alpha
        self.log_alpha = torch.zeros(1,
                                     device=self.torch_device,
                                     requires_grad=True)

        # ########## #
        # Optimizers #
        # ########## #
        self.optimization_steps = optimization_steps
        if optimizer.lower() == 'adam':
            optimizer_class = torch.optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = torch.optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')

        # Values optimizer
        qvals_params = self.qf1.parameters()
        if self.qf2 is not None:
            qvals_params = chain(qvals_params, self.qf2.parameters())
        self.qvalues_optimizer = optimizer_class(qvals_params,
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)
        if self.vf is not None:
            self.vvalues_optimizer = optimizer_class(
                self.vf.parameters(),
                lr=qf_lr,
                weight_decay=q_weight_decay,
                **optimizer_kwargs)
        else:
            self.vvalues_optimizer = None

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self.policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Alpha optimizers
        self._alphas_optimizer = optimizer_class([self.log_alpha],
                                                 lr=policy_lr,
                                                 **optimizer_kwargs)

        # Internal variables
        self.num_train_interactions = 0
        self.num_train_steps = 0
        self.num_eval_interactions = 0
        self.num_episodes = 0

        # Log variables
        self.logging_qvalues_error = 0
        self.logging_vvalues_error = 0
        self.logging_policies_error = 0
        self.logging_entropy = torch.zeros(self.batch_size)
        self.logging_mean = torch.zeros((self.batch_size, self.action_dim))
        self.logging_std = torch.zeros((self.batch_size, self.action_dim))
        self.logging_eval_rewards = torch.zeros(self.eval_rollouts)
        self.logging_eval_returns = torch.zeros(self.eval_rollouts)
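
# Hedged usage sketch (not part of the original example): assuming the
# constructor above belongs to a class named `SAC` and that environment
# transitions are pushed into `self.replay_buffer` by a rollout method not
# shown here, a training loop could look roughly like this:
#
#   algo = SAC(env, total_episodes=50, train_steps=1000, eval_rollouts=5)
#   for episode in range(algo.total_episodes):
#       # ... collect transitions into algo.replay_buffer ...
#       for _ in range(algo.train_steps):
#           algo.learn()
#       algo.log(write_table_header=(episode == 0))
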
class SAC:

    MAX_EXPERIENCES = 100000

    MIN_EXPERIENCES = 512

    UPDATE_PERIOD = 4

    GAMMA = 0.99

    TAU = 0.005

    BATCH_SIZE = 256

    def __init__(self, env_id, action_space, action_bound):

        self.env_id = env_id

        self.action_space = action_space

        self.action_bound = action_bound

        self.env = gym.make(self.env_id)

        self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)

        self.policy = GaussianPolicy(action_space=self.action_space,
                                     action_bound=self.action_bound)

        self.duqlqnet = DualQNetwork()

        self.target_dualqnet = DualQNetwork()

        self.log_alpha = tf.Variable(0.)  #: alpha=1

        self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)

        self.target_entropy = -0.5 * self.action_space

        self.global_steps = 0

        self._initialize_weights()

    def _initialize_weights(self):
        """1度callすることでネットワークの重みを初期化
        """

        env = gym.make(self.env_id)

        dummy_state = env.reset()
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.action_space)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.policy(dummy_state)

        self.duqlqnet(dummy_state, dummy_action)
        self.target_dualqnet(dummy_state, dummy_action)
        self.target_dualqnet.set_weights(self.duqlqnet.get_weights())

    def play_episode(self):
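        # Roll out one episode with stochastic actions, push each transition
        # into the replay buffer, and run a network update every UPDATE_PERIOD
        # global steps once MIN_EXPERIENCES transitions have been collected.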

        episode_reward = 0

        local_steps = 0

        done = False

        state = self.env.reset()

        while not done:

            action, _ = self.policy.sample_action(np.atleast_2d(state))

            action = action.numpy()[0]

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)

            self.replay_buffer.push(exp)

            state = next_state

            episode_reward += reward

            local_steps += 1

            self.global_steps += 1

            if (len(self.replay_buffer) >= self.MIN_EXPERIENCES
               and self.global_steps % self.UPDATE_PERIOD == 0):

                self.update_networks()

        return episode_reward, local_steps, tf.exp(self.log_alpha)

    def update_networks(self):
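        # One SAC update: fit both Q-heads to the soft Bellman target, take a
        # policy gradient step on min(Q1, Q2) - alpha * log_pi, adjust the
        # temperature alpha towards the target entropy, then soft-update the
        # target critic.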

        (states, actions, rewards,
         next_states, dones) = self.replay_buffer.get_minibatch(self.BATCH_SIZE)

        alpha = tf.math.exp(self.log_alpha)

        #: Update Q-function
        next_actions, next_logprobs = self.policy.sample_action(next_states)

        target_q1, target_q2 = self.target_dualqnet(next_states, next_actions)

        target = rewards + (1 - dones) * self.GAMMA * (
            tf.minimum(target_q1, target_q2) - alpha * next_logprobs
            )

        with tf.GradientTape() as tape:
            q1, q2 = self.duqlqnet(states, actions)
            loss_1 = tf.reduce_mean(tf.square(target - q1))
            loss_2 = tf.reduce_mean(tf.square(target - q2))
            loss = 0.5 * loss_1 + 0.5 * loss_2

        variables = self.duqlqnet.trainable_variables
        grads = tape.gradient(loss, variables)
        self.duqlqnet.optimizer.apply_gradients(zip(grads, variables))

        #: Update policy
        with tf.GradientTape() as tape:
            selected_actions, logprobs = self.policy.sample_action(states)
            q1, q2 = self.duqlqnet(states, selected_actions)
            q_min = tf.minimum(q1, q2)
            loss = -tf.reduce_mean(q_min - alpha * logprobs)

        variables = self.policy.trainable_variables
        grads = tape.gradient(loss, variables)
        self.policy.optimizer.apply_gradients(zip(grads, variables))

        #: Adjust alpha
        entropy_diff = -1 * logprobs - self.target_entropy
        with tf.GradientTape() as tape:
            tape.watch(self.log_alpha)
            alpha_loss = tf.reduce_mean(tf.exp(self.log_alpha) * entropy_diff)

        grad = tape.gradient(alpha_loss, self.log_alpha)
        self.alpha_optimizer.apply_gradients([(grad, self.log_alpha)])

        #: Soft target update
        self.target_dualqnet.set_weights([
            (1 - self.TAU) * target_w + self.TAU * w
            for target_w, w in zip(self.target_dualqnet.get_weights(),
                                   self.duqlqnet.get_weights())
        ])

    def save_model(self):

        self.policy.save_weights("checkpoints/actor")

        self.duqlqnet.save_weights("checkpoints/critic")

    def load_model(self):

        self.policy.load_weights("checkpoints/actor")

        self.duqlqnet.load_weights("checkpoints/critic")

        self.target_dualqnet.load_weights("checkpoints/critic")

    def testplay(self, n=1, monitordir=None):

        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []

        for _ in range(n):

            state = env.reset()

            done = False

            total_reward = 0

            while not done:

                action, _ = self.policy.sample_action(np.atleast_2d(state))

                action = action.numpy()[0]

                next_state, reward, done, _ = env.step(action)

                total_reward += reward

                if done:
                    break
                else:
                    state = next_state

            total_rewards.append(total_reward)
            print(f"Test episode reward: {total_reward}")

        return total_rewards
class SACAgent:
  
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.env = env        
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau       

        # initialize networks 
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device)
        
        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param)
            
        # initialize optimizers 
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = Buffer(buffer_maxlen)
        
    def get_action(self, state):
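        # Sample a single tanh-squashed Gaussian action for exploration and
        # return it as a detached numpy array.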
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return action
    

    def update(self, batch_size):
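        # One SAC update: fit the value network and both soft Q-networks from a
        # replay batch, take a policy gradient step on min(Q1, Q2) - log_pi,
        # then soft-update the target value network.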
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)
        
        # Value target: V(s) <- min(Q1(s, a~), Q2(s, a~)) - log pi(a~|s), with
        # fresh actions a~ sampled from the policy at the *current* states
        _, _, sampled_zs, sampled_log_pi = self.policy_net.sample(states)
        sampled_actions = torch.tanh(sampled_zs)
        sampled_q1 = self.q_net1(states, sampled_actions)
        sampled_q2 = self.q_net2(states, sampled_actions)
        # Q targets bootstrap from the target value network at the next states
        next_v = self.target_value_net(next_states)
        
        # value loss
        v_target = torch.min(sampled_q1, sampled_q2) - sampled_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, v_target.detach())
        
        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())
        
        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()
        
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()
        
        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()
        
        _, _, new_zs, log_pi = self.policy_net.sample(states)
        new_actions = torch.tanh(new_zs)
        min_q = torch.min(
            self.q_net1.forward(states, new_actions),
            self.q_net2.forward(states, new_actions)
        )
        policy_loss = (log_pi - min_q).mean()
        
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
    
        # target networks
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
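
# Hedged usage sketch (assumption, not part of the original example): a minimal
# training loop for SACAgent, assuming the old Gym step API and that `Buffer`
# exposes `push()` and `__len__()`:
#
#   agent = SACAgent(env, gamma=0.99, tau=0.005, v_lr=3e-4, q_lr=3e-4,
#                    policy_lr=3e-4, buffer_maxlen=100000)
#   state = env.reset()
#   for step in range(total_steps):
#       action = agent.get_action(state)
#       next_state, reward, done, _ = env.step(action)
#       agent.replay_buffer.push(state, action, reward, next_state, done)
#       if len(agent.replay_buffer) > batch_size:
#           agent.update(batch_size)
#       state = env.reset() if done else next_state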
Example #19
    def __init__(
        self,
        state_shape,
        action_dim,
        max_action,
        save_freq,
        discount=0.99,
        tau=0.005,
        actor_freq=2,
        lr=3e-4,
        entropy_tune=False,
        seed=0,
    ):

        self.rng = PRNGSequence(seed)

        actor_input_dim = [((1, *state_shape), jnp.float32)]
        critic_input_dim = self.critic_input_dim = [
            ((1, *state_shape), jnp.float32),
            ((1, action_dim), jnp.float32),
        ]
        self.actor = None
        self.critic = None
        self.log_alpha = None
        self.entropy_tune = entropy_tune
        self.target_entropy = -action_dim

        self.adam = Optimizers(
            actor=optim.Adam(learning_rate=lr),
            critic=optim.Adam(learning_rate=lr),
            log_alpha=optim.Adam(learning_rate=lr),
        )
        self.module = Modules(
            actor=GaussianPolicy.partial(action_dim=action_dim,
                                         max_action=max_action),
            critic=DoubleCritic.partial(),
            alpha=Constant.partial(start_value=1),
        )
        self.optimizer = None

        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_freq = actor_freq
        self.save_freq = save_freq

        self.total_it = 0
        self.model = None
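        # The nested helpers below build fresh flax parameter sets (optionally
        # from input shapes), wrap them in nn.Model objects, and let the agent
        # rebuild its actor/critic/alpha models and their Adam optimizers.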

        def new_params(module: nn.Module, shape=None):
            _, params = (module.init(next(self.rng)) if shape is None else
                         module.init_by_shape(next(self.rng), shape))
            return params

        def new_model(module: nn.Module, shape=None) -> nn.Model:
            return nn.Model(module, new_params(module, shape))

        def update_model(model: nn.Model, shape=None) -> nn.Model:
            return model.replace(params=new_params(model.module, shape))

        def reset_models() -> Models:
            if self.model is None:
                critic = new_model(self.module.critic, critic_input_dim)
                return Models(
                    actor=new_model(self.module.actor, actor_input_dim),
                    critic=critic,
                    target_critic=critic.replace(params=critic.params),
                    alpha=new_model(self.module.alpha),
                )
            else:
                critic = update_model(self.model.critic, critic_input_dim)
                return Models(
                    actor=update_model(self.model.actor, actor_input_dim),
                    critic=critic,
                    target_critic=critic.replace(params=critic.params),
                    alpha=update_model(self.model.alpha),
                )

        self.reset_models = reset_models

        def reset_optimizer(adam: Adam, model: nn.Model) -> Optimizer:
            return jax.device_put(adam.create(model))

        def reset_optimizers() -> Optimizers:
            return Optimizers(
                actor=reset_optimizer(self.adam.actor, self.model.actor),
                critic=reset_optimizer(self.adam.critic, self.model.critic),
                log_alpha=reset_optimizer(self.adam.log_alpha,
                                          self.model.alpha),
            )

        self.reset_optimizers = reset_optimizers
        self.i = 0