Example #1
    def __init__(self, model_config, action_range):
        super(ContGaussianPolicy, self).__init__()
        self.model = Model(model_config)

        action_low, action_high = action_range
        self.action_scale = torch.as_tensor((action_high - action_low) / 2,
                                            dtype=torch.float32)
        self.action_bias = torch.as_tensor((action_high + action_low) / 2,
                                           dtype=torch.float32)
Example #2
    def __init__(self,
                 policy_config,
                 value_config,
                 sa_config,
                 sas_config,
                 source_env,
                 target_env,
                 device,
                 log_dir="latest_runs",
                 memory_size=1e5,
                 warmup_games=10,
                 batch_size=64,
                 lr=0.0001,
                 gamma=0.99,
                 tau=0.003,
                 alpha=0.2,
                 ent_adj=False,
                 delta_r_scale=1.0,
                 s_t_ratio=10,
                 noise_scale=1.0,
                 target_update_interval=1,
                 n_games_til_train=1,
                 n_updates_per_train=1):
        super(DARC, self).__init__(policy_config, value_config, source_env,
                                   device, log_dir, memory_size, None,
                                   batch_size, lr, gamma, tau, alpha, ent_adj,
                                   target_update_interval, None,
                                   n_updates_per_train)
        self.delta_r_scale = delta_r_scale
        self.s_t_ratio = s_t_ratio
        self.noise_scale = noise_scale

        self.source_env = source_env
        self.target_env = target_env

        self.warmup_games = warmup_games
        self.n_games_til_train = n_games_til_train

        self.sa_classifier = Model(sa_config).to(self.device)
        self.sa_classifier_opt = Adam(self.sa_classifier.parameters(), lr=lr)
        self.sas_adv_classifier = Model(sas_config).to(self.device)
        self.sas_adv_classifier_opt = Adam(
            self.sas_adv_classifier.parameters(), lr=lr)

        self.source_step = 0
        self.target_step = 0
        self.source_memory = self.memory
        self.target_memory = ReplayBuffer(self.memory_size, self.batch_size)
Example #3
class ContGaussianPolicy(nn.Module):
    def __init__(self, model_config, action_range):
        super(ContGaussianPolicy, self).__init__()
        self.model = Model(model_config)

        action_low, action_high = action_range
        self.action_scale = torch.as_tensor((action_high - action_low) / 2,
                                            dtype=torch.float32)
        self.action_bias = torch.as_tensor((action_high + action_low) / 2,
                                           dtype=torch.float32)

    def forward(self, states):
        mu, log_std = self.model(states)
        log_std = torch.clamp(log_std, min=-20, max=2)
        return mu, log_std

    def sample(self, states):
        mus, log_stds = self.forward(states)
        stds = torch.exp(log_stds)

        normal_dists = distributions.Normal(mus, stds)
        outputs = normal_dists.rsample()
        tanh_outputs = torch.tanh(outputs)
        actions = self.action_scale * tanh_outputs + self.action_bias
        mean_actions = self.action_scale * torch.tanh(mus) + self.action_bias

        log_probs = normal_dists.log_prob(outputs)
        # https://arxiv.org/pdf/1801.01290.pdf appendix C
        log_probs -= torch.log(
            self.action_scale * (1 - tanh_outputs.pow(2)) + 1e-6)
        log_probs = log_probs.sum(1, keepdim=True)

        return actions, log_probs, mean_actions

    def to(self, *args, **kwargs):
        device = args[0]
        self.action_scale = self.action_scale.to(device)
        self.action_bias = self.action_bias.to(device)
        self.model = self.model.to(device)
        return super(ContGaussianPolicy, self).to(device)
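
The sampling path above implements a tanh-squashed Gaussian with the change-of-variables correction on the log-probability (see the appendix C reference in the comment). The standalone sketch below reproduces the same arithmetic on a toy batch outside the class; the dimensions and variable names are made up for illustration and are not part of the example's codebase.

import torch
from torch import distributions

# Toy batch: 4 states, 2-dimensional actions, unit action range.
mus, stds = torch.zeros(4, 2), torch.ones(4, 2)
action_scale, action_bias = 1.0, 0.0

normal = distributions.Normal(mus, stds)
u = normal.rsample()                                  # pre-squash sample
actions = action_scale * torch.tanh(u) + action_bias

# log pi(a|s) = log N(u) - sum_i log(scale * (1 - tanh(u_i)^2) + eps)
log_probs = normal.log_prob(u) - torch.log(
    action_scale * (1 - torch.tanh(u).pow(2)) + 1e-6)
log_probs = log_probs.sum(1, keepdim=True)
print(actions.shape, log_probs.shape)                 # (4, 2) and (4, 1)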
Example #4
    def __init__(self, model_config):
        super(DiscreteGaussianPolicy, self).__init__()
        self.model = Model(model_config)
Example #5
    def __init__(self, model_config):
        super().__init__()
        self.model = Model(model_config)
Example #6
class DARC(ContSAC):
    def __init__(self,
                 policy_config,
                 value_config,
                 sa_config,
                 sas_config,
                 source_env,
                 target_env,
                 device,
                 log_dir="latest_runs",
                 memory_size=1e5,
                 warmup_games=10,
                 batch_size=64,
                 lr=0.0001,
                 gamma=0.99,
                 tau=0.003,
                 alpha=0.2,
                 ent_adj=False,
                 delta_r_scale=1.0,
                 s_t_ratio=10,
                 noise_scale=1.0,
                 target_update_interval=1,
                 n_games_til_train=1,
                 n_updates_per_train=1):
        super(DARC, self).__init__(policy_config, value_config, source_env,
                                   device, log_dir, memory_size, None,
                                   batch_size, lr, gamma, tau, alpha, ent_adj,
                                   target_update_interval, None,
                                   n_updates_per_train)
        self.delta_r_scale = delta_r_scale
        self.s_t_ratio = s_t_ratio
        self.noise_scale = noise_scale

        self.source_env = source_env
        self.target_env = target_env

        self.warmup_games = warmup_games
        self.n_games_til_train = n_games_til_train

        self.sa_classifier = Model(sa_config).to(self.device)
        self.sa_classifier_opt = Adam(self.sa_classifier.parameters(), lr=lr)
        self.sas_adv_classifier = Model(sas_config).to(self.device)
        self.sas_adv_classifier_opt = Adam(
            self.sas_adv_classifier.parameters(), lr=lr)

        self.source_step = 0
        self.target_step = 0
        self.source_memory = self.memory
        self.target_memory = ReplayBuffer(self.memory_size, self.batch_size)

    def train_step(self, s_states, s_actions, s_rewards, s_next_states,
                   s_done_masks, *args):
        t_states, t_actions, _, t_next_states, _, game_count = args
        if not torch.is_tensor(s_states):
            s_states = torch.as_tensor(s_states,
                                       dtype=torch.float32).to(self.device)
            s_actions = torch.as_tensor(s_actions,
                                        dtype=torch.float32).to(self.device)
            s_rewards = torch.as_tensor(s_rewards[:, np.newaxis],
                                        dtype=torch.float32).to(self.device)
            s_next_states = torch.as_tensor(
                s_next_states, dtype=torch.float32).to(self.device)
            s_done_masks = torch.as_tensor(s_done_masks[:, np.newaxis],
                                           dtype=torch.float32).to(self.device)

            t_states = torch.as_tensor(t_states,
                                       dtype=torch.float32).to(self.device)
            t_actions = torch.as_tensor(t_actions,
                                        dtype=torch.float32).to(self.device)
            t_next_states = torch.as_tensor(
                t_next_states, dtype=torch.float32).to(self.device)

        with torch.no_grad():
            sa_inputs = torch.cat([s_states, s_actions], 1)
            sas_inputs = torch.cat([s_states, s_actions, s_next_states], 1)
            sa_logits = self.sa_classifier(
                sa_inputs +
                gen_noise(self.noise_scale, sa_inputs, self.device))
            sas_logits = self.sas_adv_classifier(
                sas_inputs +
                gen_noise(self.noise_scale, sas_inputs, self.device))
            sa_log_probs = torch.log(torch.softmax(sa_logits, dim=1) + 1e-12)
            sas_log_probs = torch.log(
                torch.softmax(sas_logits + sa_logits, dim=1) + 1e-12)

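            # The classifier log-probabilities approximate the dynamics log-ratio
            # log p_target(s'|s,a) - log p_source(s'|s,a), which DARC adds to the
            # source reward below (labels: 0 = source, 1 = target).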
            delta_r = (sas_log_probs[:, 1] - sas_log_probs[:, 0] -
                       sa_log_probs[:, 1] + sa_log_probs[:, 0])
            if game_count >= 2 * self.warmup_games:
                s_rewards = s_rewards + self.delta_r_scale * delta_r.unsqueeze(1)

        train_info = super(DARC, self).train_step(s_states, s_actions,
                                                  s_rewards, s_next_states,
                                                  s_done_masks)

        s_sa_inputs = torch.cat([s_states, s_actions], 1)
        s_sas_inputs = torch.cat([s_states, s_actions, s_next_states], 1)
        t_sa_inputs = torch.cat([t_states, t_actions], 1)
        t_sas_inputs = torch.cat([t_states, t_actions, t_next_states], 1)
        s_sa_logits = self.sa_classifier(
            s_sa_inputs +
            gen_noise(self.noise_scale, s_sa_inputs, self.device))
        s_sas_logits = self.sas_adv_classifier(
            s_sas_inputs +
            gen_noise(self.noise_scale, s_sas_inputs, self.device))
        t_sa_logits = self.sa_classifier(
            t_sa_inputs +
            gen_noise(self.noise_scale, t_sa_inputs, self.device))
        t_sas_logits = self.sas_adv_classifier(
            t_sas_inputs +
            gen_noise(self.noise_scale, t_sas_inputs, self.device))

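        # Classifier update: source transitions are labeled 0, target transitions 1.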
        loss_function = nn.CrossEntropyLoss()
        label_zero = torch.zeros((s_sa_logits.shape[0], ),
                                 dtype=torch.int64).to(self.device)
        label_one = torch.ones((t_sa_logits.shape[0], ),
                               dtype=torch.int64).to(self.device)
        classify_loss = loss_function(s_sa_logits, label_zero)
        classify_loss += loss_function(t_sa_logits, label_one)
        classify_loss += loss_function(s_sas_logits, label_zero)
        classify_loss += loss_function(t_sas_logits, label_one)

        self.sa_classifier_opt.zero_grad()
        self.sas_adv_classifier_opt.zero_grad()
        classify_loss.backward()
        self.sa_classifier_opt.step()
        self.sas_adv_classifier_opt.step()

        s_sa_acc = 1 - torch.argmax(s_sa_logits, dim=1).double().mean()
        s_sas_acc = 1 - torch.argmax(s_sas_logits, dim=1).double().mean()
        t_sa_acc = torch.argmax(t_sa_logits, dim=1).double().mean()
        t_sas_acc = torch.argmax(t_sas_logits, dim=1).double().mean()

        train_info['Loss/Classify Loss'] = classify_loss
        train_info['Stats/Avg Delta Reward'] = delta_r.mean()
        train_info['Stats/Avg Source SA Acc'] = s_sa_acc
        train_info['Stats/Avg Source SAS Acc'] = s_sas_acc
        train_info['Stats/Avg Target SA Acc'] = t_sa_acc
        train_info['Stats/Avg Target SAS Acc'] = t_sas_acc
        return train_info

    def train(self, num_games, deterministic=False):
        self.policy.train()
        self.twin_q.train()
        self.sa_classifier.train()
        self.sas_adv_classifier.train()
        for i in range(num_games):
            source_reward, source_step = self.simulate_env(
                i, "source", deterministic)

            if i < self.warmup_games or i % self.s_t_ratio == 0:
                target_reward, target_step = self.simulate_env(
                    i, "target", deterministic)
                self.writer.add_scalar('Target Env/Rewards', target_reward, i)
                self.writer.add_scalar('Target Env/N_Steps', target_step, i)
                print("TARGET: index: {}, steps: {}, total_rewards: {}".format(
                    i, target_step, target_reward))

            if i >= self.warmup_games:
                self.writer.add_scalar('Source Env/Rewards', source_reward, i)
                self.writer.add_scalar('Source Env/N_Steps', source_step, i)
                if i % self.n_games_til_train == 0:
                    for _ in range(source_step * self.n_updates_per_train):
                        self.total_train_steps += 1
                        s_s, s_a, s_r, s_s_, s_d = self.source_memory.sample()
                        t_s, t_a, t_r, t_s_, t_d = self.target_memory.sample()
                        train_info = self.train_step(s_s, s_a, s_r, s_s_, s_d,
                                                     t_s, t_a, t_r, t_s_, t_d,
                                                     i)
                        self.writer.add_train_step_info(train_info, i)
                    self.writer.write_train_step()

            print("SOURCE: index: {}, steps: {}, total_rewards: {}".format(
                i, source_step, source_reward))

    def simulate_env(self, game_count, env_name, deterministic):
        if env_name == "source":
            env = self.source_env
            memory = self.source_memory
        elif env_name == "target":
            env = self.target_env
            memory = self.target_memory
        else:
            raise ValueError("Env name not recognized: {}".format(env_name))

        total_rewards = 0
        n_steps = 0
        done = False
        state = env.reset()
        while not done:
            if game_count <= self.warmup_games:
                action = env.action_space.sample()
            else:
                action = self.get_action(state, deterministic)
            next_state, reward, done, _ = env.step(action)
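            # Transitions cut off by the time limit keep a mask of 1.0 (treated
            # as non-terminal); true terminals get float(not done) == 0.0.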
            done_mask = (1.0 if n_steps == env._max_episode_steps - 1 else
                         float(not done))

            memory.add(state, action, reward, next_state, done_mask)

            if env_name == "source":
                self.source_step += 1
            elif env_name == "target":
                self.target_step += 1
            n_steps += 1
            total_rewards += reward
            state = next_state
        return total_rewards, n_steps

    def save_model(self, folder_name):
        super(DARC, self).save_model(folder_name)

        path = 'saved_weights/' + folder_name
        torch.save(self.sa_classifier.state_dict(), path + '/sa_classifier')
        torch.save(self.sas_adv_classifier.state_dict(),
                   path + '/sas_adv_classifier')

    # Load model parameters
    def load_model(self, folder_name, device):
        super(DARC, self).load_model(folder_name, device)

        path = 'saved_weights/' + folder_name
        self.sa_classifier.load_state_dict(
            torch.load(path + '/sa_classifier',
                       map_location=torch.device(device)))
        self.sas_adv_classifier.load_state_dict(
            torch.load(path + '/sas_adv_classifier',
                       map_location=torch.device(device)))