Example #1
    def forward(self,
                observation,
                reparameterize=True,
                deterministic=False,
                return_log_prob=False):
        """
        Forward pass.
        Assumes input is a torch tensor.

        :type observation: torch.Tensor
        """
        layer_input = observation
        for fc in self.fcs:
            layer_input = self.hidden_activation(fc(layer_input))
        network_output = self.output_activation(self.last_fc(layer_input))

        alpha = network_output[:, 0].unsqueeze(1) + EPSILON
        beta = network_output[:, 1].unsqueeze(1) + EPSILON
        distribution = Beta(alpha, beta)
        distribution_mean = distribution.mean
        if deterministic:
            sample = distribution_mean
        else:
            sample = distribution.rsample() if reparameterize else distribution.sample()
        # transform to range (min, max)
        action = self.min + self.max_min_difference * sample
        mean = self.min + self.max_min_difference * distribution_mean
        variance = self.max_min_difference_squared * distribution.variance
        std = torch.sqrt(variance)
        log_std = torch.log(std)
        log_prob = distribution.log_prob(sample)  # log-prob of the raw (0, 1) sample, before rescaling
        entropy = distribution.entropy()
        mean_action_log_prob = None
        pre_tanh_value = None
        return action, mean, log_std, log_prob, entropy, std, mean_action_log_prob, pre_tanh_value
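The forward pass above maps a Beta sample from (0, 1) into the action range with an affine transform; below is only a minimal standalone sketch of that rescaling, using made-up bounds rather than the class attributes.

    import torch
    from torch.distributions import Beta

    low, high = -2.0, 2.0                          # hypothetical action bounds
    dist = Beta(torch.tensor([2.0]), torch.tensor([5.0]))
    sample = dist.rsample()                        # reparameterized draw in (0, 1)
    action = low + (high - low) * sample           # affine map into (low, high)
    variance = (high - low) ** 2 * dist.variance   # Var(a*X + b) = a^2 * Var(X)
    std = torch.sqrt(variance)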
Example #2
    def trainmodel(self):

        s = torch.tensor(self.memory.buffer['s'],
                         dtype=torch.double).to(device)
        a = torch.tensor(self.memory.buffer['a'],
                         dtype=torch.double).to(device)
        #r = torch.tensor(self.memory.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
        s_ = torch.tensor(self.memory.buffer['s_'],
                          dtype=torch.double).to(device)
        #v = torch.tensor(self.memory.buffer['v'], dtype=torch.double).to(device).view(-1, 1)
        # bootstrap with the value of the last next-state (4 stacked 28x28 frames)
        input = s_[-1].view(1, 4, 28, 28)
        future_value = self.net(input)[1].item()
        adv, target_v = self.getgae(future_value)

        adv = torch.tensor(np.array(adv),
                           dtype=torch.double).to(device).view(-1, 1)
        target_v = torch.tensor(target_v,
                                dtype=torch.double).to(device).view(-1, 1)
        adv = (adv - adv.mean()) / (adv.std() + 1e-5)
        old_a_logp = torch.tensor(self.memory.buffer['a_logp'],
                                  dtype=torch.double).to(device).view(-1, 1)

        for _ in range(self.PPOepoch):
            for index in BatchSampler(
                    SubsetRandomSampler(range(self.memory.buffer_capacity)),
                    self.memory.batch_size, False):

                alpha, beta = self.net(s[index])[0]
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(a[index]).sum(dim=1)
                a_logp = a_logp.reshape(-1, 1)
                ratio = torch.exp(a_logp - old_a_logp[index])
                with torch.no_grad():
                    entrop = dist.entropy()

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(
                    self.net(s[index])[1], target_v[index])
                self.storeloss(action_loss, value_loss)
                action_loss = torch.clamp(action_loss, 0, 10)
                value_loss = torch.clamp(value_loss, 0, 10)
                loss = action_loss + 2. * value_loss - args.bound * entrop.mean()

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.net.parameters(),
                                         self.max_grad_norm)
                self.optimizer.step()

        torch.save(self.net.state_dict(), self.path_t7)
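Note that entrop is evaluated inside torch.no_grad() in the loop above, so the entropy bonus shifts the reported loss but contributes no gradient; a small standalone illustration of that behavior:

    import torch
    from torch.distributions import Beta

    alpha = torch.tensor([2.0], requires_grad=True)
    dist = Beta(alpha, torch.tensor([3.0]))
    with torch.no_grad():
        ent = dist.entropy()
    print(ent.requires_grad)  # False: no gradients flow back through this term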
Example #3
    def update(self, epochs, steps, total_obs, total_actions, advantage,
               real_values):

        total_obs_ = torch.from_numpy(total_obs).type(torch.FloatTensor)
        advantage_ = torch.from_numpy(advantage).type(torch.FloatTensor)
        real_values_ = torch.from_numpy(real_values).type(torch.FloatTensor)
        total_actions = torch.from_numpy(total_actions).type(torch.FloatTensor)

        for _ in range(epochs):
            inds = np.arange(steps)
            np.random.shuffle(inds)

            for t in range(steps):
                index = inds[t]

                alpha, beta, values_to_backprop = self.network(
                    total_obs_[index].unsqueeze(0))

                m = Beta(alpha, beta)
                action_taken_prob = m.log_prob(total_actions[index]).sum(
                    dim=1, keepdim=True)

                entropy = m.entropy()
                entropy = entropy.sum(dim=1)
                print(entropy)

                alpha, beta, _ = self.old_network(
                    total_obs_[index].unsqueeze(0))
                m_old = Beta(alpha, beta)
                old_action_taken_probs = m_old.log_prob(
                    total_actions[index]).sum(dim=1, keepdim=True)

                ratios = torch.exp(action_taken_prob - old_action_taken_probs)  # pi_new / pi_old

                surr1 = ratios * advantage_[index]
                surr2 = torch.clamp(ratios, min=(1. - .1),
                                    max=(1. + .1)) * advantage_[index]
                policy_loss = -torch.min(surr1, surr2)
                value_loss = ((values_to_backprop - real_values_[index])**2)
                #value_loss = F.smooth_l1_loss(values_to_backprop, real_values_[index])
                total_loss = policy_loss + value_loss - 0.01 * entropy
                print(total_loss)
                self.optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.network.parameters(), 0.5)
                self.optimizer.step()

        self.old_network.load_state_dict(self.dic_placeholder)
        self.dic_placeholder = self.network.state_dict()
        return (value_loss)
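Since the PPO ratio is pi_new(a)/pi_old(a) and the distributions return log-probabilities, the exponentiated difference used above can be sanity-checked in isolation with arbitrary Beta parameters:

    import torch
    from torch.distributions import Beta

    new_dist = Beta(torch.tensor([2.0]), torch.tensor([3.0]))
    old_dist = Beta(torch.tensor([1.5]), torch.tensor([3.5]))
    a = torch.tensor([0.4])
    ratio = torch.exp(new_dist.log_prob(a) - old_dist.log_prob(a))  # pi_new(a) / pi_old(a)
    assert torch.allclose(ratio, new_dist.log_prob(a).exp() / old_dist.log_prob(a).exp())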
Example #4
class CarlaImgPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[400, 300]):
        super(CarlaImgPolicy, self).__init__()
        self.main_actor = CarlaSimpleEncoder(latent_size=input_dim - 1)
        self.main_critic = CarlaSimpleEncoder(latent_size=input_dim - 1)
        actor_layer_size = [input_dim] + hidden_layer
        actor_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            actor_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            actor_feature_layers.append(nn.ReLU())
        self.actor = nn.Sequential(*actor_feature_layers)
        self.alpha_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                       nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(layer_init(nn.Linear(hidden_layer[-1], 1),
                                        gain=1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        speed = x[:, -1:]
        # image size in the CARLA driving task is 128x128
        x = x[:, :-1].view(-1, 3, 128, 128)
        x1 = self.main_actor(x)
        x1 = torch.cat([x1, speed], dim=1)

        x2 = self.main_critic(x)
        x2 = torch.cat([x2, speed], dim=1)

        actor_features = self.actor(x1)
        alpha = self.alpha_head(actor_features) + 1
        beta = self.beta_head(actor_features) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            action = (action + 1) / 2
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x2)
        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
Example #5
    def forward(self, s, g, greedy=False, action_logit=None):
        """Produce an action"""
        c0, c1 = self.action_stats(s, g)
        action_mode = (c0 - 1) / (c0 + c1 - 2)
        m = Beta(c0, c1)

        # Sample.
        if action_logit is None:
            if greedy:
                action_logit = action_mode
            else:
                action_logit = m.sample()

            n_ent = -m.entropy().mean()
            lprobs = m.log_prob(action_logit)
            action = self.scale_action(action_logit)
            return action, action_logit, lprobs, n_ent

        # Evaluate the action previously taken
        else:
            n_ent = -m.entropy().mean(dim=1)
            lprobs = m.log_prob(action_logit)
            action = self.scale_action(action_logit)
            return lprobs, n_ent, action
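The greedy branch above uses the Beta mode, (c0 - 1) / (c0 + c1 - 2), which is the density peak only when both concentrations exceed 1; a standalone comparison with arbitrary values:

    import torch
    from torch.distributions import Beta

    c0, c1 = torch.tensor([4.0]), torch.tensor([2.0])
    m = Beta(c0, c1)
    mode = (c0 - 1) / (c0 + c1 - 2)  # 0.75, the density peak (requires c0, c1 > 1)
    mean = m.mean                    # 4/6 ~ 0.667, the average of sampled actions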
Example #6
class BetaSeparatedPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[64, 64]):
        super(BetaSeparatedPolicy, self).__init__()
        actor_layer_size = [input_dim] + hidden_layer
        alpha_feature_layers = nn.ModuleList([])
        beta_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            alpha_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            alpha_feature_layers.append(nn.ReLU())
            beta_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            beta_feature_layers.append(nn.ReLU())
        self.alpha_body = nn.Sequential(*alpha_feature_layers)
        self.beta_body = nn.Sequential(*beta_feature_layers)
        self.alpha_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                       nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(nn.Linear(hidden_layer[-1], 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        alpha = self.alpha_head(self.alpha_body(x)) + 1
        beta = self.beta_head(self.beta_body(x)) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            action = (action + 1) / 2
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x)

        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
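CarlaImgPolicy and BetaSeparatedPolicy both keep the Beta distribution on (0, 1) and translate to the environment's [-1, 1] range only at the interface; a minimal standalone sketch of that round trip (parameters are arbitrary):

    import torch
    from torch.distributions import Beta

    dist = Beta(torch.tensor([3.0, 2.0]), torch.tensor([2.0, 4.0]))
    raw = dist.sample()               # in (0, 1), one entry per action dimension
    env_action = raw * 2 - 1          # what the environment receives, in (-1, 1)
    recovered = (env_action + 1) / 2  # inverse map when re-evaluating a stored action
    log_prob = dist.log_prob(recovered).sum(-1)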
Example #7
class MyDist(ActionDistribution):
    @staticmethod
    def required_model_output_shape(action_space, model_config):
        return 6

    def __init__(self, inputs, model):
        super(MyDist, self).__init__(inputs, model)
        self.dist = Beta(inputs[:, :3], inputs[:, 3:])

    def sample(self):
        self.sampled_action = self.dist.sample()
        return self.sampled_action

    def deterministic_sample(self):
        return self.dist.mean

    def sampled_action_logp(self):
        return self.logp(self.sampled_action)

    def logp(self, actions):
        return self.dist.log_prob(actions).sum(-1)

    # adapted from https://github.com/pytorch/pytorch/blob/master/torch/distributions/kl.py
    def kl(self, other):
        p, q = self.dist, other.dist
        sum_params_p = p.concentration1 + p.concentration0
        sum_params_q = q.concentration1 + q.concentration0
        t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + (
            sum_params_p).lgamma()
        t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + (
            sum_params_q).lgamma()
        t3 = (p.concentration1 - q.concentration1) * torch.digamma(
            p.concentration1)
        t4 = (p.concentration0 - q.concentration0) * torch.digamma(
            p.concentration0)
        t5 = (sum_params_q - sum_params_p) * torch.digamma(sum_params_p)
        return (t1 - t2 + t3 + t4 + t5).sum(-1)

    def entropy(self):
        return self.dist.entropy().sum(-1)
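The hand-written kl above follows the same closed form that PyTorch registers for a Beta/Beta pair, so it can be cross-checked against kl_divergence; a standalone sketch with arbitrary concentrations:

    import torch
    from torch.distributions import Beta, kl_divergence

    p = Beta(torch.tensor([2.0, 3.0]), torch.tensor([4.0, 1.5]))
    q = Beta(torch.tensor([1.2, 2.5]), torch.tensor([3.0, 2.0]))
    kl_builtin = kl_divergence(p, q).sum(-1)  # matches the t1 - t2 + t3 + t4 + t5 expression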
Example #8
    def run(self):
        updatestep = 0
        update = 0
        i_episode = 0

        while (update < 100000):
            self.lr = args.lr - (args.lr * (i_episode / float(10000)))
            i_episode = i_episode + 1
            observation = self.env.reset()
            step = 0
            observes_list, rewards, actions, values, old_log = [], [], [], [], []

            if updatestep > 2048:
                update = update + 1
                updatestep = 0
                if (args.usegae):
                    self.add_gae(self.trajectories, self.gamma, self.lam)
                else:
                    self.add_no_gae(self.trajectories, self.gamma)
                s, a, adv, old_a_logp, target_v, totalsize = self.gettraindata()
                minibatch = max(totalsize // args.numminibatch, 1)

                for _ in range(self.PPOepoch):
                    for index in BatchSampler(
                            SubsetRandomSampler(range(totalsize)), minibatch,
                            False):

                        alpha, beta = self.net(s[index])[0]
                        dist = Beta(alpha, beta)
                        a_logp = dist.log_prob(a[index]).sum(dim=1)
                        ratio = torch.exp(a_logp - old_a_logp[index])
                        with torch.no_grad():
                            entrop = dist.entropy()

                        surr1 = ratio * adv[index]
                        surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                            1.0 + self.clip_param) * adv[index]
                        action_loss = -torch.min(surr1, surr2).mean()
                        value_loss = F.mse_loss(
                            self.net(s[index])[1], target_v[index])
                        self.storeloss(action_loss, value_loss)
                        loss = action_loss + 0.5 * value_loss - 0.01 * entrop.mean()

                        self.optimizer.zero_grad()
                        loss.backward()
                        nn.utils.clip_grad_norm_(self.net.parameters(),
                                                 args.maxgradnorm)
                        self.optimizer.step()

                self.trajectories = []

            while (1):
                step = step + 1
                updatestep = updatestep + 1
                #self.env.render()

                observes = observation.astype(np.float32).reshape((1, -1))
                input = torch.tensor(observes,
                                     dtype=torch.double).to(device).reshape(
                                         -1, self.inputsize)
                (alpha, beta), v = self.net(input)
                dist = Beta(alpha, beta)
                action = dist.sample()
                a_logp = dist.log_prob(action.view(-1, 6)).sum(dim=1)
                a_logp = a_logp.item()

                old_log.append(a_logp)
                values.append(v.item())
                observes_list.append(observes)
                actions.append(action)

                action = action.squeeze().cpu().numpy()
                observation, reward, done, info = self.env.step(action * 2 - 1)
                rewards.append(reward)

                if done:
                    print("Episode finished after {} timesteps, rewards is {}".
                          format(step, sum(rewards)))
                    self.storereward(format(step))

                    trajectory = {
                        'observes': np.concatenate([t for t in observes_list]),
                        'actions':
                        np.concatenate([t.to('cpu') for t in actions]),
                        'rewards': np.array(rewards),
                        'values': np.array(values),
                        'old_log': np.array(old_log)
                    }

                    self.trajectories.append(trajectory)
                    break
Example #9
    def entropy(self, datas):
        alpha, beta = datas

        distribution = Beta(alpha, beta)
        return distribution.entropy().float().to(set_device(self.use_gpu))
Example #10
    def run(self):
        for i_episode in range(10000 * self.envpoch):
            observation = self.env.reset()
            step = 0
            observes_list = []
            rewards = []
            actions = []
            values = []
            old_log = []
            if i_episode % 20 == 19:
                self.add_gae(self.trajectories, self.gamma, self.lam)
                s, a, adv, old_a_logp, target_v, totalsize = self.gettraindata()
                minibatch = max(totalsize // args.numminibatch, 1)

                for _ in range(self.PPOepoch):
                    for index in BatchSampler(
                            SubsetRandomSampler(range(totalsize)), minibatch,
                            False):

                        alpha, beta = self.net(s[index])[0]
                        dist = Beta(alpha, beta)
                        a_logp = dist.log_prob(a[index]).sum(-1, keepdim=True)
                        ratio = torch.exp(a_logp - old_a_logp[index])
                        with torch.no_grad():
                            entrop = dist.entropy()

                        surr1 = ratio * adv[index]
                        surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                            1.0 + self.clip_param) * adv[index]
                        action_loss = -torch.min(surr1, surr2).mean()
                        value_loss = F.smooth_l1_loss(
                            self.net(s[index])[1], target_v[index])
                        self.storeloss(action_loss, value_loss)
                        action_loss = torch.clamp(action_loss, 0, 1)
                        value_loss = torch.clamp(value_loss, 0, 1)
                        loss = action_loss + 2. * value_loss - 0.01 * entrop.mean()

                        self.optimizer.zero_grad()
                        loss.backward()
                        nn.utils.clip_grad_norm_(self.net.parameters(), 5)
                        self.optimizer.step()

                self.trajectories = []

            while (1):
                step = step + 1
                #self.env.render()

                observes = observation.astype(np.float32).reshape((1, -1))
                observes = np.append(observes, [[step]], axis=1)
                input = torch.tensor(observes,
                                     dtype=torch.double).to(device).reshape(
                                         -1, 18)
                observes_list.append(observes)

                (alpha, beta), v = self.net(input)
                values.append(v.item())
                dist = Beta(alpha, beta)
                action = dist.sample()
                actions.append(action)

                a_logp = dist.log_prob(action.view(-1, 6)).sum(dim=1)
                action = action.squeeze().cpu().numpy()
                a_logp = a_logp.item()
                old_log.append(a_logp)

                observation, reward, done, info = self.env.step(action * 2 - 1)
                rewards.append(reward)

                if done:
                    print("Episode finished after {} timesteps".format(step))
                    self.storereward(format(step))
                    observes = observation.astype(np.float32).reshape((1, -1))
                    observes = np.append(observes, [[step + 1]], axis=1)
                    input = torch.tensor(
                        observes,
                        dtype=torch.double).to(device).reshape(-1, 18)
                    _, v = self.net(input)  # only the bootstrap value estimate is needed here
                    values.append(v.item())
                    obs = np.concatenate([t for t in observes_list])

                    acs = np.concatenate([t.to('cpu') for t in actions])
                    res = np.array(rewards)
                    vas = np.array(values[1:])
                    olog = np.array(old_log)
                    self.scaler.update(obs)
                    scale, offset = self.scaler.get()
                    scale[-1] = 1.0  # don't scale time step feature
                    offset[-1] = 0.0  # don't offset time step feature
                    obs = (obs - offset) * scale
                    trajectory = {
                        'observes': obs,
                        'actions': acs,
                        'rewards': res,
                        'values': vas,
                        'old_log': olog
                    }
                    self.trajectories.append(trajectory)
                    break
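Examples #8 and #10 call a GAE helper (add_gae) whose body is not shown here; the following is only a generic sketch of the usual GAE(lambda) recursion, not necessarily what that helper implements:

    import numpy as np

    def gae(rewards, values, last_value, gamma=0.99, lam=0.95):
        """Generalized Advantage Estimation over a single trajectory."""
        values = np.append(values, last_value)   # bootstrap with V of the final state
        adv = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + gamma * values[t + 1] - values[t]
            running = delta + gamma * lam * running
            adv[t] = running
        target_v = adv + values[:-1]             # returns used as the critic target
        return adv, target_v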
Example #11
    def update(self):
        """
        Update policy gradient by using old batch of experience.
        This happens when the buffer is full
        """
        self.training_step += 1
        s = torch.tensor(self.buffer['s'], dtype=torch.float).to(device)
        a = torch.tensor(self.buffer['a'], dtype=torch.float).to(device)
        r = torch.tensor(self.buffer['r'],
                         dtype=torch.float).to(device).view(-1, 1)
        s_ = torch.tensor(self.buffer['s_'], dtype=torch.float).to(device)
        if args.action_vec > 0:
            a_v = torch.tensor(self.buffer['a_v'],
                               dtype=torch.float).to(device)
        """
        print("Weights before update: ")
        for k, v in self.net.state_dict().items():
            print("Layer {}".format(k))
            print(v.sum())
            print(v.mean(), v.median())
            print(v.max(), v.min())
        """
        if args.vae and args.tb:
            z = self.net.get_z(s[0].unsqueeze_(0))
            dec2 = self.net.vae.decode(z).squeeze(0)
            imgs = torch.cat((dec2, s[0]), dim=2)
            img_grid = torchvision.utils.make_grid(imgs)
            writer.add_image("Encoder-Img", img_grid)

            #save_image(s[0].cpu(), 'test_img/' + args.title + "_" + str(args.ndim) + 'img_update_' + str(self.training_step) + '.png')

        old_a_logp = torch.tensor(self.buffer['a_logp'],
                                  dtype=torch.float).to(device).view(-1, 1)

        with torch.no_grad():  # Compute a vector with advantage terms
            if args.action_vec > 0:
                target_v = r + args.gamma * self.net((s_, a_v[:, 3:]))[1]
                adv = target_v - self.net((s, a_v[:, :-3]))[1]
            else:
                target_v = r + args.gamma * self.net(s_)[1]
                adv = target_v - self.net(s)[1]
            # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        for _ in range(self.ppo_epoch):
            # Compute update for mini batch
            for index in BatchSampler(
                    SubsetRandomSampler(range(self.buffer_capacity)),
                    self.batch_size, False):
                if args.action_vec > 0:
                    alpha, beta = self.net((s[index], a_v[index, :-3]))[0]
                else:
                    alpha, beta = self.net(s[index])[0]
                dist = Beta(alpha, beta)
                entropy = dist.entropy().mean()
                a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
                # importance ratio pi_new / pi_old for the clipped surrogate objective
                ratio = torch.exp(a_logp - old_a_logp[index])

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 +
                                    self.clip_param) * adv[index]  # Clip Ratio
                action_loss = -torch.min(surr1, surr2).mean()
                # Difference between prediction and real values
                if args.action_vec > 0:
                    value_loss = F.smooth_l1_loss(
                        self.net((s[index], a_v[index, :-3]))[1],
                        target_v[index])
                else:
                    value_loss = F.smooth_l1_loss(
                        self.net(s[index])[1], target_v[index])
                #loss = action_loss + 2. * value_loss
                # Loss with Entropy
                loss = action_loss + 2. * value_loss - 0.001 * entropy

                self.optimizer.zero_grad()
                loss.backward()
                # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
                self.optimizer.step()

        if args.vae:
            z = self.net.get_z(s[0].unsqueeze_(0))
            dec2 = self.net.vae.decode(z)
            save_image(
                dec2, 'test_img/' + args.title + "_" + str(args.ndim) +
                '_dec_final_' + str(self.training_step) + '.png')
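Example #11 builds a one-step advantage estimate, adv = r + gamma * V(s') - V(s), without GAE; in isolation, with dummy tensors standing in for the critic outputs:

    import torch

    gamma = 0.99
    r = torch.tensor([[1.0], [0.5]])         # rewards, shape (N, 1)
    v_s = torch.tensor([[0.8], [0.6]])       # V(s) from the critic (dummy values)
    v_s_next = torch.tensor([[0.7], [0.4]])  # V(s') from the critic (dummy values)
    target_v = r + gamma * v_s_next          # critic regression target
    adv = target_v - v_s                     # one-step advantage for the clipped objective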