Beispiel #1
0
    def forward(self, flat_obs, actions=None):
        """
        Compute the non-positive network output for tau-augmented inputs.

        The taus split off `flat_obs` are turned into an indicator tensor of
        width `self.max_tau + 1` (a 1 at every clamped tau index) which is
        concatenated with the observations and, when given, the actions.

        :param flat_obs: Input that `split_tau` separates into observations
            and taus.
        :param actions: Optional actions, appended after the tau indicator.
        :return: `-|self.last_fc(h)|`, so every entry is <= 0.
        """
        obs, taus = split_tau(flat_obs)
        batch_size = obs.size()[0]
        y_binary = ptu.FloatTensor(batch_size, self.max_tau + 1)
        y_binary.zero_()
        # Negative taus are clamped so that they index column 0.
        t = torch.clamp(taus.data.long(), min=0)
        y_binary.scatter_(1, t, 1)

        features = [obs, ptu.Variable(y_binary)]
        if actions is not None:
            features.append(actions)
        h = torch.cat(features, dim=1)

        for fc in self.fcs:
            h = self.hidden_activation(fc(h))
        return - torch.abs(self.last_fc(h))
Beispiel #2
0
    def test_epoch(
        self,
        epoch,
    ):
        """
        Evaluate the model on random test batches and log the results.

        Records the epoch, the mean criterion loss over all batches and the
        mean squared error of every output dimension, then dumps the
        tabular log.

        :param epoch: Value recorded under "test/epoch".
        """
        self.model.eval()
        num_dims = self.y_train.shape[1]
        batch_losses = []
        dim_losses = np.zeros((self.num_batches, num_dims))
        for batch_idx in range(self.num_batches):
            x_np, y_np = self.random_batch(
                self.X_test, self.y_test, batch_size=self.batch_size)
            x = ptu.Variable(ptu.from_numpy(x_np))
            y = ptu.Variable(ptu.from_numpy(y_np))
            predictions = self.model(x)
            batch_losses.append(self.criterion(predictions, y).data[0])
            # Squared error averaged over the batch, one value per dimension.
            errors = ptu.get_numpy(predictions - y)
            dim_losses[batch_idx] = np.mean(np.power(errors, 2), axis=0)

        logger.record_tabular("test/epoch", epoch)
        logger.record_tabular("test/loss", np.mean(np.array(batch_losses)))
        for dim in range(num_dims):
            logger.record_tabular("test/dim " + str(dim) + " loss",
                                  np.mean(dim_losses[:, dim]))
        logger.dump_tabular()
    def test_huber_loss_delta_3(self):
        """
        Check HuberLoss(3) on one error larger and one smaller than 3.
        """
        criterion = modules.HuberLoss(3)
        cases = [
            # |x - x_hat| = 5, expected 3 * (5 - 3 / 2)
            ([[0]], [[5]], [3 * (5 - 3 / 2)]),
            # |x - x_hat| = 2, expected 0.5 * 2 * 2
            ([[4]], [[6]], [0.5 * 2 * 2]),
        ]
        for x, x_hat, expected in cases:
            x_var = ptu.Variable(ptu.from_numpy(np.array(x)).float())
            x_hat_var = ptu.Variable(ptu.from_numpy(np.array(x_hat)).float())
            result = ptu.get_numpy(criterion(x_var, x_hat_var))
            self.assertNpAlmostEqual(np.array(expected), result)
Beispiel #4
0
    def train_epoch(self, epoch):
        """
        Run one epoch of optimizer steps on random training batches.

        Records the epoch, the mean criterion loss and the per-dimension
        mean squared error; this method does not dump the tabular log
        itself.

        :param epoch: Value recorded under "train/epoch".
        """
        self.model.train()
        num_dims = self.y_train.shape[1]
        batch_losses = []
        dim_losses = np.zeros((self.num_batches, num_dims))
        for batch_idx in range(self.num_batches):
            x_np, y_np = self.random_batch(
                self.X_train, self.y_train, batch_size=self.batch_size)
            x = ptu.Variable(ptu.from_numpy(x_np))
            y = ptu.Variable(ptu.from_numpy(y_np))

            self.optimizer.zero_grad()
            predictions = self.model(x)
            loss = self.criterion(predictions, y)
            loss.backward()
            self.optimizer.step()

            batch_losses.append(loss.data[0])
            # Uses the predictions made before this batch's optimizer step.
            errors = ptu.get_numpy(predictions - y)
            dim_losses[batch_idx] = np.mean(np.power(errors, 2), axis=0)

        logger.record_tabular("train/epoch", epoch)
        logger.record_tabular("train/loss", np.mean(np.array(batch_losses)))
        for dim in range(num_dims):
            logger.record_tabular("train/dim " + str(dim) + " loss",
                                  np.mean(dim_losses[:, dim]))
Beispiel #5
0
    def forward(self, flat_obs, actions=None):
        """
        Compute the non-positive network output for tau-augmented inputs.

        The taus split off `flat_obs` are encoded by `make_binary_tensor`
        and the encoding is concatenated with the observations and, when
        given, the actions.

        :param flat_obs: Input that `split_tau` separates into observations
            and taus.
        :param actions: Optional actions, appended after the tau encoding.
        :return: `-|self.last_fc(h)|`, so every entry is <= 0.
        """
        obs, taus = split_tau(flat_obs)
        batch_size = taus.size()[0]
        y_binary = make_binary_tensor(taus, len(self.max_tau), batch_size)

        features = [obs, ptu.Variable(y_binary)]
        if actions is not None:
            features.append(actions)
        h = torch.cat(features, dim=1)

        for fc in self.fcs:
            h = self.hidden_activation(fc(h))
        return - torch.abs(self.last_fc(h))
Beispiel #6
0
    def __init__(self, matrix_input_size, vector_size):
        """
        Build the linear layer and the masks used for the square matrix.

        :param matrix_input_size: Input size of the linear layer `self.L`.
        :param vector_size: Side length of the square matrix; `self.L`
            outputs `vector_size ** 2` values.
        """
        super().__init__()
        self.vector_size = vector_size

        self.L = nn.Linear(matrix_input_size, vector_size**2)
        # Shrink the default initialization by a factor of 10.
        self.L.weight.data.mul_(0.1)
        self.L.bias.data.mul_(0.1)
        # The diagonal offset is passed positionally: the keyword is called
        # `diagonal` in current PyTorch, so `k=-1` is rejected there.
        self.tril_mask = ptu.Variable(
            torch.tril(torch.ones(vector_size, vector_size),
                       -1).unsqueeze(0))
        # Identity matrix with a leading singleton dimension.
        self.diag_mask = ptu.Variable(
            torch.diag(torch.diag(torch.ones(vector_size,
                                             vector_size))).unsqueeze(0))
Beispiel #7
0
    def forward(
        self,
        obs,
        deterministic=False,
        return_log_prob=False,
        return_entropy=False,
        return_log_prob_of_mean=False,
    ):
        """
        Append a tau indicator to the observations and call the parent.

        The taus split off `obs` become a tensor of width
        `self.max_tau + 1` holding a 1 at every clamped tau index. All
        keyword flags are forwarded unchanged to the parent `forward`.

        :param obs: Input that `split_tau` separates into observations and
            taus.
        :return: Whatever the parent class `forward` returns.
        """
        obs, taus = split_tau(obs)
        num_rows = obs.size()[0]
        tau_indicator = ptu.FloatTensor(num_rows, self.max_tau + 1)
        tau_indicator.zero_()
        # Negative taus are clamped so that they index column 0.
        tau_index = torch.clamp(taus.data.long(), min=0)
        tau_indicator.scatter_(1, tau_index, 1)

        augmented_obs = torch.cat((obs, ptu.Variable(tau_indicator)), dim=1)

        return super().forward(
            obs=augmented_obs,
            deterministic=deterministic,
            return_log_prob=return_log_prob,
            return_entropy=return_entropy,
            return_log_prob_of_mean=return_log_prob_of_mean,
        )
Beispiel #8
0
 def _encode(self, imgs, noisy, clip_std=None, batch_size=None):
     """
     Encode images with the VAE and return a latent sample and its stats.

     :param imgs: Numpy images. When `batch_size` is given they are
         reshaped to `(-1, self.vae.imlength)` and encoded in chunks.
     :param noisy: If truthy the sample is `mu + std * eps` with standard
         normal `eps`; otherwise the sample is `mu`.
     :param clip_std: Whether to cap `std` element-wise at
         `self.vae.dist_std`. `None` falls back to
         `self.clip_encoding_std`.
     :param batch_size: Optional chunk size for the encoder calls.
     :return: Tuple `(sample, mu, std)`, each converted with
         `ptu.get_numpy`.
     """
     if batch_size is None:
         mu, logvar = self.vae.encode(ptu.np_to_var(imgs))
     else:
         imgs = imgs.reshape(-1, self.vae.imlength)
         mus, logvars = [], []
         for start in range(0, imgs.shape[0], batch_size):
             batch_mu, batch_logvar = self.vae.encode(
                 ptu.np_to_var(imgs[start:start + batch_size]))
             mus.append(batch_mu)
             logvars.append(batch_logvar)
         # Concatenate once at the end instead of re-copying the running
         # result for every chunk.
         mu = torch.cat(mus, dim=0)
         logvar = torch.cat(logvars, dim=0)
     std = logvar.mul(0.5).exp_()
     if clip_std is None:
         clip_std = self.clip_encoding_std
     if clip_std:
         vae_std = ptu.np_to_var(np.copy(self.vae.dist_std))
         std = torch.min(std, vae_std)
     if noisy:
         eps = ptu.Variable(std.data.new(std.size()).normal_())
         sample = eps.mul(std).add_(mu)
     else:
         sample = mu
     return ptu.get_numpy(sample), ptu.get_numpy(mu), ptu.get_numpy(std)
Beispiel #9
0
    def __init__(
            self,
            env,
            qf,
            replay_buffer,
            num_epochs=100,
            num_batches_per_epoch=100,
            qf_learning_rate=1e-3,
            batch_size=100,
            num_unique_batches=1000,
    ):
        """
        Store the training configuration and create the Q-function optimizer.

        :param env: Environment, stored as `self.env`.
        :param qf: Q-function whose parameters are optimized with Adam.
        :param replay_buffer: Replay buffer, stored as `self.replay_buffer`.
        :param num_epochs: Stored as `self.num_epochs`.
        :param num_batches_per_epoch: Stored as
            `self.num_batches_per_epoch`.
        :param qf_learning_rate: Learning rate of the Adam optimizer.
        :param batch_size: Batch size; also the number of rows of
            `self.discount`.
        :param num_unique_batches: Stored as `self.num_unique_batches`.
        """
        self.env = env
        self.qf = qf
        self.replay_buffer = replay_buffer

        self.num_epochs = num_epochs
        self.num_batches_per_epoch = num_batches_per_epoch
        self.qf_learning_rate = qf_learning_rate
        self.batch_size = batch_size
        self.num_unique_batches = num_unique_batches

        self.qf_optimizer = optim.Adam(
            self.qf.parameters(),
            lr=self.qf_learning_rate,
        )
        self.batch_iterator = None
        # An all-zero float column with one row per batch element.
        zero_column = np.zeros((batch_size, 1))
        self.discount = ptu.Variable(ptu.from_numpy(zero_column).float())
        self.mode_to_batch_iterator = {}
    def test_batch_square_diagonal_module(self):
        """
        Check BatchSquareDiagonal(2) on a single row against a known value.
        """
        vector = ptu.Variable(ptu.from_numpy(np.array([[2, 7]])).float())
        diag_values = ptu.Variable(ptu.from_numpy(np.array([[2, 1]])).float())
        # 2^2 * 2 + 7^2 * 1 = 8 + 49 = 57
        expected = np.array([[57]])

        module = modules.BatchSquareDiagonal(2)
        result = ptu.get_numpy(module(vector=vector, diag_values=diag_values))

        self.assertNpAlmostEqual(expected, result)
Beispiel #11
0
    def get_batch(self, train=True):
        """
        Return one batch from the train or the test data.

        With parallel data loading the next item of the matching dataloader
        is returned as a dict with the keys 'obs', 'actions' and
        'next_obs'. Otherwise `self.batch_size` random rows of the dataset
        are normalized and returned as a single variable.

        :param train: Use the training data if truthy, else the test data.
        """
        if not self.use_parallel_dataloading:
            dataset = self.train_dataset if train else self.test_dataset
            ind = np.random.randint(0, len(dataset), self.batch_size)
            return ptu.np_to_var(normalize_image(dataset[ind, :]))

        dataloader = self.train_dataloader if train else self.test_dataloader
        samples = next(dataloader)
        return {
            key: ptu.Variable(samples[position][0])
            for position, key in enumerate(('obs', 'actions', 'next_obs'))
        }
    def get_train_dict(self, subtraj_batch):
        """
        Compute the policy and critic quantities for a sub-trajectory batch.

        Discounted returns are computed from `subtraj_batch['rewards']`,
        stored back under `subtraj_batch['returns']`, and the batch is then
        flattened with `flatten_subtraj_batch`.

        :param subtraj_batch: Dict of sub-trajectory tensors. It is modified
            in place: the key 'returns' is added.
        :return: OrderedDict with the policy actions, losses, Q values,
            targets, predictions and Bellman errors.
        """
        rewards_np = ptu.get_numpy(subtraj_batch['rewards']).squeeze(2)
        returns_np = np_util.batch_discounted_cumsum(rewards_np,
                                                     self.discount)
        returns_np = np.ascontiguousarray(
            np.expand_dims(returns_np, 2)).astype(np.float32)
        subtraj_batch['returns'] = ptu.Variable(ptu.from_numpy(returns_np))

        flat_batch = flatten_subtraj_batch(subtraj_batch)
        returns = flat_batch['returns']
        terminals = flat_batch['terminals']
        obs = flat_batch['observations']
        actions = flat_batch['actions']
        next_obs = flat_batch['next_observations']

        # Policy operations.
        policy_actions = self.policy(obs)
        q = self.qf(obs, policy_actions)
        policy_loss = -q.mean()

        # Critic operations.
        # TODO: try to get this to work with next_actions = None
        next_actions = self.policy(next_obs)
        q_target = self.target_qf(next_obs, next_actions)
        # Tile the per-step discount factors once per sub-trajectory.
        num_subtrajs = q_target.size()[0] // self.subtraj_length
        discount_factors = self.discount_factors.repeat(num_subtrajs, 1)
        bootstrap = (1. - terminals) * discount_factors * q_target
        # noinspection PyUnresolvedReferences
        y_target = (self.reward_scale * returns + bootstrap).detach()
        y_pred = self.qf(obs, actions)
        bellman_errors = (y_pred - y_target)**2
        qf_loss = self.qf_criterion(y_pred, y_target)

        train_dict = OrderedDict()
        train_dict['Policy Actions'] = policy_actions
        train_dict['Policy Loss'] = policy_loss
        train_dict['Policy Q Values'] = q
        train_dict['Target Y'] = y_target
        train_dict['Predicted Y'] = y_pred
        train_dict['Bellman Errors'] = bellman_errors
        train_dict['Y targets'] = y_target
        train_dict['Y predictions'] = y_pred
        train_dict['QF Loss'] = qf_loss
        return train_dict
Beispiel #13
0
    def compute_iwae_loss(self, x_recon, x, z_mu, z_logvar, z_sampled, beta):
        """
        Compute the importance-weighted loss from the log weights.

        The log weight is `log p(x|z) + beta * (log p(z) - log q(z|x))`,
        with a standard normal prior and a normal encoder distribution with
        mean `z_mu` and std `exp(0.5 * z_logvar)`. The weights are the
        detached softmax of the log weights over the last dimension.

        :return: `-(log_w * w_tilde).sum()` divided by `x_recon.shape[0]`.
        """
        num_inputs = x_recon.shape[0]
        log_p_xgz = self.logprob_iwae(x_recon, x).sum(dim=-1)

        latent_shape = z_sampled.shape
        prior = torch.distributions.Normal(
            ptu.Variable(torch.zeros(latent_shape)),
            ptu.Variable(torch.ones(latent_shape)),
        )
        log_p_z = prior.log_prob(z_sampled).sum(dim=-1)

        encoder = torch.distributions.Normal(z_mu, torch.exp(0.5*z_logvar))
        log_q_zgx = encoder.log_prob(z_sampled).sum(dim=-1)

        log_w = log_p_xgz + beta * (log_p_z - log_q_zgx)
        # Detached so that no gradient flows through the weights.
        w_tilde = F.softmax(log_w, dim=-1).detach()

        return -(log_w * w_tilde).sum() / num_inputs
Beispiel #14
0
 def get_encoding_and_suff_stats(self, x):
     """
     Sample latents from the network output and return their statistics.

     Column 0 of `self(x)` is used as the mean and column 1 as the log
     standard deviation; the latent is `noise * std + mean` with standard
     normal noise.

     :return: Tuple `(latents, means, log_stds, stds)`.
     """
     output = self(x)
     means = output[:, 0:1]
     log_stds = output[:, 1:2]
     stds = log_stds.exp()
     noise = ptu.Variable(torch.randn(*means.size()))
     latents = noise * stds + means
     return latents, means, log_stds, stds
Beispiel #15
0
    def __init__(
        self,
        obs_dim,
        action_dim,
        hidden_size,
        use_batchnorm=False,
        b_init_value=0.01,
        hidden_init=ptu.fanin_init,
        use_exp_for_diagonal_not_square=True,
    ):
        """
        Create the layers and masks of the NAF policy.

        :param obs_dim: Input size of the first linear layer.
        :param action_dim: Output size of `self.mu`; `self.L` outputs
            `action_dim ** 2` values.
        :param hidden_size: Width of the two hidden layers.
        :param use_batchnorm: If truthy, create `self.bn_state` over the
            observations.
        :param b_init_value: Value every linear bias is filled with.
        :param hidden_init: Function applied to every linear weight.
        :param use_exp_for_diagonal_not_square: Stored on the instance.
        """
        super(NafPolicy, self).__init__()
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.use_batchnorm = use_batchnorm
        self.use_exp_for_diagonal_not_square = use_exp_for_diagonal_not_square

        if use_batchnorm:
            self.bn_state = nn.BatchNorm1d(obs_dim)
            self.bn_state.weight.data.fill_(1)
            self.bn_state.bias.data.fill_(0)

        self.linear1 = nn.Linear(obs_dim, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.mu = nn.Linear(hidden_size, action_dim)
        self.L = nn.Linear(hidden_size, action_dim**2)

        # Strictly-lower-triangular and identity masks, each with a leading
        # singleton dimension.
        ones = torch.ones(action_dim, action_dim)
        self.tril_mask = ptu.Variable(torch.tril(ones, -1).unsqueeze(0))
        self.diag_mask = ptu.Variable(
            torch.diag(torch.diag(ones)).unsqueeze(0))

        # Initialize in the same order as before: linear1, linear2, V, L, mu.
        for layer in (self.linear1, self.linear2, self.V, self.L, self.mu):
            hidden_init(layer.weight)
            layer.bias.data.fill_(b_init_value)
def train_network(net, title):
    """
    Train `net` with Adam on an MSE loss and plot the loss curves live.

    After every epoch the loss on the module-level train and test tensors
    is evaluated and the plot is redrawn; the final losses are printed.

    :param net: Network to train.
    :param title: Plot title, also printed before the final losses.
    """
    optimizer = Adam(net.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    epochs = []
    train_losses = []
    test_losses = []

    for epoch in range(N_EPOCHS):
        for x, y in dataloader:
            inputs = ptu.Variable(x)
            targets = ptu.Variable(y)
            loss = criterion(net(inputs), targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        test_loss = float(criterion(net(test_x), test_y))
        test_losses.append(test_loss)

        train_loss = float(criterion(net(train_x), train_y))
        train_losses.append(train_loss)

        epochs.append(epoch)
        # Redraw both curves from scratch on the current figure.
        plt.gcf().clear()
        plt.plot(epochs, train_losses, '--')
        plt.plot(epochs, test_losses, '-')
        plt.title(title)
        plt.draw()
        plt.pause(0.05)
    print(title)
    print("\tfinal train loss: {}".format(train_loss))
    print("\tfinal test loss: {}".format(test_loss))
 def get_batch(self, training=True):
     """
     Sample a random batch from the replay buffer as float variables.

     At most `self.batch_size` steps are sampled, fewer if the buffer
     cannot provide that many. 'rewards' and 'terminals' get an extra
     trailing dimension.

     :param training: Passed to `self.replay_buffer.get_replay_buffer`.
     :return: Dict mapping every batch key to a variable.
     """
     buffer = self.replay_buffer.get_replay_buffer(training)
     num_samples = min(buffer.num_steps_can_sample(), self.batch_size)
     torch_batch = {}
     for key, array in buffer.random_batch(num_samples).items():
         torch_batch[key] = ptu.Variable(
             ptu.from_numpy(array).float(), requires_grad=False)
     for key in ('rewards', 'terminals'):
         torch_batch[key] = torch_batch[key].unsqueeze(-1)
     return torch_batch
Beispiel #18
0
    def rsample(self, return_pretanh_value=False):
        """
        Sampling in the reparameterization case.

        Draws standard normal noise, forms
        `z = normal_mean + normal_std * noise` and squashes it with tanh.

        :param return_pretanh_value: If truthy, also return `z`.
        :return: `tanh(z)`, or the tuple `(tanh(z), z)`.
        """
        standard_normal = Normal(
            torch.zeros(self.normal_mean.size()),
            torch.ones(self.normal_std.size()),
        )
        noise = ptu.Variable(standard_normal.sample(), requires_grad=False)
        z = self.normal_mean + self.normal_std * noise

        if not return_pretanh_value:
            return torch.tanh(z)
        return torch.tanh(z), z
Beispiel #19
0
    def forward(self, flat_obs, actions=None):
        """
        Compute the non-positive network output for tau-augmented inputs.

        The taus split off `flat_obs` are added to a zero tensor with
        `self.tau_vector_len` columns and the result is concatenated with
        the observations and, when given, the actions.

        :param flat_obs: Input that `split_tau` separates into observations
            and taus.
        :param actions: Optional actions, appended after the tau vector.
        :return: `-|self.last_fc(h)|`, so every entry is <= 0.
        """
        obs, taus = split_tau(flat_obs)
        batch_size = obs.size()[0]
        # Presumably taus has one column and broadcasts across the
        # tau_vector_len columns -- TODO confirm against split_tau.
        tau_vector = torch.zeros((batch_size, self.tau_vector_len)) + taus.data

        features = [obs, ptu.Variable(tau_vector)]
        if actions is not None:
            features.append(actions)
        h = torch.cat(features, dim=1)

        for fc in self.fcs:
            h = self.hidden_activation(fc(h))
        return - torch.abs(self.last_fc(h))
Beispiel #20
0
    def train(epoch):
        """
        Run one optimizer step per batch of `train_loader`.

        Progress is printed every `num_batches_per_print` batches.

        :param epoch: Epoch number shown in the progress line.
        """
        for batch_idx, batch in enumerate(train_loader):
            state, action, q_target = batch
            q_estim = eval_model(state, action)
            target = ptu.Variable(q_target, requires_grad=False)

            loss = loss_fnct(q_estim, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % num_batches_per_print != 0:
                continue
            progress = 'Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_size * batch_idx, train_size,
                loss.data[0])
            line_logger.print_over(progress)
Beispiel #21
0
    def forward(
            self,
            flat_obs,
            return_preactivations=False,
    ):
        """
        Append the encoded taus to the observations and call the parent.

        :param flat_obs: Input that `split_tau` separates into observations
            and taus.
        :param return_preactivations: Forwarded to the parent `forward`.
        :return: Whatever the parent class `forward` returns.
        """
        obs, taus = split_tau(flat_obs)
        num_rows = taus.size()[0]
        tau_encoding = make_binary_tensor(taus, len(self.max_tau), num_rows)
        augmented_obs = torch.cat((obs, ptu.Variable(tau_encoding)), dim=1)

        return super().forward(
            augmented_obs,
            return_preactivations=return_preactivations
        )
Beispiel #22
0
    def forward(
            self,
            flat_obs,
            return_preactivations=False,
        ):
        """
        Append a tau vector to the observations and call the parent.

        The taus split off `flat_obs` are added to a zero tensor with
        `self.tau_vector_len` columns.

        :param flat_obs: Input that `split_tau` separates into observations
            and taus.
        :param return_preactivations: Forwarded to the parent `forward`.
        :return: Whatever the parent class `forward` returns.
        """
        obs, taus = split_tau(flat_obs)
        num_rows = obs.size()[0]
        tau_vector = torch.zeros((num_rows, self.tau_vector_len)) + taus.data
        augmented_obs = torch.cat((obs, ptu.Variable(tau_vector)), dim=1)

        return super().forward(
            augmented_obs,
            return_preactivations=return_preactivations
        )
Beispiel #23
0
def dump_samples(vae_env, epoch, n_samples=64):
    """
    Decode random latents with the VAE and save them as an image grid.

    The grid is written to the logger snapshot directory as 's_<epoch>.png',
    or 's.png' when `epoch` is None. Nothing is written when the state
    environment returns no images.

    :param vae_env: Environment exposing `vae`, `representation_size`,
        `vae_input_key_prefix` and `wrapped_env`.
    :param epoch: Epoch number used in the file name, or None.
    :param n_samples: Number of latents to decode.
    """
    from railrl.core import logger
    from torchvision.utils import save_image
    import os.path as osp
    vae_env.vae.eval()
    sample = ptu.Variable(torch.randn(n_samples, vae_env.representation_size))
    sample = vae_env.vae.decode(sample).cpu()
    if vae_env.vae_input_key_prefix == 'state':
        images = vae_env.wrapped_env.states_to_images(ptu.get_numpy(sample))
        # Check before converting: a None result cannot be turned into a
        # variable, so a check after the conversion comes too late.
        if images is None:
            return
        sample = ptu.np_to_var(images)
    if epoch is not None:
        save_path = osp.join(logger.get_snapshot_dir(), 's_%d.png' % epoch)
    else:
        save_path = osp.join(logger.get_snapshot_dir(), 's.png')
    imsize = vae_env.wrapped_env.imsize
    save_image(
        sample.data.view(n_samples, -1, imsize, imsize),
        save_path,
        nrow=int(np.sqrt(n_samples))
    )
 def __init__(self, *args, subtraj_length=10, **kwargs):
     """
     Set up the per-step discount factors and the split replay buffer.

     :param subtraj_length: Number of steps per sub-trajectory.
     :param args: Forwarded to the parent constructor.
     :param kwargs: Forwarded to the parent constructor.
     """
     super().__init__(*args, **kwargs)
     self.subtraj_length = subtraj_length
     self.gammas = self.discount * torch.ones(self.subtraj_length)
     # Cumulative product: discount, discount^2, ..., as a column.
     self.discount_factors = ptu.Variable(
         torch.cumprod(self.gammas, dim=0).view(-1, 1),
         requires_grad=False,
     )

     def make_buffer():
         # Both halves of the split buffer use the same configuration.
         return SubtrajReplayBuffer(
             max_replay_buffer_size=self.replay_buffer_size,
             env=self.env,
             subtraj_length=self.subtraj_length,
         )

     self.replay_buffer = SplitReplayBuffer(
         make_buffer(),
         make_buffer(),
         fraction_paths_in_train=0.8,
     )
Beispiel #25
0
    def test(epoch):
        """
        Evaluate on `test_loader`, print the mean loss and extend the report.

        The report gets a header for the epoch plus one image each for the
        true and the estimated Q function.

        :param epoch: Epoch number used in the printout and report header.
        """
        test_losses = []
        for state, action, q_target in test_loader:
            q_estim = eval_model(state, action)
            target = ptu.Variable(q_target, requires_grad=False)
            test_losses.append(loss_fnct(q_estim, target).data[0])

        line_logger.newline()
        print('Test Epoch: {0}. Loss: {1}'.format(epoch, np.mean(test_losses)))

        report.add_header("Epoch = {}".format(epoch))

        plots = (
            (q_function, "True Q Function"),
            (eval_model_np, "Estimated Q Function"),
        )
        for q_fn, caption in plots:
            fig = visualize_model(q_fn, caption)
            report.add_image(vu.save_image(fig), txt=caption)

        report.new_row()
def simulate_policy(args):
    """
    Load a pickled model, decode 64 random latents and show them as a grid.

    :param args: Object whose `file` attribute is the path of the pickled
        model.
    """
    ptu.set_gpu_mode(True)
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # pass snapshot files from a trusted source.
    with open(args.file, "rb") as snapshot_file:
        model = pickle.load(snapshot_file)
    model.to(ptu.device)
    samples = ptu.Variable(torch.randn(64, model.representation_size))
    samples = model.decode(samples).cpu()

    tensor = samples.data.view(64, model.input_channels, model.imsize,
                               model.imsize)
    tensor = tensor.cpu()
    grid = make_grid(tensor, nrow=8)
    # Scale to 0..255 and move the channel axis last for PIL.
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
Beispiel #27
0
    def forward(self,
                obs,
                deterministic=False,
                return_log_prob=False,
                return_entropy=False,
                return_log_prob_of_mean=False):
        """
        Append a tau vector to the observations and call the parent.

        The taus split off `obs` are added to a zero tensor with
        `self.tau_vector_len` columns. All keyword flags are forwarded
        unchanged to the parent `forward`.

        :param obs: Input that `split_tau` separates into observations and
            taus.
        :return: Whatever the parent class `forward` returns.
        """
        obs, taus = split_tau(obs)
        num_rows = obs.size()[0]
        tau_vector = torch.zeros((num_rows, self.tau_vector_len)) + taus.data
        augmented_obs = torch.cat((obs, ptu.Variable(tau_vector)), dim=1)

        return super().forward(
            obs=augmented_obs,
            deterministic=deterministic,
            return_log_prob=return_log_prob,
            return_entropy=return_entropy,
            return_log_prob_of_mean=return_log_prob_of_mean,
        )
Beispiel #28
0
    def forward(
            self,
            flat_obs,
            return_preactivations=False
    ):
        """
        Append a tau indicator to the observations and call the parent.

        The taus split off `flat_obs` become a tensor of width
        `self.max_tau + 1` holding a 1 at every clamped tau index.

        :param flat_obs: Input that `split_tau` separates into observations
            and taus.
        :param return_preactivations: Forwarded to the parent `forward`.
        :return: Whatever the parent class `forward` returns.
        """
        obs, taus = split_tau(flat_obs)
        num_rows = obs.size()[0]
        tau_indicator = ptu.FloatTensor(num_rows, self.max_tau + 1)
        tau_indicator.zero_()
        # Negative taus are clamped so that they index column 0.
        tau_index = torch.clamp(taus.data.long(), min=0)
        tau_indicator.scatter_(1, tau_index, 1)

        augmented_obs = torch.cat((obs, ptu.Variable(tau_indicator)), dim=1)

        return super().forward(
            augmented_obs,
            return_preactivations=return_preactivations,
        )
Beispiel #29
0
    def forward(
        self,
        obs,
        deterministic=False,
        return_log_prob=False,
        return_entropy=False,
        return_log_prob_of_mean=False,
    ):
        """
        Append the encoded taus to the observations and call the parent.

        The taus split off `obs` are encoded by `make_binary_tensor`. All
        keyword flags are forwarded unchanged to the parent `forward`.

        :param obs: Input that `split_tau` separates into observations and
            taus.
        :return: Whatever the parent class `forward` returns.
        """
        obs, taus = split_tau(obs)
        num_rows = taus.size()[0]
        tau_encoding = make_binary_tensor(taus, len(self.max_tau), num_rows)
        augmented_obs = torch.cat((obs, ptu.Variable(tau_encoding)), dim=1)

        return super().forward(
            obs=augmented_obs,
            deterministic=deterministic,
            return_log_prob=return_log_prob,
            return_entropy=return_entropy,
            return_log_prob_of_mean=return_log_prob_of_mean,
        )
Beispiel #30
0
 def eval_model_np(state, action):
     """
     Evaluate the model on one scalar state/action pair.

     :param state: Scalar state, wrapped into a 1x1 float tensor.
     :param action: Scalar action, wrapped into a 1x1 float tensor.
     :return: First row of the sum of the two model outputs, converted
         with `ptu.get_numpy`.
     """
     def as_batch(scalar):
         # Wrap a scalar as a 1x1 variable that needs no gradient.
         return ptu.Variable(ptu.FloatTensor([[scalar]]), requires_grad=False)

     state_var = as_batch(state)
     action_var = as_batch(action)
     a, v = model(state_var, action_var)
     return ptu.get_numpy(a + v)[0]