Example #1
    def _test_and_save(self, episode, test_game):
        # Run one greedy evaluation episode, then checkpoint the model.
        self.training = False
        self.model.train(self.training)  # put the model in eval mode
        test_eps_variable = None
        total_reward = 0
        with torch.no_grad():
            # zero-initialised LSTM hidden and cell states (batch size 1)
            test_lstm_hidden_vb = (
                Variable(torch.zeros(1, self.hidden_dim).type(self.dtype)),
                Variable(torch.zeros(1, self.hidden_dim).type(self.dtype)))
            current_experience = test_game.reset()
            test_game.visual()
            eps_count = 0
            test_actions = []
            test_done = False
            while eps_count < self.max_episode and not test_done:
                # forward pass: policy, value estimate, and next LSTM hidden state
                test_p_vb, test_v_vb, test_lstm_hidden_vb = self.model(
                    preprocess_state(current_experience.state1,
                                     self.dtype,
                                     is_volatile=False),
                    test_lstm_hidden_vb, test_eps_variable)
                # act greedily: argmax over the policy output
                test_action = test_p_vb.max(1)[1].data[0]
                test_actions.append(test_action)
                # repeat the chosen action for skip_frame environment steps
                for _ in range(self.skip_frame):
                    current_experience = test_game.step(test_action)
                    test_game.visual()
                    total_reward = total_reward + current_experience.reward
                    if current_experience.terminal1:
                        test_done = True
                        break
                eps_count = eps_count + self.skip_frame

            self._save_model(episode, total_reward)
Example #2
import math

import numpy as np
import torch
from torch.autograd import Variable


def normal(x, mu, sigma):
    # Gaussian density N(x; mu, sigma), with sigma treated as the variance.
    pi = Variable(torch.from_numpy(np.array([math.pi])).float())
    a = (-1 * (x - mu).pow(2) / (2 * sigma)).exp()
    b = 1 / (2 * sigma * pi.expand_as(sigma)).sqrt()
    return a * b
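A minimal usage sketch for the density above (the tensor shapes are assumptions; note that sigma is treated as the variance rather than the standard deviation):

# hypothetical check: at x == mu with unit variance, each entry is 1/sqrt(2*pi) ~= 0.3989
x = Variable(torch.zeros(1, 2))
mu = Variable(torch.zeros(1, 2))
sigma = Variable(torch.ones(1, 2))
density = normal(x, mu, sigma)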
Example #3
    def forward(self, x, lstm_hidden_vb=None, eps=None):
        x = x.view(x.size(0), 3, self.input_dims[1], self.input_dims[1])
        # convolutional encoder
        x = F.leaky_relu(self.down1(F.leaky_relu(self.conv1(x))))
        x = F.leaky_relu(self.down2(F.leaky_relu(self.conv2(x))))
        x = F.leaky_relu(self.down3(F.leaky_relu(self.conv3(x))))
        x = F.leaky_relu(self.down4(F.leaky_relu(self.conv4(x))))
        x = x.view(x.size(0), -1)
        # recurrent core
        hx, cx = self.lstm(x, lstm_hidden_vb)
        # latent sample z from (mu, fixed sigma)
        x = self.linear_encoder(hx)
        mu = self.linear_mu(hx)
        sigma = Variable(torch.zeros(mu.size(0), mu.size(1)) - self.sig)
        z = self.sampler(mu, sigma, eps=eps)
        # noisy copy of the encoded features, using the same fixed sigma
        sigma_det = Variable(torch.zeros(mu.size(0), mu.size(1)) - self.sig)
        x = self.sampler(x, sigma_det, eps=eps)
        x = F.leaky_relu(torch.cat([x, z], dim=1))
        # policy and value heads
        p = self.policy_5(x)
        p = self.policy_6(p)
        v = self.value_5(x)
        return p, v, (hx, cx)
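The sampler call above is not defined in this snippet; a plausible reparameterization-trick sketch, assuming sigma holds log standard deviations (an assumption, not the original implementation):

    def sampler(self, mu, log_sigma, eps=None):
        # z = mu + exp(log_sigma) * eps, drawing eps ~ N(0, I) when no noise is supplied
        if eps is None:
            eps = Variable(torch.randn(mu.size(0), mu.size(1)))
        return mu + log_sigma.exp() * eps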
Example #4
    def _reset_lstm_hidden_vb_episode(self, training=True):
        # seq_len, batch_size, hidden_dim
        # Zero the LSTM hidden/cell states and the latent at the start of an episode.
        if not training:
            with torch.no_grad():
                self.lstm_hidden_vb = (
                    Variable(torch.zeros(self.batch_size,
                                         self.hidden_dim).type(self.dtype)),
                    Variable(torch.zeros(self.batch_size,
                                         self.hidden_dim).type(self.dtype)))
                self.latent = Variable(
                    torch.zeros(self.batch_size, self.hidden_dim))
                self.eps_variable = None
        else:
            self.lstm_hidden_vb = (
                Variable(torch.zeros(self.batch_size,
                                     self.hidden_dim).type(self.dtype)),
                Variable(torch.zeros(self.batch_size,
                                     self.hidden_dim).type(self.dtype)))
            self.latent = Variable(
                torch.zeros(self.batch_size, self.hidden_dim))
            self.eps_variable = None
Example #5
def preprocess_state(state, dtype, is_volatile=False):
    # Convert a numpy state (or a list of numpy states) into batched torch Variables.
    if isinstance(state, list):
        state_vb = []
        for i in range(len(state)):
            if is_volatile:
                with torch.no_grad():
                    state_vb.append(
                        Variable(
                            torch.from_numpy(
                                state[i]).unsqueeze(0).type(dtype)))
            else:
                state_vb.append(
                    Variable(
                        torch.from_numpy(state[i]).unsqueeze(0).type(dtype)))
    else:
        if is_volatile:
            with torch.no_grad():
                state_vb = Variable(
                    torch.from_numpy(state).unsqueeze(0).type(dtype))
        else:
            state_vb = Variable(
                torch.from_numpy(state).unsqueeze(0).type(dtype))
    return state_vb
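A small sketch of how this helper might be called (the state shape and dtype are assumptions):

state = np.zeros((3, 80, 80), dtype=np.float32)
state_vb = preprocess_state(state, torch.FloatTensor, is_volatile=True)
# state_vb now has shape (1, 3, 80, 80) and carries no gradient history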
Example #6
    def forward(self, x, z_prev, lstm_hidden_vb=None, eps=None):
        x = x.view(x.size(0), 3, self.input_dims[1], self.input_dims[1])
        x = F.leaky_relu(self.down1(F.leaky_relu(self.conv1(x))), inplace=True)
        x = F.leaky_relu(self.down2(F.leaky_relu(self.conv2(x))), inplace=True)
        x = F.leaky_relu(self.down3(F.leaky_relu(self.conv3(x))), inplace=True)
        x = F.leaky_relu(self.down4(F.leaky_relu(self.conv4(x))), inplace=True)
        x = x.view(x.size(0), -1)

        hx, cx = self.lstm(x, lstm_hidden_vb)

        if self.crelu:
            z_prev = self.crelu_z(z_prev)
        else:
            z_prev = F.leaky_relu(z_prev, inplace=True)

        # prior over the latent, conditioned on the previous latent sample
        mu_prior = self.prior_mu(z_prev)
        sigma_prior = Variable(
            torch.zeros(mu_prior.size(0), mu_prior.size(1)) - self.sig)

        # posterior parameters from the LSTM hidden state, plus a latent sample
        x = self.linear_encoder(hx)
        mu = self.linear_mu(hx)
        sigma = self.linear_sigma(hx)
        self.x = Variable(x.data)  # keep a detached copy of the encoded features
        z = self.sampler(mu, sigma, eps=eps)

        if self.crelu:
            x = self.crelu_x(torch.cat([x, z], dim=1))
        else:
            x = F.leaky_relu(torch.cat([x, z], dim=1), inplace=True)

        p = self.policy_5(x)
        p = self.policy_6(p)
        v = self.value_5(x)

        return p, v, z, (hx, cx), (mu, sigma), (mu_prior, sigma_prior)
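The crelu_x/crelu_z modules used above are presumably CReLU-style activations; a minimal functional sketch of that idea (an assumption about the original modules):

def crelu(x):
    # concatenate ReLU(x) and ReLU(-x) along the feature dimension, doubling its width
    return torch.cat([F.relu(x), F.relu(-x)], dim=1)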
Example #7
    def forward(self, input):
        # noisy linear layer: perturb the weights and bias with learned noise scales
        return F.linear(
            input,
            self.weight + self.sigma_weight * Variable(self.epsilon_weight),
            self.bias + self.sigma_bias * Variable(self.epsilon_bias))
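This forward pass assumes the epsilon_weight/epsilon_bias noise buffers are resampled elsewhere; a minimal resampling sketch (the method name sample_noise is hypothetical, not part of the original layer):

    def sample_noise(self):
        # draw fresh standard-normal noise in place for the next forward pass
        self.epsilon_weight.normal_()
        self.epsilon_bias.normal_()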
Example #8
    def _backward(self, sT_vb):
        self.optimizer.zero_grad()
        # preparation
        # bootstrap from the value of the final state; zero it for rollouts that ended in a terminal state
        _, valueT_vb, _ = self.model(sT_vb, self.lstm_hidden_vb)
        for i in range(self.batch_size):
            if self.A3C_Experiences[i].terminal1[-1]:
                valueT_vb.data[i] = 0
        valueT_vb = Variable(valueT_vb.data)  # detach from the graph
        rollout_steps = [
            len(self.A3C_Experiences[i].reward) for i in range(self.batch_size)
        ]
        policy_vb = [
            self.A3C_Experiences[i].policy_vb for i in range(self.batch_size)
        ]
        action_batch_vb = [
            self.A3C_Experiences[i].action for i in range(self.batch_size)
        ]
        policy_log_vb = [[
            torch.log(policy_vb[i][j]) for j in range(len(policy_vb[i]))
        ] for i in range(len(policy_vb))]
        entropy_vb = [[
            -(policy_log_vb[i][j] * policy_vb[i][j]).sum(1)
            for j in range(len(policy_vb[i]))
        ] for i in range(len(policy_vb))]
        policy_log_vb = [[
            policy_log_vb[i][j].gather(
                1,
                Variable(action_batch_vb[i][j]).unsqueeze(0).detach())
            for j in range(len(action_batch_vb[i]))
        ] for i in range(len(action_batch_vb))]
        for i in range(self.batch_size):
            self.A3C_Experiences[i].value0_vb.append(
                Variable(valueT_vb.data[i]))
        gae_ts = torch.zeros(self.batch_size, 1)  # GAE accumulator, one per rollout
        if self.gpu >= 0:
            gae_ts = gae_ts.cuda()

        # compute loss
        policy_loss_vb = [0. for i in range(self.batch_size)]
        value_loss_vb = [0. for i in range(self.batch_size)]
        loss_model_vb = 0
        for j in range(self.batch_size):
            # walk each rollout backwards, accumulating the discounted return,
            # the GAE advantage, and the policy/value losses
            for i in reversed(range(rollout_steps[j])):
                valueT_vb[j] = (self.gamma * valueT_vb[j] +
                                self.A3C_Experiences[j].reward[i])
                advantage_vb = (valueT_vb[j] -
                                self.A3C_Experiences[j].value0_vb[i])
                value_loss_vb[j] = value_loss_vb[j] + 0.5 * advantage_vb.pow(2)
                tderr_ts = (self.A3C_Experiences[j].reward[i] +
                            self.gamma *
                            self.A3C_Experiences[j].value0_vb[i + 1].data -
                            self.A3C_Experiences[j].value0_vb[i].data)
                gae_ts[j] = gae_ts[j] * self.tau * self.gamma + tderr_ts
                policy_loss_vb[j] = policy_loss_vb[j] - (
                    policy_log_vb[j][i] * Variable(gae_ts[j]) +
                    self.beta * entropy_vb[j][i])

            loss_model_vb = loss_model_vb + (
                policy_loss_vb[j] +
                self.lam * value_loss_vb[j]) / rollout_steps[j]

        self.model.zero_grad()

        loss_model_vb.backward()
        # clip the global gradient norm to stabilise training
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_grad)

        p_loss_avg = 0
        v_loss_avg = 0
        loss_avg = loss_model_vb.data.cpu().numpy()
        for i in range(self.batch_size):
            p_loss_avg += policy_loss_vb[i].data.cpu().numpy() / self.batch_size
            v_loss_avg += value_loss_vb[i].data.cpu().numpy() / self.batch_size

        # log training stats
        self.p_loss_avg += p_loss_avg
        self.v_loss_avg += v_loss_avg
        self.loss_avg += loss_model_vb.data.cpu().numpy()
        self.loss_counter += 1

        self.logger.warning("Reporting       @ Step: " + str(self.train_step) +
                            " | Elapsed Time: " +
                            str(time.time() - self.start_time))
        self.logger.warning(
            "Iteration: {}; current p_loss: {}; average p_loss: {}".format(
                self.train_step, p_loss_avg,
                self.p_loss_avg / self.loss_counter))
        self.logger.warning(
            "Iteration: {}; current v_loss: {}; average v_loss: {}".format(
                self.train_step, v_loss_avg,
                self.v_loss_avg / self.loss_counter))
        self.logger.warning(
            "Iteration: {}; current loss  : {}; average loss  : {}".format(
                self.train_step, loss_avg, self.loss_avg / self.loss_counter))
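For context, the inner loop above follows the standard A3C-with-GAE recursions evaluated backwards over each rollout: the n-step return R_i = r_i + gamma * R_{i+1}, the temporal-difference error delta_i = r_i + gamma * V(s_{i+1}) - V(s_i), and the advantage gae_i = gamma * tau * gae_{i+1} + delta_i; the policy loss accumulates -(log pi(a_i|s_i) * gae_i + beta * H_i) and the value loss 0.5 * (R_i - V(s_i))^2, each scaled by the rollout length and summed over the batch.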
Example #9
    def _reset_lstm_hidden_vb_rollout(self):
        # Detach the LSTM hidden state and the latent between rollouts (truncated backprop)
        self.lstm_hidden_vb = (Variable(self.lstm_hidden_vb[0].data),
                               Variable(self.lstm_hidden_vb[1].data))
        self.latent = Variable(self.latent.data)
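For reference, on PyTorch versions without Variable the same truncated-backprop detachment is usually written with .detach() (a sketch, not the original code):

    def _reset_lstm_hidden_vb_rollout(self):
        self.lstm_hidden_vb = (self.lstm_hidden_vb[0].detach(),
                               self.lstm_hidden_vb[1].detach())
        self.latent = self.latent.detach()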