Example #1
    def choose_action(self, state, last_action, hidden_in):
        state = torch.Tensor(state).float().unsqueeze(0).unsqueeze(0).to(
            self.device)
        last_action = torch.Tensor(last_action).unsqueeze(0).unsqueeze(0).to(
            self.device)
        mu, log_std, hidden_out = self.actor(state, last_action, hidden_in)

        std = torch.exp(log_std)
        m = Normal(mu, std)
        action_val = m.sample()
        action = torch.tanh(action_val).detach().cpu().numpy()
        return action[0][0], hidden_out
Example #2
    def evaluate(self, state, epsilon=1e-6):
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(mean, std)
        noise = Normal(0, 1)

        z = noise.sample()
        action = torch.tanh(mean + std * z.to(self.device))
        log_prob = normal.log_prob(mean + std * z.to(self.device)) - torch.log(
            1 - action.pow(2) + epsilon)

        return action, log_prob
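For comparison, the same squashed-Gaussian log-probability can be written with Normal.rsample(), which keeps the reparameterization explicit; a minimal self-contained sketch (the fixed mean/log_std tensors stand in for a policy network's output and are purely illustrative):

import torch
from torch.distributions import Normal

# Stand-ins for a policy network's output (illustrative values only).
mean, log_std = torch.zeros(1, 3), torch.zeros(1, 3)
normal = Normal(mean, log_std.exp())
z = normal.rsample()                       # reparameterized sample, keeps gradients
action = torch.tanh(z)                     # squash into (-1, 1)
# Change-of-variables correction for the tanh squashing.
log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + 1e-6)
log_prob = log_prob.sum(-1, keepdim=True)  # sum over action dimensions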
Example #3
    def forward(self, x, action_taken):
        policy = Normal(self.mu(x), self.log_std.exp())
        # Sample the action from the policy.
        pi = policy.sample()
        # Sum over the actions.
        logp_pi = policy.log_prob(pi).sum(dim=1)
        if action_taken is not None:
            logp = policy.log_prob(action_taken).sum(dim=1)
        else:
            logp = None

        return pi, logp, logp_pi
Example #4
def train_agent(episodes, seed, out_name):

    np.random.seed(seed)
    torch.manual_seed(seed)

    latest_rewards = deque(maxlen=20)
    track_rewards = deque(maxlen=episodes)

    with open('runs/{}_{}.csv'.format(out_name, seed), 'w') as f:
        f.write('{}_{}\n'.format(out_name, seed))

    pbar = tqdm(total=episodes)

    for episode in range(episodes):

        s = env.reset()
        done = False
        states, actions, rewards, next_states = [], [], [], []
        ep_rewards = 0.
        while not done:

            with torch.no_grad():
                mean, std = policy(torch.tensor(s).float().reshape(1, -1))
                dist = Normal(mean, std)
                a = dist.sample().numpy().flatten()

            ns, r, done, _ = env.step(a * max_action)
            states.append(s)
            rewards.append(r)
            actions.append(a)
            next_states.append(ns)

            s = ns
            ep_rewards += r

            track_rewards.append(r)

        rewards = (np.array(rewards) -
                   np.mean(track_rewards)) / np.std(track_rewards)
        rewards_to_go = discount_rewards(rewards)
        loader = get_loader(
            XPDataset(states, rewards_to_go, actions, next_states))
        train_data = train(loader)
        latest_rewards.append(ep_rewards)

        with open('runs/{}_{}.csv'.format(out_name, seed), 'a') as f:
            f.write('{}\n'.format(ep_rewards))
        pbar.update(1)
        if episode % 10 == 0:
            pbar.set_description('Mean R{:.2f}'.format(
                np.mean(latest_rewards)))

    pbar.close()
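The loop above calls a discount_rewards helper that is not shown; a minimal sketch of such a helper, assuming the usual reward-to-go computation with a discount factor gamma (both the signature and the default gamma are assumptions):

import numpy as np

# Hypothetical helper (not from the example above): standard discounted
# reward-to-go, computed backwards over the episode.
def discount_rewards(rewards, gamma=0.99):
    rewards_to_go = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rewards_to_go[t] = running
    return rewards_to_go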
Example #5
class PolicyNetwork(nn.Module):
    def __init__(self,
                 observation_shape,
                 goal_shape,
                 output_shape,
                 action_ranges,
                 include_conv=True):
        super(PolicyNetwork, self).__init__()
        self.action_ranges = action_ranges
        self.include_conv = include_conv
        if include_conv:
            self.conv_layers = ConvModule()

        self.layer_obs = nn.Linear(2048 if include_conv else observation_shape,
                                   200)
        self.layer_goal = nn.Linear(goal_shape, 200)
        self.layer1 = nn.Linear(400, 256)
        self.layer2 = nn.Linear(256, 256)
        self.layer3 = nn.Linear(256, output_shape)

        self.action_scale = (action_ranges[1] - action_ranges[0]) / 2
        self.action_bias = (action_ranges[1] + action_ranges[0]) / 2

        self.noise = Normal(0, 3 * self.action_scale)

    def forward(self, observation, goals):
        if self.include_conv:
            observation = self.conv_layers(observation)
        processed_obs = F.relu(self.layer_obs(observation))
        processed_goal = F.relu(self.layer_goal(goals))
        if len(processed_goal.shape) < len(processed_obs.shape):
            processed_goal = processed_goal[np.newaxis, :]

        out = torch.cat([processed_obs, processed_goal], dim=-1)
        out = F.relu(self.layer1(out))
        out = F.leaky_relu(self.layer2(out))
        action = self.layer3(out)

        return action

    def sample(self, observations, goals, noise=True, evaluate=False):
        action = self.forward(observations, goals)

        if noise:
            action += self.noise.sample(sample_shape=action.shape).to(
                action.device)
            action = torch.tanh(action) * self.action_scale + self.action_bias
        elif evaluate:
            action = torch.tanh(action) * self.action_scale + self.action_bias
        else:
            action = torch.tanh(action)

        return action
Example #6
    def get_action(self, state, eval_deterministic=False):

        mu, sig = self.forward(state)
        if eval_deterministic:
            action = mu
        else:
            gauss = Normal(loc=mu, scale=sig)
            action = gauss.sample()
            action = action.detach()

        action = self.max_action * th.tanh(action / self.max_action)
        return action
Example #7
    def forward(self, state: torch.Tensor):
        x = torch.tanh(self.hidden_one(state))
        x = torch.tanh(self.hidden_two(x))

        mu = torch.tanh(self.mu_layer(x))
        log_std = torch.tanh(self.log_std_layer(x))

        std = torch.exp(log_std)
        dist = Normal(mu, std)
        action = dist.sample()

        return action, dist, mu, std
Example #8
    def evaluate(self, state, epsilon=1e-6):
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)

        return action, log_prob, z, mean, log_std
Example #9
def trainJump(save, save_as=None, curr_checkpoint=None):
    model.train()
    global variance
    for episode in range(num_episodes):
        print("-----------------------------------------")
        print("Episode:", episode)

        # Get state
        state = get_distance()
        prev_score = getScore()
        print("Distance:", state)
        state = np.array([state])
        state = torch.from_numpy(state)
        state = state.float()

        # Calculate mean and variance
        mean = model(state)
        variance = final_variance + (initial_variance - final_variance) * \
                        math.exp(-1. * episode / variance_decay)
        print("Mean:", float(mean), "Deviation:", float(variance))

        # Construct normal distribution based off of mean and variance and sample from it
        m = Normal(mean, variance)
        action = m.sample()

        # Perform action
        print("Action:", action)
        os.system("adb shell input swipe 500 500 500 500 " + str(int(action)))

        # Get reward and optimize model
        time.sleep(0.5)
        reward = getReward(prev_score)
        if reward >= 2:
            reward = 10
        elif reward == 1:
            reward = 1
        elif reward < 0:
            onDeath()
        print("Reward:", reward)
        loss = -m.log_prob(action) * reward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if save:
            if (episode + 1) % 501 == 0:
                save_file = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                file_name = save_as + str((episode // 1000) +
                                          curr_checkpoint) + ".pth"
                torch.save(save_file, file_name)
Example #10
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        #action = z.detach().numpy()

        action = action.detach().numpy()
        return action[0]
Example #11
    def get_action(self, state, deterministic):
        state = torch.FloatTensor(state).unsqueeze(0).cuda()
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(0, 1)
        z = normal.sample().cuda()
        action = self.action_range * torch.tanh(mean + std * z)

        action = self.action_range * torch.tanh(mean).detach().cpu().numpy()[0] if deterministic else \
        action.detach().cpu().numpy()[0]
        return action
Example #12
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self.counter += 1
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # print('self.param_groups: ', self.param_groups)
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            addnoise = group['addnoise']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad
                if weight_decay != 0:
                    d_p = d_p.add(p, alpha=weight_decay)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                if addnoise:
                    size = d_p.size()
                    langevin_noise = Normal(
                        torch.zeros(size),
                        torch.ones(size) / np.sqrt(group['lr'])
                    )
                    # if self.counter == 1:
                    #     print('generate noise from mean 0 and std {0:.3f}'.format(np.sqrt(group['lr'])))
                    p.add_(d_p + langevin_noise.sample().cuda(), alpha=-group['lr'])
                else:
                    p.add_(d_p, alpha=-group['lr'])

        return loss
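A possible way to drive this step method, assuming it lives in an SGD-style optimizer class (the class name SGLD and its constructor arguments are assumptions inferred from the param-group keys read above):

# Hypothetical usage; SGLD, model, criterion, inputs and targets are all
# assumed to be defined elsewhere.
optimizer = SGLD(model.parameters(), lr=1e-2, momentum=0.9, dampening=0.0,
                 weight_decay=5e-4, nesterov=False, addnoise=True)
optimizer.zero_grad()
loss = criterion(model(inputs), targets)
loss.backward()
optimizer.step()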
Example #13
    def step(self, lr=None, add_noise=False):
        """
        Performs a single optimization step.
        """
        loss = None

        for group in self.param_groups:
            if lr:
                group['lr'] = lr
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(p.data)
                    if group['centered']:
                        state['grad_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']
                state['step'] += 1

                # square_avg = alpha * square_avg + (1 - alpha) * d_p * d_p
                square_avg.mul_(alpha).addcmul_(d_p, d_p, value=1 - alpha)

                if group['centered']:
                    grad_avg = state['grad_avg']
                    grad_avg.mul_(alpha).add_(d_p, alpha=1 - alpha)
                    # centered variant: sqrt(E[g^2] - E[g]^2) + eps
                    avg = square_avg.addcmul(grad_avg, grad_avg,
                                             value=-1).sqrt().add_(group['eps'])
                else:
                    avg = square_avg.sqrt().add_(group['eps'])

                if group['addnoise']:
                    size = d_p.size()
                    langevin_noise = Normal(
                        torch.zeros(size).cuda(),
                        (torch.ones(size).cuda()).mul_(
                            group['lr']).div_(avg).sqrt())
                    p.data.add_(d_p.div_(avg) + langevin_noise.sample(),
                                alpha=-group['lr'])
                    #print ("yes, adding noise")
                else:
                    #p.data.add_(-group['lr'], d_p.div_(avg))
                    p.data.addcdiv_(d_p, avg, value=-group['lr'])

        return loss
Example #14
def sample_from_mix_gaussian(y, log_scale_min=-7.0):
    """
    Sample from (discretized) mixture of gaussian distributions
    Args:
        y (Tensor): B x C x T
        log_scale_min (float): Log scale minimum value
    Returns:
        Tensor: sample in range of [-1, 1].
    """
    C = y.size(1)
    if C == 2:
        nr_mix = 1
    else:
        assert y.size(1) % 3 == 0
        nr_mix = y.size(1) // 3

    # B x T x C
    y = y.transpose(1, 2)

    if C == 2:
        logit_probs = None
    else:
        logit_probs = y[:, :, :nr_mix]

    if nr_mix > 1:
        # sample mixture indicator from softmax
        temp = logit_probs.data.new(logit_probs.size()).uniform_(
            1e-5, 1.0 - 1e-5)
        temp = logit_probs.data - torch.log(-torch.log(temp))
        _, argmax = temp.max(dim=-1)

        # (B, T) -> (B, T, nr_mix)
        one_hot = to_one_hot(argmax, nr_mix)

        # Select means and log scales
        means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
        log_scales = torch.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot,
                               dim=-1)
    else:
        if C == 2:
            means, log_scales = y[:, :, 0], y[:, :, 1]
        elif C == 3:
            means, log_scales = y[:, :, 1], y[:, :, 2]
        else:
            assert False, "shouldn't happen"

    scales = torch.exp(log_scales)
    dist = Normal(loc=means, scale=scales)
    x = dist.sample()

    x = torch.clamp(x, min=-1.0, max=1.0)
    return x
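A small usage sketch for the mixture sampler above; the shapes follow the docstring (B x C x T with C = 3 * nr_mix), while the batch size, sequence length, and number of mixtures are arbitrary assumptions:

# y packs [logit_probs | means | log_scales] along the channel axis.
B, T, nr_mix = 4, 100, 10
y = torch.randn(B, 3 * nr_mix, T)
x = sample_from_mix_gaussian(y)  # shape (B, T), values clamped to [-1, 1]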
Example #15
    def get_action(self, state, deterministic):
        with torch.no_grad():
            state = torch.from_numpy(state.astype(np.float32)).to(device)
            mean, log_std = self.forward(state)
            std = log_std.exp()

            normal = Normal(0, 1)
            z = normal.sample(std.size()).to(device)
            action = self.action_range * torch.tanh(mean + std * z)

            action = ((self.action_range * torch.tanh(mean))
                      if deterministic else action).cpu().numpy()
            return action
Example #16
    def choose_action(self, state):
        action, _ = super().choose_action(state)

        # std may be negative; Normal handles it
        m = Normal(action[:, 0], action[:, 1])
        a = m.sample()
        a = a.numpy()
        a = np.clip(a, -1, 1)
        a = a * self.max_actions

        action = action.cpu().data.numpy()

        return action, a
Example #17
    def get_action(self, state, device):
        """ Method that uses PolicyNetwork weights to determine what action to take by agent
        based on current environment state. """
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        return action[0]
Example #18
    def get_action(self, state):
        """
        Returns an action given a state
        """
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        return action[0]
Example #19
    def get_action(self, state, test=False):

        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        mean, log_std = self.forward(state)
        std = log_std.exp()
        if test: std = 0

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        return action[0]
Example #20
    def act(self, state):  # this is like the inference state
        x = torch.from_numpy(state).float()
        pdparam = self.forward(x)

        # run network
        v_loc, v_scale = pdparam[0], abs(pdparam[1])
        turn_loc, turn_scale = pdparam[2], abs(pdparam[3])
        v_pd = Normal(loc=v_loc, scale=v_scale)
        turn_pd = Normal(loc=turn_loc, scale=turn_scale)

        # sample velocity and turn angle
        new_v = v_pd.sample()
        new_turn = turn_pd.sample()

        v_prob = v_pd.log_prob(
            new_v)  # a perfect certainty will have 0 log prob
        turn_prob = turn_pd.log_prob(new_turn)

        self.v_log_probs.append(v_prob)
        self.turn_log_probs.append(turn_prob)

        return new_v.item(), new_turn.item()
Example #21
 def select_action(self, state):
     state = torch.FloatTensor(state).cuda().unsqueeze(0)
     # print('state : ', state)
     with torch.no_grad():
         mean, std = self.mlp_policy(state)
     # print('mean, std', mean.shape, std.shape, mean, std)
     dist = Normal(mean, std)
     action = dist.sample()
     # action_log_prob = dist.log_prob(action)
     action = action.clamp(-1, 1)
     # print(action_log_prob.shape, action.shape)
     # print(action)
     return action.cpu().squeeze().numpy()
Example #22
    def sampleAction(self, obs):
        """
        Sample action from Normal or Categorical distribution
        (continuous vs discrete actions) and return the log probability
        + policy parameters for regularization
        :param obs: (th.Tensor)
        :return: (tuple(th.Tensor))
        """
        if self.continuous_actions:
            mean_policy, log_std = self.policy_net(obs)
            # Clip the value of the standard deviation
            log_std = th.clamp(log_std, self.log_std_min, self.log_std_max)
            std = th.exp(log_std)
            distribution = Normal(mean_policy, std)
            # Used only during testing
            if self.deterministic:
                pre_tanh_value = mean_policy
            else:
                pre_tanh_value = distribution.sample().detach()
            # Squash the value
            action = F.tanh(pre_tanh_value)
            # Correction to the log prob because of the squashing function
            epsilon = 1e-6
            log_pi = distribution.log_prob(pre_tanh_value) - th.log(1 - action ** 2 + epsilon)
            log_pi = log_pi.sum(-1, keepdim=True)
        else:
            mean_policy, log_std = self.policy_net(obs)
            # Here mean policy is the energy of each action
            distribution = Categorical(logits=mean_policy)
            if self.deterministic:
                action = th.argmax(F.softmax(mean_policy, dim=1), dim=1)
            else:
                action = distribution.sample().detach()
            # Only valid for continuous actions
            pre_tanh_value = action * 0.0
            log_std = log_std * 0.0
            log_pi = distribution.log_prob(action).unsqueeze(1)

        return action, log_pi, pre_tanh_value, mean_policy, log_std
Example #23
 def get_action(self, state, last_action, hidden_in, noise_scale=0.0):
     '''
     select action for sampling, no gradients flow, noisy action, return .cpu
     '''
     state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).cuda() # increase 2 dims to match with training data
     last_action = torch.FloatTensor(last_action).unsqueeze(0).unsqueeze(0).cuda()
     action, hidden_out = self.forward(state, last_action, hidden_in)
     action = action.detach().cpu().numpy()[0][0]
     ''' add noise '''
     normal = Normal(0, 1)
     noise = noise_scale * normal.sample(action.shape)
     action=self.action_range*action + noise.numpy()
     return action , hidden_out
Example #24
    def get_action(self, state, last_action, hidden_in, deterministic=True):
        state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).cuda()  # increase 2 dims to match with training data
        last_action = torch.FloatTensor(last_action).unsqueeze(0).unsqueeze(0).cuda()
        mean, log_std, hidden_out = self.forward(state, last_action, hidden_in)
        std = log_std.exp()
        
        normal = Normal(0, 1)
        z = normal.sample().cuda()
        action = self.action_range * torch.tanh(mean + std * z)

        action = self.action_range * torch.tanh(mean).detach().cpu().numpy() if deterministic else \
        action.detach().cpu().numpy()
        return action[0][0], hidden_out
Example #25
 def get_action(self, state):
     """
     returns the action based on a squashed gaussian policy. That means the samples are obtained according to:
     a(s,e) = tanh(mu(s) + sigma(s)*e)
     """
     #state = torch.FloatTensor(state).to(device) #.unsqzeeze(0)
     mu, log_std = self.forward(state)
     std = log_std.exp()
     dist = Normal(0, 1)
     e = dist.sample().to(device)
     action = torch.tanh(mu + e * std).cpu()
     #action = torch.clamp(action*action_high, action_low, action_high)
     return action[0]
Example #26
    def get_action(self, state, noise_scale=0.0):
        '''
        select action for sampling, no gradients flow, noisy action, return .cpu
        '''
        state = torch.FloatTensor(state).unsqueeze(0).cuda() # state dim: (N, dim of state)
        action = self.forward(state)
        action = action.detach().cpu().numpy()[0] 
        ''' add noise '''
        normal = Normal(0, 1)
        noise = noise_scale * normal.sample(action.shape)
        action=self.action_range*action + noise.numpy()

        return action
Example #27
    def evaluate(self, state, eval_noise_scale, epsilon=1e-6):
        '''
        generate action with state as input wrt the policy network, for calculating gradients
        '''
        action = self.forward(state)
        ''' add noise '''
        normal = Normal(0, 1)
        eval_noise_clip = 2 * eval_noise_scale
        noise = normal.sample(action.shape) * eval_noise_scale
        noise = torch.clamp(noise, -eval_noise_clip, eval_noise_clip)
        action = self.action_range * action + noise.cuda()

        return action
Example #28
    def act(self, state, std_scale, memory):
        state = torch.from_numpy(state).float().to(device)
        action_probs = self.action_layer(state)
        dist = Normal(loc=action_probs, scale=std_scale)
        action = dist.sample()

        action = action

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))

        return action.detach().numpy()
Example #29
    def forward(self, x):
        """
        Forward method implementation.
        x (torch.Tensor)
        :return: action (torch.Tensor) and dist
        """
        mu, _, std = self.get_dist_params(x)

        # get normal distribution and action
        dist = Normal(mu, std)
        action = dist.sample()

        return action, dist
Example #30
 def evaluate(self, state):
     batch_mu, batch_log_sigma = self.policy_net(state)
     batch_sigma = torch.exp(batch_log_sigma)
     dist = Normal(batch_mu, batch_sigma)
     noise = Normal(0, 1)
     z = noise.sample()
     # get the action
     action = torch.tanh(batch_mu + batch_sigma * z.to(self.device))
     # the latter half is the correction term for the tanh squashing
     log_prob = dist.log_prob(batch_mu +
                              batch_sigma * z.to(self.device)) - torch.log(
                                  1 - action.pow(2) + self.min_Val)
     return action * self.max_action, log_prob
Example #31
D.train()
for epoch in range(NUM_EPOCHS):
    total_gen_loss = 0
    total_disc_loss = 0
    total = 0
    for img, label in train_loader:
        if img.size(0) < BATCH_SIZE: continue
        img = V(img).cuda()
        # Grad discriminator real: -E[log(D(x))]
        optim_disc.zero_grad()
        optim_gen.zero_grad()
        d = D(img)
        loss_a = -d.log().mean()
        loss_a.backward()
        # Grad discriminator fake: -E[log(1 - D(G(z)) )]
        seed = seed_distribution.sample()
        x_fake = G(seed)
        d = D(x_fake.detach())
        loss_b = -(1 - d + 1e-10).log().mean()
        loss_b.backward()
        optim_disc.step()
        total_disc_loss += loss_a.item() + loss_b.item()
        # Grad generator (non-saturating): -E[log(D(G(z)))]
        # optim_disc.zero_grad()
        d = D(x_fake) # no detach here
        # loss_c = (1 - d + 1e-10).log().mean()
        loss_c = -(d + 1e-10).log().mean()
        loss_c.backward()
        optim_gen.step()
        total_gen_loss += loss_c.item()
        total += 1
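The loop assumes G, D, train_loader, the optimizers, and seed_distribution are defined elsewhere; a plausible definition of the latent prior used by seed_distribution.sample(), with LATENT_DIM as an assumed generator input size:

# Hypothetical latent prior: a standard Normal whose sample() returns a
# (BATCH_SIZE, LATENT_DIM) batch of generator inputs.
LATENT_DIM = 100
seed_distribution = Normal(
    torch.zeros(BATCH_SIZE, LATENT_DIM).cuda(),
    torch.ones(BATCH_SIZE, LATENT_DIM).cuda(),
)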