def choose_action(self, state, last_action, hidden_in):
    state = torch.Tensor(state).float().unsqueeze(0).unsqueeze(0).to(self.device)
    last_action = torch.Tensor(last_action).unsqueeze(0).unsqueeze(0).to(self.device)
    mu, log_std, hidden_out = self.actor(state, last_action, hidden_in)
    std = torch.exp(log_std)
    m = Normal(mu, std)
    action_val = m.sample()
    action = torch.tanh(action_val).detach().cpu().numpy()
    return action[0][0], hidden_out
def evaluate(self, state, epsilon=1e-6):
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(mean, std)
    z = Normal(0, 1).sample()
    pre_tanh = mean + std * z.to(self.device)
    action = torch.tanh(pre_tanh)
    # Second term is the tanh change-of-variables correction to the log-density.
    log_prob = normal.log_prob(pre_tanh) - torch.log(1 - action.pow(2) + epsilon)
    return action, log_prob
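# A standalone sketch (not from the source) of the tanh change-of-variables
# correction that evaluate() above applies: for a = tanh(z), z ~ N(mu, std),
# log p(a) = log N(z; mu, std) - log(1 - tanh(z)^2). Shapes and the use of
# rsample() here are illustrative assumptions.
import torch
from torch.distributions import Normal

mu, log_std = torch.zeros(4), torch.zeros(4)
dist = Normal(mu, log_std.exp())
z = dist.rsample()  # reparameterized sample, keeps gradients w.r.t. mu/std
a = torch.tanh(z)
log_prob = (dist.log_prob(z) - torch.log(1 - a.pow(2) + 1e-6)).sum(-1, keepdim=True)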
def forward(self, x, action_taken):
    policy = Normal(self.mu(x), self.log_std.exp())
    # Sample the action from the policy.
    pi = policy.sample()
    # Sum over the actions.
    logp_pi = policy.log_prob(pi).sum(dim=1)
    if action_taken is not None:
        logp = policy.log_prob(action_taken).sum(dim=1)
    else:
        logp = None
    return pi, logp, logp_pi
def train_agent(episodes, seed, out_name):
    np.random.seed(seed)
    torch.manual_seed(seed)
    latest_rewards = deque(maxlen=20)
    track_rewards = deque(maxlen=episodes)
    with open('runs/{}_{}.csv'.format(out_name, seed), 'w') as f:
        f.write('{}_{}\n'.format(out_name, seed))
    pbar = tqdm(total=episodes)
    for episode in range(episodes):
        s = env.reset()
        done = False
        states, actions, rewards, next_states = [], [], [], []
        ep_rewards = 0.
        while not done:
            with torch.no_grad():
                mean, std = policy(torch.tensor(s).float().reshape(1, -1))
                dist = Normal(mean, std)
                a = dist.sample().numpy().flatten()
            ns, r, done, _ = env.step(a * max_action)
            states.append(s)
            rewards.append(r)
            actions.append(a)
            next_states.append(ns)
            s = ns
            ep_rewards += r
            track_rewards.append(r)
        # Normalize rewards with running statistics before computing returns.
        rewards = (np.array(rewards) - np.mean(track_rewards)) / np.std(track_rewards)
        rewards_to_go = discount_rewards(rewards)
        loader = get_loader(XPDataset(states, rewards_to_go, actions, next_states))
        train_data = train(loader)
        latest_rewards.append(ep_rewards)
        with open('runs/{}_{}.csv'.format(out_name, seed), 'a') as f:
            f.write('{}\n'.format(ep_rewards))
        pbar.update(1)
        if episode % 10 == 0:  # was `episodes % 10`, which never changes across the loop
            pbar.set_description('Mean R{:.2f}'.format(np.mean(latest_rewards)))
    pbar.close()
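# train_agent() above calls discount_rewards() without defining it; a minimal
# sketch, assuming standard discounted rewards-to-go with an illustrative
# gamma=0.99 (the actual helper and discount factor are not shown in the source).
import numpy as np

def discount_rewards(rewards, gamma=0.99):
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # R_t = r_t + gamma * R_{t+1}
        out[t] = running
    return out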
class PolicyNetwork(nn.Module):
    def __init__(self, observation_shape, goal_shape, output_shape,
                 action_ranges, include_conv=True):
        super(PolicyNetwork, self).__init__()
        self.action_ranges = action_ranges
        self.include_conv = include_conv
        if include_conv:
            self.conv_layers = ConvModule()
        self.layer_obs = nn.Linear(2048 if include_conv else observation_shape, 200)
        self.layer_goal = nn.Linear(goal_shape, 200)
        self.layer1 = nn.Linear(400, 256)
        self.layer2 = nn.Linear(256, 256)
        self.layer3 = nn.Linear(256, output_shape)
        self.action_scale = (action_ranges[1] - action_ranges[0]) / 2
        self.action_bias = (action_ranges[1] + action_ranges[0]) / 2
        self.noise = Normal(0, 3 * self.action_scale)

    def forward(self, observation, goals):
        if self.include_conv:
            observation = self.conv_layers(observation)
        processed_obs = F.relu(self.layer_obs(observation))
        processed_goal = F.relu(self.layer_goal(goals))
        if len(processed_goal.shape) < len(processed_obs.shape):
            processed_goal = processed_goal[np.newaxis, :]
        out = torch.cat([processed_obs, processed_goal], dim=-1)
        out = F.relu(self.layer1(out))
        out = F.leaky_relu(self.layer2(out))
        action = self.layer3(out)
        return action

    def sample(self, observations, goals, noise=True, evaluate=False):
        action = self.forward(observations, goals)
        if noise:
            action += self.noise.sample(sample_shape=action.shape).to(action.device)
            action = torch.tanh(action) * self.action_scale + self.action_bias
        elif evaluate:
            action = torch.tanh(action) * self.action_scale + self.action_bias
        else:
            action = torch.tanh(action)
        return action
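# A quick standalone check (bounds are illustrative, not from the source) of
# the scale/bias mapping used by PolicyNetwork.sample(): tanh output in
# [-1, 1] is mapped affinely onto [low, high].
import torch

low, high = -2.0, 2.0  # assumed action_ranges
scale, bias = (high - low) / 2, (high + low) / 2
a = torch.tanh(torch.randn(1000)) * scale + bias
assert a.min() >= low and a.max() <= high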
def get_action(self, state, eval_deterministic=False):
    mu, sig = self.forward(state)
    if eval_deterministic:
        action = mu
    else:
        gauss = Normal(loc=mu, scale=sig)
        action = gauss.sample()
        action = action.detach()  # was a no-op: `action.detach()` without assignment
    action = self.max_action * th.tanh(action / self.max_action)
    return action
def forward(self, state: torch.Tensor):
    x = torch.tanh(self.hidden_one(state))
    x = torch.tanh(self.hidden_two(x))
    mu = torch.tanh(self.mu_layer(x))
    log_std = torch.tanh(self.log_std_layer(x))
    std = torch.exp(log_std)
    dist = Normal(mu, std)
    action = dist.sample()
    return action, dist, mu, std
def evaluate(self, state, epsilon=1e-6):
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(mean, std)
    z = normal.sample()
    action = torch.tanh(z)
    log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
    log_prob = log_prob.sum(-1, keepdim=True)
    return action, log_prob, z, mean, log_std
def trainJump(save, save_as=None, curr_checkpoint=None):
    model.train()
    global variance
    for episode in range(num_episodes):
        print("-----------------------------------------")
        print("Episode:", episode)
        # Get state
        state = get_distance()
        prev_score = getScore()
        print("Distance:", state)
        state = np.array([state])
        state = torch.from_numpy(state)
        state = state.float()
        # Calculate mean and the exploration scale (annealed over episodes).
        mean = model(state)
        variance = final_variance + (initial_variance - final_variance) * \
            math.exp(-1. * episode / variance_decay)
        print("Mean:", float(mean), "Deviation:", float(variance))
        # Build a normal distribution from the mean and scale and sample from it.
        # Note: Normal's second argument is the standard deviation, not the variance.
        m = Normal(mean, variance)
        action = m.sample()
        # Perform action
        print("Action:", action)
        os.system("adb shell input swipe 500 500 500 500 " + str(int(action)))
        # Get reward and optimize model
        time.sleep(0.5)
        reward = getReward(prev_score)
        if reward >= 2:
            reward = 10
        elif reward == 1:
            reward = 1
        elif reward < 0:
            onDeath()
        print("Reward:", reward)
        # REINFORCE update: scale the negative log-probability by the reward.
        loss = -m.log_prob(action) * reward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if save:
            if (episode + 1) % 501 == 0:
                save_file = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                file_name = save_as + str((episode // 1000) + curr_checkpoint) + ".pth"
                torch.save(save_file, file_name)
def get_action(self, state):
    state = torch.FloatTensor(state).unsqueeze(0)
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(mean, std)
    z = normal.sample()
    action = torch.tanh(z)
    action = action.detach().numpy()
    return action[0]
def get_action(self, state, deterministic):
    state = torch.FloatTensor(state).unsqueeze(0).cuda()
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(0, 1)
    z = normal.sample().cuda()
    action = self.action_range * torch.tanh(mean + std * z)
    action = self.action_range * torch.tanh(mean).detach().cpu().numpy()[0] if deterministic \
        else action.detach().cpu().numpy()[0]
    return action
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    self.counter += 1
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()
    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']
        addnoise = group['addnoise']
        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad
            if weight_decay != 0:
                d_p = d_p.add(p, alpha=weight_decay)
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                if nesterov:
                    d_p = d_p.add(buf, alpha=momentum)
                else:
                    d_p = buf
            if addnoise:
                # Langevin noise with std 1/sqrt(lr); after the update scales
                # it by lr, the injected step noise has std sqrt(lr).
                size = d_p.size()
                langevin_noise = Normal(
                    torch.zeros(size),
                    torch.ones(size) / np.sqrt(group['lr']))
                p.add_(d_p + langevin_noise.sample().cuda(), alpha=-group['lr'])
            else:
                p.add_(d_p, alpha=-group['lr'])
    return loss
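# A quick numerical check (the lr value is illustrative) of the noise scale in
# step() above: sampling N(0, 1/sqrt(lr)) and scaling by lr gives injected
# noise with std sqrt(lr), the usual SGLD scaling up to constants.
import torch
from torch.distributions import Normal

lr = 0.01
noise = Normal(torch.zeros(100000), torch.ones(100000) / lr ** 0.5).sample()
print((lr * noise).std())  # ~ sqrt(lr) = 0.1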
def step(self, lr=None, add_noise=False):
    """Performs a single optimization step."""
    loss = None
    for group in self.param_groups:
        if lr:
            group['lr'] = lr
        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            state = self.state[p]
            if len(state) == 0:
                state['step'] = 0
                state['square_avg'] = torch.zeros_like(p.data)
                if group['centered']:
                    state['grad_avg'] = torch.zeros_like(p.data)
            square_avg = state['square_avg']
            alpha = group['alpha']
            state['step'] += 1
            # square_avg = alpha * square_avg + (1 - alpha) * d_p * d_p
            square_avg.mul_(alpha).addcmul_(d_p, d_p, value=1 - alpha)
            if group['centered']:
                grad_avg = state['grad_avg']
                grad_avg.mul_(alpha).add_(d_p, alpha=1 - alpha)
                # Centered second moment: sqrt(E[g^2] - E[g]^2) + eps.
                # (The original called the nonexistent Tensor.cmul; addcmul is
                # the intended op.)
                avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(group['eps'])
            else:
                avg = square_avg.sqrt().add_(group['eps'])
            if group['addnoise']:
                # Preconditioned Langevin noise with variance lr / avg.
                size = d_p.size()
                langevin_noise = Normal(
                    torch.zeros(size).cuda(),
                    torch.ones(size).cuda().mul_(group['lr']).div_(avg).sqrt())
                p.data.add_(d_p.div_(avg) + langevin_noise.sample(), alpha=-group['lr'])
            else:
                p.data.addcdiv_(d_p, avg, value=-group['lr'])
    return loss
def sample_from_mix_gaussian(y, log_scale_min=-7.0):
    """Sample from a (discretized) mixture of Gaussian distributions.

    Args:
        y (Tensor): B x C x T
        log_scale_min (float): Log scale minimum value

    Returns:
        Tensor: sample in range of [-1, 1].
    """
    C = y.size(1)
    if C == 2:
        nr_mix = 1
    else:
        assert y.size(1) % 3 == 0
        nr_mix = y.size(1) // 3

    # B x T x C
    y = y.transpose(1, 2)

    if C == 2:
        logit_probs = None
    else:
        logit_probs = y[:, :, :nr_mix]

    if nr_mix > 1:
        # Sample the mixture indicator via the Gumbel-max trick
        # (argmax of logits plus Gumbel noise ~ softmax sampling).
        temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
        temp = logit_probs.data - torch.log(-torch.log(temp))
        _, argmax = temp.max(dim=-1)

        # (B, T) -> (B, T, nr_mix)
        one_hot = to_one_hot(argmax, nr_mix)

        # Select means and log scales of the chosen component.
        means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
        log_scales = torch.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1)
    else:
        if C == 2:
            means, log_scales = y[:, :, 0], y[:, :, 1]
        elif C == 3:
            means, log_scales = y[:, :, 1], y[:, :, 2]
        else:
            assert False, "shouldn't happen"

    scales = torch.exp(log_scales)
    dist = Normal(loc=means, scale=scales)
    x = dist.sample()

    x = torch.clamp(x, min=-1.0, max=1.0)
    return x
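# A standalone sketch of the Gumbel-max trick used above to pick a mixture
# component: argmax(logits - log(-log(u))) with u ~ Uniform(0, 1) is an exact
# sample from softmax(logits). The logits here are illustrative.
import torch

logits = torch.tensor([0.5, 1.5, -0.2])
u = torch.rand(100000, 3).clamp(1e-5, 1 - 1e-5)
samples = (logits - torch.log(-torch.log(u))).argmax(dim=-1)
print(torch.bincount(samples).float() / 100000)  # ≈ softmax(logits)
print(torch.softmax(logits, dim=0))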
def get_action(self, state, deterministic):
    with torch.no_grad():
        state = torch.from_numpy(state.astype(np.float32)).to(device)
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(0, 1)
        z = normal.sample(std.size()).to(device)
        action = self.action_range * torch.tanh(mean + std * z)
        action = ((self.action_range * torch.tanh(mean)) if deterministic else action).cpu().numpy()
    return action
def choose_action(self, state):
    action, _ = super().choose_action(state)
    # Note: Normal requires scale > 0, so action[:, 1] must already be
    # positive (e.g. via exp or softplus) before it is used as the std here.
    m = Normal(action[:, 0], action[:, 1])
    a = m.sample()
    a = a.numpy()
    a = np.clip(a, -1, 1)
    a = a * self.max_actions
    action = action.cpu().data.numpy()
    return action, a
def get_action(self, state, device):
    """
    Method that uses PolicyNetwork weights to determine what action to take
    by the agent based on the current environment state.
    """
    state = torch.FloatTensor(state).unsqueeze(0).to(device)
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(mean, std)
    z = normal.sample()
    action = torch.tanh(z)
    action = action.detach().cpu().numpy()
    return action[0]
def get_action(self, state): """ Returns an action given a state """ mean, log_std = self.forward(state) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.detach().cpu().numpy() return action[0]
def get_action(self, state, test=False):
    state = torch.FloatTensor(state).unsqueeze(0).to(device)
    mean, log_std = self.forward(state)
    std = log_std.exp()
    # Normal requires scale > 0, so take the mean directly when testing
    # instead of constructing Normal(mean, 0) as the original did.
    z = mean if test else Normal(mean, std).sample()
    action = torch.tanh(z)
    action = action.detach().cpu().numpy()
    return action[0]
def act(self, state):
    # This is like the inference state.
    x = torch.from_numpy(state).float()
    pdparam = self.forward(x)  # run network
    v_loc, v_scale = pdparam[0], abs(pdparam[1])
    turn_loc, turn_scale = pdparam[2], abs(pdparam[3])
    v_pd = Normal(loc=v_loc, scale=v_scale)
    turn_pd = Normal(loc=turn_loc, scale=turn_scale)
    # Sample velocity and turn angle.
    new_v = v_pd.sample()
    new_turn = turn_pd.sample()
    v_prob = v_pd.log_prob(new_v)  # a perfect certainty will have 0 log prob
    turn_prob = turn_pd.log_prob(new_turn)
    self.v_log_probs.append(v_prob)
    self.turn_log_probs.append(turn_prob)
    return new_v.item(), new_turn.item()
def select_action(self, state):
    state = torch.FloatTensor(state).cuda().unsqueeze(0)
    with torch.no_grad():
        mean, std = self.mlp_policy(state)
        dist = Normal(mean, std)
        action = dist.sample()
        action = action.clamp(-1, 1)
    return action.cpu().squeeze().numpy()
def sampleAction(self, obs):
    """
    Sample action from Normal or Categorical distribution
    (continuous vs discrete actions) and return the log probability
    + policy parameters for regularization

    :param obs: (th.Tensor)
    :return: (tuple(th.Tensor))
    """
    if self.continuous_actions:
        mean_policy, log_std = self.policy_net(obs)
        # Clip the value of the standard deviation
        log_std = th.clamp(log_std, self.log_std_min, self.log_std_max)
        std = th.exp(log_std)
        distribution = Normal(mean_policy, std)
        # Used only during testing
        if self.deterministic:
            pre_tanh_value = mean_policy
        else:
            pre_tanh_value = distribution.sample().detach()
        # Squash the value (th.tanh; F.tanh is deprecated)
        action = th.tanh(pre_tanh_value)
        # Correction to the log prob because of the squashing function
        epsilon = 1e-6
        log_pi = distribution.log_prob(pre_tanh_value) - th.log(1 - action ** 2 + epsilon)
        log_pi = log_pi.sum(-1, keepdim=True)
    else:
        mean_policy, log_std = self.policy_net(obs)
        # Here mean_policy is the energy of each action
        distribution = Categorical(logits=mean_policy)
        if self.deterministic:
            action = th.argmax(F.softmax(mean_policy, dim=1), dim=1)
        else:
            action = distribution.sample().detach()
        # Only valid for continuous actions
        pre_tanh_value = action * 0.0
        log_std = log_std * 0.0
        log_pi = distribution.log_prob(action).unsqueeze(1)
    return action, log_pi, pre_tanh_value, mean_policy, log_std
def get_action(self, state, last_action, hidden_in, noise_scale=0.0):
    '''select action for sampling, no gradients flow, noisy action, return .cpu'''
    state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).cuda()  # increase 2 dims to match training data
    last_action = torch.FloatTensor(last_action).unsqueeze(0).unsqueeze(0).cuda()
    action, hidden_out = self.forward(state, last_action, hidden_in)
    action = action.detach().cpu().numpy()[0][0]
    # Add exploration noise.
    normal = Normal(0, 1)
    noise = noise_scale * normal.sample(action.shape)
    action = self.action_range * action + noise.numpy()
    return action, hidden_out
def get_action(self, state, last_action, hidden_in, deterministic=True):
    state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).cuda()  # increase 2 dims to match training data
    last_action = torch.FloatTensor(last_action).unsqueeze(0).unsqueeze(0).cuda()
    mean, log_std, hidden_out = self.forward(state, last_action, hidden_in)
    std = log_std.exp()
    normal = Normal(0, 1)
    z = normal.sample().cuda()
    action = self.action_range * torch.tanh(mean + std * z)
    action = self.action_range * torch.tanh(mean).detach().cpu().numpy() if deterministic \
        else action.detach().cpu().numpy()
    return action[0][0], hidden_out
def get_action(self, state): """ returns the action based on a squashed gaussian policy. That means the samples are obtained according to: a(s,e)= tanh(mu(s)+sigma(s)+e) """ #state = torch.FloatTensor(state).to(device) #.unsqzeeze(0) mu, log_std = self.forward(state) std = log_std.exp() dist = Normal(0, 1) e = dist.sample().to(device) action = torch.tanh(mu + e * std).cpu() #action = torch.clamp(action*action_high, action_low, action_high) return action[0]
def get_action(self, state, noise_scale=0.0):
    '''select action for sampling, no gradients flow, noisy action, return .cpu'''
    state = torch.FloatTensor(state).unsqueeze(0).cuda()  # state dim: (N, dim of state)
    action = self.forward(state)
    action = action.detach().cpu().numpy()[0]
    # Add exploration noise.
    normal = Normal(0, 1)
    noise = noise_scale * normal.sample(action.shape)
    action = self.action_range * action + noise.numpy()
    return action
def evaluate(self, state, eval_noise_scale, epsilon=1e-6):
    '''generate action with state as input wrt the policy network, for calculating gradients'''
    action = self.forward(state)
    # Add clipped target-policy smoothing noise (TD3-style).
    normal = Normal(0, 1)
    eval_noise_clip = 2 * eval_noise_scale
    noise = normal.sample(action.shape) * eval_noise_scale
    noise = torch.clamp(noise, -eval_noise_clip, eval_noise_clip)
    action = self.action_range * action + noise.cuda()
    return action
def act(self, state, std_scale, memory):
    state = torch.from_numpy(state).float().to(device)
    action_probs = self.action_layer(state)
    dist = Normal(loc=action_probs, scale=std_scale)
    action = dist.sample()
    memory.states.append(state)
    memory.actions.append(action)
    memory.logprobs.append(dist.log_prob(action))
    return action.detach().numpy()
def forward(self, x): """ Forward method implementation. x (torch.Tensor) :return: action (torch.Tensor) and dist """ mu, _, std = self.get_dist_params(x) # get normal distribution and action dist = Normal(mu, std) action = dist.sample() return action, dist
def evaluate(self, state):
    batch_mu, batch_log_sigma = self.policy_net(state)
    batch_sigma = torch.exp(batch_log_sigma)
    dist = Normal(batch_mu, batch_sigma)
    noise = Normal(0, 1)
    z = noise.sample()
    # Get the squashed action.
    action = torch.tanh(batch_mu + batch_sigma * z.to(self.device))
    # The second term is the tanh-squashing correction to the log-probability.
    log_prob = dist.log_prob(batch_mu + batch_sigma * z.to(self.device)) - torch.log(
        1 - action.pow(2) + self.min_Val)
    return action * self.max_action, log_prob
D.train()
for epoch in range(NUM_EPOCHS):
    total_gen_loss = 0
    total_disc_loss = 0
    total = 0
    for img, label in train_loader:
        if img.size(0) < BATCH_SIZE:
            continue
        img = V(img).cuda()

        # Discriminator loss on real data: -E[log(D(x))]
        optim_disc.zero_grad()
        optim_gen.zero_grad()
        d = D(img)
        loss_a = -d.log().mean()
        loss_a.backward()

        # Discriminator loss on fake data: -E[log(1 - D(G(z)))]
        seed = seed_distribution.sample()
        x_fake = G(seed)
        d = D(x_fake.detach())
        loss_b = -(1 - d + 1e-10).log().mean()
        loss_b.backward()
        optim_disc.step()
        total_disc_loss += loss_a.item() + loss_b.item()

        # Generator loss, non-saturating form: -E[log(D(G(z)))]
        # (instead of the saturating E[log(1 - D(G(z)))])
        d = D(x_fake)  # no detach here: gradients must reach G
        loss_c = -(d + 1e-10).log().mean()
        loss_c.backward()
        optim_gen.step()
        total_gen_loss += loss_c.item()
        total += 1
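# The loop above assumes a seed_distribution producing latent batches; a
# minimal sketch using a standard normal prior (the BATCH_SIZE/Z_DIM values
# are illustrative assumptions, not from the source).
import torch
from torch.distributions import Normal

BATCH_SIZE, Z_DIM = 64, 100
seed_distribution = Normal(
    torch.zeros(BATCH_SIZE, Z_DIM).cuda(),
    torch.ones(BATCH_SIZE, Z_DIM).cuda())
z = seed_distribution.sample()  # (BATCH_SIZE, Z_DIM) latent batch for G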