def train_worker(args, shared_model, total_steps, optimizer, lock):
    """A3C worker loop: sync with the shared model, collect a short rollout,
    compute GAE-based actor-critic losses, and push gradients to the shared
    model under `lock`.

    Args:
        args: config namespace; `args.train` holds the training hyperparameters.
        shared_model: globally shared ActorCritic whose weights/grads are updated.
        total_steps: multiprocessing counter of environment steps across workers.
        optimizer: shared optimizer stepping `shared_model`'s parameters.
        lock: guards the gradient hand-off and optimizer step.
    """
    env = make_env(args)
    args = args.train
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.train()
    state = torch.FloatTensor(env.reset())
    while True:
        # Start every rollout from the freshest shared weights, and cut the
        # recurrent hidden state off from the previous rollout's graph.
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values, log_probs, rewards, entropies = [], [], [], []
        for _ in range(args.update_agent_frequency):
            v, logits = model(state.unsqueeze(0))
            probs = F.softmax(logits, dim=-1)
            logp_all = F.log_softmax(logits, dim=-1)
            entropies.append(-(logp_all * probs).sum(1, keepdim=True))

            # Sample an action; detach so the sample is treated as a constant.
            action = probs.multinomial(num_samples=1).detach()
            logp = logp_all.gather(1, action)

            state, r, done, _ = env.step(action.numpy())
            with total_steps.get_lock():
                total_steps.value += 1
            if done:
                state = env.reset()
                model.reset_hidden()
            state = torch.FloatTensor(state)

            values.append(v)
            log_probs.append(logp)
            rewards.append(r)
            if done:
                break

        # Bootstrap the return from the critic unless the episode terminated.
        ret = torch.zeros(1, 1)
        if not done:
            v, _ = model(state.unsqueeze(0))
            ret = v.detach()
        values.append(ret)

        policy_loss, value_loss = 0, 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            ret = args.gamma * ret + rewards[i]
            advantage = ret - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation: accumulate discounted
            # one-step TD residuals.
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = (policy_loss
                           - log_probs[i] * gae.detach()
                           - args.entropy_weight * entropies[i])

        optimizer.zero_grad()
        (policy_loss + args.value_weight * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
def train_worker(args, shared_model, total_steps, optimizer, lock):
    """UNREAL-style A3C worker: A3C rollouts plus optional auxiliary tasks
    (pixel control, reward prediction, value replay) and an intrinsic
    curiosity bonus added to the return.

    Args:
        args: config namespace; `args.environment` configures the env and
            `args.train` holds the training hyperparameters.
        shared_model: globally shared (wrapped) ActorCritic being trained.
        total_steps: multiprocessing counter of environment steps across workers.
        optimizer: shared optimizer stepping `shared_model`'s parameters.
        lock: guards the gradient hand-off and optimizer step.
    """
    env = make_env(args.environment)
    args = args.train
    # Optional per-worker hyperparameter sampling (log-uniform).
    if args.sample_entropy:
        args.entropy_weight = np.exp(
            np.random.uniform(np.log(0.0005), np.log(0.01)))
    if args.sample_lr:
        args.learning_rate = np.exp(
            np.random.uniform(np.log(0.0001), np.log(0.005)))

    # Stack auxiliary-task wrappers around the base model as configured.
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if args.use_pixel_control or args.use_reward_prediction:
        model = ExperienceWrapper(model)
    if args.use_pixel_control:
        model = PixelControlWrapper(model, args.gamma, args.pc_coef)
    if args.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.rp_coef)
    if args.use_value_replay:
        model = ValueReplayWrapper(model)
    model.train()

    # Curiosity module is trained locally by this worker (not shared).
    curiosity_rewarder = CuriosityRewarder(env.observation_space.shape,
                                           env.action_space.n)
    curiosity_rewarder.train()
    curiosity_optimizer = optim.Adam(curiosity_rewarder.parameters())

    state = torch.FloatTensor(env.reset())
    last_act = 0
    sum_reward = 0
    last_reward = 0
    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()
        values = []
        log_probs = []
        rewards = []
        curiosity_rewards = []
        entropies = []
        for step in range(args.update_agent_frequency):
            # Model input is (observation, previous action, cumulative reward).
            value, logit = model((state.unsqueeze(0), last_act, sum_reward))
            # Softmax output is already non-negative, so the F.relu() that
            # used to wrap it was a no-op and has been removed.
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            act = action.numpy()[0][0]
            next_state, reward, done, _ = env.step(act)

            # Feed the transition to the experience buffer used by the
            # auxiliary tasks.
            if (args.use_pixel_control or args.use_reward_prediction
                    or args.use_value_replay):
                tr = Transaction(state, next_state, act, reward, done,
                                 last_act, last_reward, sum_reward)
                model.add_frame(tr)
            last_reward = reward
            last_act = act
            sum_reward += reward

            with total_steps.get_lock():
                total_steps.value += 1
            if done:
                sum_reward = 0
                last_act = 0
                last_reward = 0
                next_state = env.reset()
                model.reset_hidden()
            next_state = torch.FloatTensor(next_state)
            # NOTE(review): when done, this predicts across the episode
            # boundary (state -> reset state) — confirm that is intended.
            curiosity_reward = curiosity_rewarder.get_reward(
                state.unsqueeze(0), action, next_state.unsqueeze(0))
            state = next_state
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            curiosity_rewards.append(curiosity_reward)
            if done:
                break

        # Bootstrap the return from the critic unless the episode ended.
        R = torch.zeros(1, 1)
        if not done:
            value, _ = model((state.unsqueeze(0), last_act, sum_reward))
            R = value.detach()
        values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            # Intrinsic curiosity bonus is added to the return target
            # (detached: the rewarder is trained separately below).
            R = (args.gamma * R + rewards[i]
                 + args.curiosity_weight * curiosity_rewards[i].detach())
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation.
            # NOTE(review): delta_t omits the curiosity bonus that R above
            # includes, so the policy gradient ignores intrinsic reward —
            # confirm this asymmetry is intentional.
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = (policy_loss - log_probs[i] * gae.detach()
                           - args.entropy_weight * entropies[i])

        # Train the curiosity model to drive its prediction error to zero.
        curiosity_optimizer.zero_grad()
        curiosity_loss = (sum(cr ** 2 for cr in curiosity_rewards)
                          / len(curiosity_rewards))
        curiosity_loss.backward()
        curiosity_optimizer.step()

        optimizer.zero_grad()
        # model.get_loss() aggregates the auxiliary-task losses from the wrappers.
        (policy_loss + args.value_weight * value_loss
         + model.get_loss()).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
        model.reset()