def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args)
    args = args.train

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)

    while True:
        # Sync the worker's local model with the shared parameters.
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Roll out up to update_agent_frequency steps before an update.
        for step in range(args.update_agent_frequency):
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            with total_steps.get_lock():
                total_steps.value += 1

            if done:
                state = env.reset()
                model.reset_hidden()
            state = torch.FloatTensor(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the critic unless the episode ended.
        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()
        values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = (policy_loss - log_probs[i] * gae.detach()
                           - args.entropy_weight * entropies[i])

        optimizer.zero_grad()
        (policy_loss + args.value_weight * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        with lock:
            ensure_shared_grads(model, shared_model)
        optimizer.step()
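# `ensure_shared_grads` is used above but not shown in this excerpt.  A
# minimal sketch in the spirit of the common pytorch-a3c helper (assumed,
# not necessarily identical to this repo's version): it aliases the
# worker's gradient tensors onto the shared model, so the shared optimizer
# reads the gradients produced by the worker's backward pass.
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            # Already aliased on a previous update in this process;
            # backward() keeps writing into the same tensors.
            return
        shared_param._grad = param.grad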
)

if __name__ == '__main__':
    cmd_args = parser.parse_args()
    config = Config.fromYamlFile('config.yaml')
    args = config.train
    args.__dict__.update(vars(cmd_args))

    env = make_atari(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape,
                               env.action_space.n)
    if args.pretrained_weights is not None:
        shared_model.load_weights(args.pretrained_weights)
    shared_model.share_memory()

    optimizer = SharedAdam(shared_model.parameters(), lr=args.learning_rate)
    if args.pretrained_weights is not None:
        optimizer.load_params(
            args.pretrained_weights.replace('weights/', 'optimizer_params/'))
    optimizer.share_memory()

    processes = []
    lock = mp.Lock()
    total_steps = Value('i', 0)

    p = mp.Process(target=test_worker,
                   args=(args, shared_model, total_steps, optimizer))
    p.start()
    processes.append(p)
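    # The block above only launches the evaluation process.  A sketch of
    # the usual remainder, assuming an args.num_processes setting for the
    # number of training workers (the attribute name is an assumption; it
    # is not shown in this excerpt):
    for _ in range(args.num_processes):
        p = mp.Process(target=train_worker,
                       args=(args, shared_model, total_steps, optimizer,
                             lock))
        p.start()
        processes.append(p)

    # Block until every worker exits so the shared model stays alive.
    for p in processes:
        p.join()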
        or config.train.use_value_replay):
    shared_model = ExperienceWrapper(shared_model)
if config.train.use_pixel_control:
    shared_model = PixelControlWrapper(shared_model, config.train.gamma,
                                       config.train.pc_coef)
if config.train.use_reward_prediction:
    shared_model = RewardPredictionWrapper(shared_model,
                                           config.train.rp_coef)
if config.train.use_value_replay:
    shared_model = ValueReplayWrapper(shared_model)

if config.train.pretrained_weights is not None:
    shared_model.load_state_dict(
        torch.load(config.train.pretrained_weights))
shared_model.share_memory()

optimizer = SharedAdam(shared_model.parameters(),
                       lr=config.train.learning_rate)
if config.train.pretrained_optimizer is not None:
    optimizer.load_state_dict(torch.load(
        config.train.pretrained_optimizer))
optimizer.share_memory()

processes = []
lock = mp.Lock()
total_steps = Value('i', 0)

p = mp.Process(target=test_worker,
               args=(config, shared_model, total_steps, optimizer))
p.start()
processes.append(p)
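# `SharedAdam` is used in the __main__ blocks above but not defined in this
# excerpt.  A minimal sketch in the spirit of the widely used pytorch-a3c
# optimizer (an assumption; the repo's version may differ): Adam whose
# moment buffers live in shared memory, so every worker process updates the
# same optimizer state.
import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # Create the Adam state up front so it can be moved to shared memory.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # Plain Adam update written against the pre-created shared state.
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = (group['lr'] * math.sqrt(bias_correction2)
                             / bias_correction1)
                p.data.addcdiv_(exp_avg, denom, value=-step_size)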
def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args.environment)
    args = args.train

    # Optionally sample per-worker hyperparameters log-uniformly.
    if args.sample_entropy:
        args.entropy_weight = np.exp(
            np.random.uniform(np.log(0.0005), np.log(0.01)))
    if args.sample_lr:
        args.learning_rate = np.exp(
            np.random.uniform(np.log(0.0001), np.log(0.005)))

    # Wrap the base actor-critic with the enabled auxiliary tasks.
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (args.use_pixel_control or args.use_reward_prediction):
        model = ExperienceWrapper(model)
    if args.use_pixel_control:
        model = PixelControlWrapper(model, args.gamma, args.pc_coef)
    if args.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.rp_coef)
    if args.use_value_replay:
        model = ValueReplayWrapper(model)
    model.train()

    curiosity_rewarder = CuriosityRewarder(env.observation_space.shape,
                                           env.action_space.n)
    curiosity_rewarder.train()
    curiosity_optimizer = optim.Adam(curiosity_rewarder.parameters())

    state = env.reset()
    state = torch.FloatTensor(state)
    last_act = 0
    sum_reward = 0
    last_reward = 0

    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values = []
        log_probs = []
        rewards = []
        curiosity_rewards = []
        entropies = []

        for step in range(args.update_agent_frequency):
            value, logit = model((state.unsqueeze(0), last_act, sum_reward))
            prob = F.relu(F.softmax(logit, dim=-1))
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            act = action.numpy()[0][0]
            next_state, reward, done, _ = env.step(act)

            # Store the transition for the auxiliary-task replay buffer.
            if (args.use_pixel_control or args.use_reward_prediction
                    or args.use_value_replay):
                tr = Transaction(state, next_state, act, reward, done,
                                 last_act, last_reward, sum_reward)
                model.add_frame(tr)

            last_reward = reward
            last_act = act
            sum_reward += reward

            with total_steps.get_lock():
                total_steps.value += 1

            if done:
                sum_reward = 0
                last_act = 0
                last_reward = 0
                next_state = env.reset()
                model.reset_hidden()
            next_state = torch.FloatTensor(next_state)

            curiosity_reward = curiosity_rewarder.get_reward(
                state.unsqueeze(0), action, next_state.unsqueeze(0))
            state = next_state

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            curiosity_rewards.append(curiosity_reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model((state.unsqueeze(0), last_act, sum_reward))
            R = value.detach()
        values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            # Discounted return including the intrinsic (curiosity) bonus.
            R = (args.gamma * R + rewards[i]
                 + args.curiosity_weight * curiosity_rewards[i].detach())
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = (policy_loss - log_probs[i] * gae.detach()
                           - args.entropy_weight * entropies[i])

        # Train the curiosity model on its own prediction errors.
        curiosity_optimizer.zero_grad()
        curiosity_loss = sum(map(lambda x: x ** 2,
                                 curiosity_rewards)) / len(curiosity_rewards)
        curiosity_loss.backward()
        curiosity_optimizer.step()

        optimizer.zero_grad()
        (policy_loss + args.value_weight * value_loss
         + model.get_loss()).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        with lock:
            ensure_shared_grads(model, shared_model)
        optimizer.step()
        model.reset()
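# `CuriosityRewarder` is assumed above but not shown in this excerpt.  A
# minimal ICM-style sketch (an assumption; the real module may use a
# convolutional encoder and an inverse model as well): the intrinsic
# reward is the error of a learned forward model that predicts the next
# observation embedding from the current embedding and the action.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class CuriosityRewarder(nn.Module):
    def __init__(self, obs_shape, num_actions, feature_size=256):
        super(CuriosityRewarder, self).__init__()
        self.num_actions = num_actions
        in_size = int(np.prod(obs_shape))
        self.encoder = nn.Sequential(
            nn.Linear(in_size, feature_size), nn.ReLU(),
            nn.Linear(feature_size, feature_size))
        self.forward_model = nn.Sequential(
            nn.Linear(feature_size + num_actions, feature_size), nn.ReLU(),
            nn.Linear(feature_size, feature_size))

    def get_reward(self, state, action, next_state):
        # Embed both observations and predict the next embedding from the
        # current embedding plus a one-hot encoding of the action.
        phi = self.encoder(state.view(state.size(0), -1))
        phi_next = self.encoder(next_state.view(next_state.size(0), -1))
        one_hot = F.one_hot(action.view(-1), self.num_actions).float()
        pred_next = self.forward_model(torch.cat([phi, one_hot], dim=1))
        # Differentiable prediction error: detached when added to the
        # return, and squared again to train the forward model.
        return 0.5 * (pred_next - phi_next.detach()).pow(2).mean()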