import gym
import torch


def train(rank, args, shared_model, optimizer, counter, lock):
    env = gym.make(args.env_name)
    env.seed(args.seed + rank)
    torch.manual_seed(args.seed + rank)

    model = Policy(2, action_map)
    model.train()

    state = env.reset()
    # state = tensor_state(state)
    done = True
    episode_length = 0

    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        # Reset the LSTM state at episode boundaries; otherwise detach it so
        # gradients do not flow back across rollout boundaries.
        if done:
            cx = torch.zeros(1, 64)
            hx = torch.zeros(1, 64)
        else:
            cx = cx.data
            hx = hx.data

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Roll out up to num_steps environment steps with the current policy.
        for step in range(args.num_steps):
            episode_length += 1
            action, hx, cx = model(state, hx, cx)
            entropies.append(model.entropy)

            state, reward, done, _ = env.step(action)
            reward = max(min(reward, 1), -1)  # clip the reward to [-1, 1]

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            values.append(model.v)
            log_probs.append(model.log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the critic if the rollout was cut off mid-episode.
        R = torch.zeros(1, 1)
        if not done:
            model(state, hx, cx)
            R = model.v.data
        values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * gae - args.entropy_coef * entropies[i]

        loss = policy_loss + args.value_loss_coef * value_loss

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
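# `ensure_shared_grads` is called above but not defined in this snippet. A minimal
# sketch, assuming the usual A3C pattern of handing the worker's gradients to the
# shared model so the shared optimizer can apply them:
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # Gradients are already wired up to this worker's tensors; nothing to do.
            return
        shared_param._grad = param.grad


# A hypothetical launcher for the worker above (not part of the original). The
# gamma, entropy, value-loss, and grad-norm values mirror the constants hard-coded
# in the simplified loop below; the remaining settings are placeholder guesses.
# `Policy` and `action_map` are assumed to be defined elsewhere, and a plain Adam
# stands in for an optimizer whose state is kept in shared memory.
if __name__ == '__main__':
    import argparse
    import torch.multiprocessing as mp
    import torch.optim as optim

    args = argparse.Namespace(
        env_name='Pong-v0', seed=1, num_processes=4, num_steps=20,
        gamma=0.99, tau=1.00, entropy_coef=0.01,
        value_loss_coef=0.5, max_grad_norm=50)

    shared_model = Policy(2, action_map)
    shared_model.share_memory()  # parameters live in shared memory
    optimizer = optim.Adam(shared_model.parameters())

    counter = mp.Value('i', 0)   # total environment steps across all workers
    lock = mp.Lock()

    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, optimizer, counter, lock))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()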
import gym
import torch
import torch.optim as optim


def train():
    env = gym.make('Pong-v0')
    # env.seed(args.seed + rank)
    # torch.manual_seed(args.seed + rank)

    model = Policy(2, action_map)
    model.train()
    # model.eval()
    optimizer = optim.Adam(model.parameters())

    state = env.reset()
    # state = tensor_state(state)
    done = True
    episode_length = 0

    while True:
        # Reset the model's internal LSTM state at episode boundaries.
        if done:
            model.reset_hidden()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Roll out one full episode with the current policy.
        while True:
            episode_length += 1
            action = model(state)
            entropies.append(model.entropy)

            state, reward, done, _ = env.step(action)
            reward = max(min(reward, 1), -1)  # clip the reward to [-1, 1]

            if done:
                episode_length = 0
                state = env.reset()

            values.append(model.v)
            log_probs.append(model.log_prob)
            rewards.append(reward)

            if done:
                break

        # The rollout always ends at a terminal state, so the bootstrap value is zero.
        R = torch.zeros(1, 1)
        values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = 0.99 * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + 0.99 * values[i + 1].data - values[i].data
            gae = gae * 0.99 + delta_t

            policy_loss = policy_loss - log_probs[i] * gae - 0.01 * entropies[i]

        loss = policy_loss + 0.5 * value_loss

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
        # ensure_shared_grads(model, shared_model)
        optimizer.step()
        print('loss:', loss)
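# `Policy` and `action_map` are defined elsewhere; the single-process loop above
# only relies on the interface sketched below. This is a hypothetical sketch, not
# the original network: calling the model on a raw frame samples an action, stashes
# `v`, `log_prob`, and `entropy` on the module, and keeps the LSTM state internally
# so `reset_hidden()` can clear it between episodes. The layer sizes, preprocessing,
# and action mapping are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

action_map = {0: 2, 1: 3}  # assumed mapping: policy output -> Pong UP/DOWN actions


class Policy(nn.Module):
    def __init__(self, num_actions, action_map):
        super().__init__()
        self.action_map = action_map
        self.fc = nn.Linear(80 * 80, 256)   # assumes an 80x80 preprocessed frame
        self.lstm = nn.LSTMCell(256, 64)    # hidden size matching the (1, 64) state above
        self.actor = nn.Linear(64, num_actions)
        self.critic = nn.Linear(64, 1)
        self.reset_hidden()

    def reset_hidden(self):
        self.hx = torch.zeros(1, 64)
        self.cx = torch.zeros(1, 64)

    def forward(self, state):
        # Crude preprocessing: grayscale, crop the board, downsample to 80x80.
        x = torch.from_numpy(state).float().mean(dim=-1)
        x = x[34:194:2, ::2] / 255.0
        x = F.relu(self.fc(x.reshape(1, -1)))
        self.hx, self.cx = self.lstm(x, (self.hx, self.cx))

        logits = self.actor(self.hx)
        probs = F.softmax(logits, dim=-1)
        log_probs = F.log_softmax(logits, dim=-1)
        action = torch.distributions.Categorical(probs).sample()

        # Stash the quantities the training loop reads back after each step.
        self.v = self.critic(self.hx)
        self.log_prob = log_probs.gather(1, action.unsqueeze(1))
        self.entropy = -(log_probs * probs).sum(1, keepdim=True)
        return self.action_map[action.item()]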