def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes)
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    process = mp.Process(target=eval, args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    episode_plot = []
    R_plot = []
    ep_reward_plot = []
    start_datetime = datetime.datetime.now().strftime("%m-%d_%H-%M")
    while True:
        if curr_episode % opt.save_interval == 0 and curr_episode > 0:
            # torch.save(model.state_dict(),
            #            "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
            torch.save(model.state_dict(),
                       "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world,
                                                                 opt.stage, curr_episode))
        curr_episode += 1
        episode_plot.append(int(curr_episode))
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(opt.num_local_steps):
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            # Send the sampled action to each environment process
            if torch.cuda.is_available():
                [agent_conn.send(("step", act))
                 for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act))
                 for agent_conn, act in zip(envs.agent_conns, action)]
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            # Generalized advantage estimation, accumulated backwards through the rollout
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        print("mean big R:", torch.mean(R).item())
        episode_reward_mean = torch.stack(rewards).mean(dim=1, keepdim=True).sum().item()
        print("mean reward", episode_reward_mean)
        R_plot.append(torch.mean(R).item())
        ep_reward_plot.append(episode_reward_mean)
        plt.plot(episode_plot, R_plot, "r-")
        plt.xlabel('Episode')
        plt.ylabel('Mean R (PPO)')
        plt.savefig("ppo_R_episode_{}.pdf".format(start_datetime))
        plt.close()
        plt.plot(episode_plot, ep_reward_plot, "r-")
        plt.xlabel('Episode')
        plt.ylabel('Mean Reward (PPO)')
        plt.savefig("ppo_reward_episode_{}.pdf".format(start_datetime))
        plt.close()
        np.savetxt("ppo_R_episode_{}.csv".format(start_datetime), np.array(R_plot), delimiter=",")
        np.savetxt("ppo_reward_episode_{}.csv".format(start_datetime), np.array(ep_reward_plot),
                   delimiter=",")
        for i in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[
                    int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)):
                    int((j + 1) * (opt.num_local_steps * opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                # Probability ratio between the new and old policy
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                # Clipped surrogate objective
                actor_loss = -torch.mean(torch.min(
                    ratio * advantages[batch_indices],
                    torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 + opt.epsilon) * advantages[batch_indices]))
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.zone, opt.act, opt.num_processes)
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    process = mp.Process(target=test, args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(opt.num_local_steps):
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            if torch.cuda.is_available():
                [agent_conn.send(("step", act))
                 for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act))
                 for agent_conn, act in zip(envs.agent_conns, action)]
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        for i in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[
                    int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)):
                    int((j + 1) * (opt.num_local_steps * opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(
                    ratio * advantages[batch_indices],
                    torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 + opt.epsilon) * advantages[batch_indices]))
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
def train(args):
    # Fix the random seed so initialisation is reproducible
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the directory for saved models
    if not os.path.isdir(args.saved_path):
        os.makedirs(args.saved_path)
    # Create the multi-process game environments
    envs = MultipleEnvironments(args.game, args.num_processes)
    # Create the model
    model = PPO(envs.num_states, envs.num_actions)
    # Load a pretrained model if one is provided
    if args.trained_model is not None:
        model.load_state_dict(torch.load(args.trained_model))
    # Train on the GPU when available
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    # Run game evaluation in a separate process
    mp = _mp.get_context("spawn")
    process = mp.Process(target=eval, args=(args, model, envs.num_states, envs.num_actions))
    process.start()
    # Create the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Reset the game in every process at the start
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    # Fetch the initial game frames
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        # Play the game and collect rollout data
        for _ in range(args.num_local_steps):
            states.append(curr_states)
            # Run the model forward pass
            logits, value = model(curr_states)
            # Compute the probability of each action
            policy = F.softmax(logits, dim=1)
            # Sample an action according to those probabilities
            old_m = Categorical(policy)
            action = old_m.sample()
            # Record the rollout data
            actions.append(action)
            values.append(value.squeeze())
            # Kept for the loss computation later
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            # Send the action to each environment process
            if torch.cuda.is_available():
                [agent_conn.send(("step", act))
                 for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act))
                 for agent_conn, act in zip(envs.agent_conns, action)]
            # Gather the results from all processes
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            # Convert to PyTorch tensors
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            # Record the rollout data
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        # Bootstrap the value of the final frame collected above
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * args.gamma * args.tau
            gae = gae + reward + args.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        total_losses = []
        for i in range(args.num_epochs):
            indice = torch.randperm(args.num_local_steps * args.num_processes)
            for j in range(args.batch_size):
                batch_indices = indice[
                    int(j * (args.num_local_steps * args.num_processes / args.batch_size)):
                    int((j + 1) * (args.num_local_steps * args.num_processes / args.batch_size))]
                # Re-run the model on the sampled frames
                logits, value = model(states[batch_indices])
                # Compute the probability of each action
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                # Compute the losses
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(
                    ratio * advantages[batch_indices],
                    torch.clamp(ratio, 1.0 - args.epsilon, 1.0 + args.epsilon) * advantages[batch_indices]))
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - args.beta * entropy_loss
                # Backpropagate and update
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                total_losses.append(float(total_loss))
        print("Episode: {}. Total loss: {:.4f}".format(curr_episode, np.mean(total_losses)))
        torch.save(model.state_dict(), "{}/model_{}.pth".format(args.saved_path, args.game))
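# --- For reference (not part of the original scripts): the return/advantage computation that
# each train() variant performs inline is equivalent to the standalone helper below. This is a
# sketch assuming the same rollout buffers as the code above (lists of per-step tensors);
# gamma is the discount factor and tau the GAE lambda.
def compute_gae_returns(values, rewards, dones, next_value, gamma, tau):
    """Return the list R of GAE-based return targets, oldest step first."""
    gae = 0
    R = []
    for value, reward, done in list(zip(values, rewards, dones))[::-1]:
        # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t), accumulated backwards
        # with decay gamma * tau
        gae = gae * gamma * tau
        gae = gae + reward + gamma * next_value.detach() * (1 - done) - value.detach()
        next_value = value
        R.append(gae + value)
    return R[::-1]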
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    opt.saved_path = os.getcwd() + '/baselines/PPO/' + opt.saved_path
    # if os.path.isdir(opt.log_path):
    #     shutil.rmtree(opt.log_path)
    # os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    savefile = opt.saved_path + '/PPO_train.csv'
    print(savefile)
    title = ['Loops', 'Steps', 'Time', 'AvgLoss', 'MeanReward', "StdReward", "TotalReward", "Flags"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)
    # Create environments
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes,
                                opt.cortex_left, opt.cortex_right, opt.retina_resolution,
                                opt.retina, opt.save_video)
    # Create model and optimizer
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    # Start test/evaluation model
    if TEST_ON_THE_GO:
        # evaluate(opt, model, envs.num_states, envs.num_actions)
        mp = _mp.get_context("spawn")
        process = mp.Process(target=evaluate, args=(opt, model, envs.num_states, envs.num_actions))
        process.start()
    # Reset envs
    # [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = []
    [curr_states.append(env.reset()) for env in envs.envs]
    # curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    tot_loops = 0
    tot_steps = 0
    # Start main loop
    while True:
        # Save model each loop
        if opt.save_with_interval:
            if tot_loops % opt.save_interval == 0 and tot_loops > 0:
                # torch.save(model.state_dict(),
                #            "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
                torch.save(model.state_dict(),
                           "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world,
                                                                     opt.stage, tot_loops))
        start_time = time.time()
        # Accumulate evidence
        tot_loops += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        flags = []
        for _ in range(opt.num_local_steps):
            # From given states, predict an action
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            # Evaluate predicted action
            result = []
            # ac = action.cpu().item()
            if torch.cuda.is_available():
                # [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
                [result.append(env.step(act.item())) for env, act in zip(envs.envs, action.cpu())]
            else:
                # [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
                [result.append(env.step(act.item())) for env, act in zip(envs.envs, action)]
            state, reward, done, info = zip(*result)
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            flags.append(check_flag(info) / opt.num_processes)
            curr_states = state
        # Training stage
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        avg_loss = []
        for _ in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[
                    int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)):
                    int((j + 1) * (opt.num_local_steps * opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(
                    ratio * advantages[batch_indices],
                    torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 + opt.epsilon) * advantages[batch_indices]))
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                avg_loss.append(total_loss.cpu().detach().numpy().tolist())
        avg_loss = np.mean(avg_loss)
        all_rewards = torch.cat(rewards).cpu().numpy()
        tot_steps += opt.num_local_steps * opt.num_processes
        sum_reward = np.sum(all_rewards)
        mu_reward = np.mean(all_rewards)
        std_reward = np.std(all_rewards)
        any_flags = np.sum(flags)
        ep_time = time.time() - start_time
        # data = [tot_loops, tot_steps, ep_time, avg_loss, mu_reward, std_reward, sum_reward, any_flags]
        data = [tot_loops, tot_steps, "{:.6f}".format(ep_time), "{:.4f}".format(avg_loss),
                "{:.4f}".format(mu_reward), "{:.4f}".format(std_reward),
                "{:.2f}".format(sum_reward), any_flags]
        with open(savefile, 'a', newline='') as sfile:
            writer = csv.writer(sfile)
            writer.writerows([data])
        print("Steps: {}. Total loss: {}".format(tot_steps, total_loss))
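# --- The variant above calls check_flag(info), which is not defined in this section. Below is
# a hypothetical sketch of such a helper: it counts how many of the parallel environments report
# reaching the end-of-level flag. "flag_get" is the field exposed by gym-super-mario-bros info
# dicts; if the actual helper uses a different criterion, this sketch does not apply.
def check_flag(info):
    # info is a tuple of per-environment info dicts returned by env.step()
    return sum(1 for env_info in info if env_info.get("flag_get", False))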
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes)
    model_mast = PPO(envs.num_states, envs.num_actions)
    model_1 = PPO(envs.num_states, envs.num_actions)
    model_2 = PPO(envs.num_states, envs.num_actions)
    model_1.eval()
    if torch.cuda.is_available():
        try:
            model_1.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(opt.saved_path, opt.world, opt.stage)))
            model_1.cuda()
            print('model-1 is loaded cuda version')
        except:
            print('failed to load model-1')
        try:
            # note: checkpoint prefix 'secndpt' here differs from the 'scendpt' prefix used
            # in the non-CUDA branch and in the save path below
            model_2.load_state_dict(
                torch.load("{}/ppo_secndpt_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage,
                                                            opt.saved_episode)))
            model_2.cuda()
            print('model-2 is loaded cuda version')
        except:
            print('failed to load model-2')
    else:
        try:
            model_1.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(opt.saved_path, opt.world, opt.stage),
                           map_location=lambda storage, loc: storage))
            print('model-1 is loaded non cuda version')
        except:
            print('Failed to load model-1')
        try:
            model_2.load_state_dict(
                torch.load("{}/ppo_scendpt_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage,
                                                            opt.saved_episode),
                           map_location=lambda storage, loc: storage))
            print('model-2 is loaded non cuda version')
        except:
            print('Failed to load non cuda model-2')
    model_mast.load_state_dict(model_2.state_dict())
    if torch.cuda.is_available():
        model_mast.cuda()
    model_mast.share_memory()
    process = mp.Process(target=eval, args=(opt, model_mast, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model_mast.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = opt.saved_episode
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        print('############## restarting the training loop ###################')
        while True:
            # Let the frozen assistance model (model_1) play greedily until Mario passes
            # x_pos > 1000, then collect on-policy samples with model_mast
            while True:
                logits, value = model_1(curr_states)
                policy = F.softmax(logits, dim=1)
                action = torch.argmax(policy).item()
                action = torch.tensor(action)
                action = action.view(-1)
                if torch.cuda.is_available():
                    [agent_conn.send(("step", act))
                     for agent_conn, act in zip(envs.agent_conns, action.cpu())]
                else:
                    [agent_conn.send(("step", act))
                     for agent_conn, act in zip(envs.agent_conns, action)]
                state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
                # print('position is', info[0]['x_pos'])
                if info[0]['x_pos'] > 1000:
                    # print('starting sample collection')
                    break
                else:
                    state = torch.from_numpy(np.concatenate(state, 0))
                    curr_states = state
            state = torch.from_numpy(np.concatenate(state, 0))
            curr_states = state
            for _ in range(opt.num_local_steps):
                states.append(curr_states)
                logits, value = model_mast(curr_states)
                values.append(value.squeeze())
                policy = F.softmax(logits, dim=1)
                old_m = Categorical(policy)
                action = old_m.sample()
                actions.append(action)
                old_log_policy = old_m.log_prob(action)
                old_log_policies.append(old_log_policy)
                if torch.cuda.is_available():
                    [agent_conn.send(("step", act))
                     for agent_conn, act in zip(envs.agent_conns, action.cpu())]
                else:
                    [agent_conn.send(("step", act))
                     for agent_conn, act in zip(envs.agent_conns, action)]
                state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
                state = torch.from_numpy(np.concatenate(state, 0))
                if torch.cuda.is_available():
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                curr_states = state
                # note: this truth test assumes a single environment process, so `done` holds one element
                if done:
                    # print('samples collected ', len(states))
                    break
            if len(states) >= opt.num_local_steps:
                # print('entering training loop. states list size is ', len(states))
                _, next_value = model_mast(curr_states)
                next_value = next_value.squeeze()
                old_log_policies = torch.cat(old_log_policies).detach()
                actions = torch.cat(actions)
                values = torch.Tensor(values).detach()
                # values = torch.cat(values).detach()
                states = torch.cat(states)
                gae = 0
                R = []
                for value, reward, done in list(zip(values, rewards, dones))[::-1]:
                    gae = gae * opt.gamma * opt.tau
                    gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
                    next_value = value
                    R.append(gae + value)
                R = R[::-1]
                R = torch.cat(R).detach()
                advantages = R - values
                for i in range(opt.num_epochs):
                    indice = torch.randperm(opt.num_local_steps * opt.num_processes)
                    for j in range(opt.batch_size):
                        batch_indices = indice[
                            int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)):
                            int((j + 1) * (opt.num_local_steps * opt.num_processes / opt.batch_size))]
                        logits, value = model_mast(states[batch_indices])
                        new_policy = F.softmax(logits, dim=1)
                        new_m = Categorical(new_policy)
                        new_log_policy = new_m.log_prob(actions[batch_indices])
                        ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                        actor_loss = -torch.mean(torch.min(
                            ratio * advantages[batch_indices],
                            torch.clamp(ratio, 1.0 - opt.epsilon,
                                        1.0 + opt.epsilon) * advantages[batch_indices]))
                        # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                        critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                        entropy_loss = torch.mean(new_m.entropy())
                        total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                        optimizer.zero_grad()
                        total_loss.backward()
                        torch.nn.utils.clip_grad_norm_(model_mast.parameters(), 0.5)
                        optimizer.step()
                print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
                try:
                    if os.path.exists('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage, (curr_episode - 1))):
                        # print('removing past saved data of episode', curr_episode)
                        os.remove('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage, (curr_episode - 1)))
                except:
                    print('failed to remove past saved model')
                torch.save(model_mast.state_dict(),
                           "{}/ppo_scendpt_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage,
                                                            curr_episode))
                break
            else:
                print('resetting training')
                opt.saved_episode = curr_episode