def main():
    # Create the game environment
    env = create_train_env(world=1, stage=1, actions=COMPLEX_MOVEMENT)
    print(env.observation_space.shape)
    print(env.action_space.n)
    obs = env.reset()
    while True:
        # Random action sampled by the environment (an int)
        action = env.action_space.sample()
        # Step the game
        obs, reward, terminal, info = env.step(action)
        # Show the stacked frames side by side
        obs = np.squeeze(obs)
        obses = obs[0]
        for i in range(1, obs.shape[0]):
            obses = np.hstack([obses, obs[i]])
        cv2.imshow('obses', obses)
        cv2.waitKey(1)
        env.render()
        print("=" * 50)
        print("action:", action)
        print("obs shape:", obs.shape)
        print("reward:", reward)
        print("terminal:", terminal)
        print("info:", info)
        if terminal:
            obs = env.reset()
def main():
    # Create the game environment
    env = create_train_env(game="SuperMarioBros-Nes")
    print(env.observation_space.shape)
    print(env.action_space.n)
    obs = env.reset()
    while True:
        # Random action sampled by the environment (an int)
        action = env.action_space.sample()
        # Step the game
        obs, reward, terminal, info = env.step(action)
        # Show the stacked consecutive frames side by side
        obs = np.squeeze(obs)
        obses = obs[0]
        for i in range(1, obs.shape[0]):
            obses = np.hstack([obses, obs[i]])
        cv2.imshow('obses', obses)
        cv2.waitKey(1)
        env.render()
        print("=" * 50)
        print("action:", action)
        print("obs shape:", obs.shape)
        print("reward:", reward)
        print("terminal:", terminal)
        print("info:", info)
        if terminal:
            obs = env.reset()
def infer(args):
    # Fix the random seed
    paddle.seed(123)
    # Use the GPU for inference if available
    if paddle.is_compiled_with_cuda():
        paddle.set_device("gpu:0")
    # Choose the action set
    if args.action_type == "right":
        actions = RIGHT_ONLY
    elif args.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    # Create the game environment
    env = create_train_env(args.world, args.stage, actions)
    # Build the model
    model = Model(env.observation_space.shape[0], len(actions))
    # Load the model parameters, preferring the "finished" checkpoint
    model_path = "{}/model_{}_{}_finish.pdparams".format(args.saved_path, args.world, args.stage)
    if not os.path.exists(model_path):
        model_path = "{}/model_{}_{}.pdparams".format(args.saved_path, args.world, args.stage)
    model.load_dict(paddle.load(model_path))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = paddle.to_tensor(env.reset(), dtype="float32")
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Predict action logits and the state value
        logits, value = model(state)
        # Pick the action with the highest probability
        policy = F.softmax(logits, axis=1)
        action = paddle.argmax(policy)[0]
        # Step the game
        state, reward, done, info = env.step(int(action))
        total_reward += reward
        # Convert the new game state to a tensor
        state = paddle.to_tensor(state, dtype="float32")
        print(info)
        # Level cleared
        if info["flag_get"]:
            print("World {} stage {} cleared".format(args.world, args.stage))
            break
        if done:
            print("Game over, score: %f, level not cleared!" % total_reward)
            break
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    env = create_train_env(1, args.difficulty, args.macro, 'env1.mp4')
    input_size = env.observation_space.shape[0]
    output_size = env.action_space.n

    model = RNNActorCriticNetwork(input_size, output_size, args.noise_linear).to(device)
    model.eval()

    dummy_input = torch.rand(1, 1, *env.observation_space.shape).to(device=device)
    writer = SummaryWriter(log_dir=args.log_dir)
    writer.add_graph(model, (dummy_input, ))
def local_test(index, opt, global_model, model_type=None):
    torch.manual_seed(42 + index)
    env, num_states, num_actions = create_train_env(opt.layout, index + 1, index=index)
    if model_type:
        AC_NN_MODEL = getattr(model, model_type)
    else:
        AC_NN_MODEL = SimpleActorCriticLineal
    local_model = AC_NN_MODEL(num_states, num_actions)
    # The model we are going to test (turn off dropout, no backward pass)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            # Copy global model weights to the local model
            local_model.load_state_dict(global_model.state_dict(), strict=False)
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
                c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        value = value.clamp(-1., 1.)
        policy = F.softmax(logits, dim=0)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        state = torch.from_numpy(state)
        actions.append(action)
        # End the episode if the step budget is exceeded or the agent stalls on one action
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = torch.from_numpy(env.reset())
def train(opt: argparse.ArgumentParser):
    torch.manual_seed(42)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    multi_processes = mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    global_model = ActorCritics(num_states, num_actions)
    if opt.use_gpu and torch.cuda.is_available():
        global_model.cuda()
    global_model.share_memory()
    if opt.load_from_stage:
        # Warm-start from the checkpoint of the previous stage
        if opt.stage == 1:
            previous_world = opt.world - 1
            previous_stage = 4
        else:
            previous_world = opt.world
            previous_stage = opt.stage - 1
        file_ = f"{opt.saved_path}/a3c_super_mario_bros_{previous_world}_{previous_stage}"
        if os.path.isfile(file_):
            global_model.load_state_dict(torch.load(file_))

    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    processes = []
    for pid in range(opt.num_processes):
        if pid == 0:
            process = multi_processes.Process(target=local_train,
                                              args=(pid, opt, global_model, optimizer, True))
        else:
            process = multi_processes.Process(target=local_train,
                                              args=(pid, opt, global_model, optimizer))
        process.start()
        processes.append(process)
    process = multi_processes.Process(target=local_train,
                                      args=(opt.num_processes, opt, global_model, optimizer))
    process.start()
    processes.append(process)
    for process in processes:
        process.join()
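# A minimal sketch of the shared optimizer constructed by train() above. This is an
# assumption: it follows the common A3C "shared Adam" pattern (eagerly creating the
# per-parameter statistics and placing them in shared memory so every worker process
# updates the same buffers), not necessarily the exact GlobalAdam used in this project.
# It also assumes a recent PyTorch, where Adam stores `step` as a tensor.
import torch


class GlobalAdam(torch.optim.Adam):
    def __init__(self, params, lr):
        super().__init__(params, lr=lr)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # Shared across the processes spawned in train(); the step
                # counters stay per-process, which is enough for this sketch.
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()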
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    env = create_train_env(1, args.difficulty, args.macro, 'env1.mp4')
    input_size = env.observation_space.shape[0]
    output_size = env.action_space.n

    model_path = os.path.join(args.save_dir, 'policy.cpt')
    model = RNNActorCriticNetwork(input_size, output_size, args.noise_linear).to(device)
    if args.cuda:
        model.load_state_dict(torch.load(model_path))
    else:
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    print('Testing...')
    # rollout loop
    obs = env.reset()
    hidden = None
    sample_rall = 0
    sample_step = 0
    sample_max_stage = 0
    done = False
    while not done:
        action, _, action_probs, hidden = get_action(model, device, obs[None, None, :], hidden)
        obs, rew, done, info = env.step(int(action))
        sample_rall += rew
        sample_max_stage = max(sample_max_stage, info['stage'])
        sample_step += 1
    print('Max Stage: %d | Reward: %f | Total Steps: %d'
          % (sample_max_stage, sample_rall, sample_step))
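# Hypothetical sketch of the get_action helper used in the rollout loop above. The
# original implementation is not shown in this snippet; this version assumes the
# recurrent actor-critic's forward pass is model(state, hidden) -> (policy_logits,
# value, hidden) and samples an action from the softmax policy.
import torch
import torch.nn.functional as F


def get_action(model, device, obs, hidden):
    state = torch.as_tensor(obs, dtype=torch.float32, device=device)
    with torch.no_grad():
        logits, value, hidden = model(state, hidden)
        # Softmax over the last dimension, then sample one action for the single env
        probs = F.softmax(logits, dim=-1)
        action = probs.reshape(-1, probs.shape[-1]).multinomial(num_samples=1).item()
    return action, value, probs.cpu().numpy(), hidden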
def infer(args):
    # Fix the random seed
    paddle.seed(123)
    # Use the GPU for inference if available
    if paddle.is_compiled_with_cuda():
        paddle.set_device("gpu:0")
    # Create the game environment
    env = create_train_env(args.game)
    # Build the model
    model = Model(env.observation_space.shape[0], env.action_space.n)
    # Load the model parameters
    model.load_dict(paddle.load("{}/model_best_{}.pdparams".format(args.saved_path, args.game)))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = paddle.to_tensor(env.reset(), dtype="float32")
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Predict action logits and the state value
        logits, value = model(state)
        # Pick the action with the highest probability
        policy = F.softmax(logits, axis=1)
        action = paddle.argmax(policy)[0]
        # Step the game
        state, reward, done, info = env.step(int(action))
        total_reward += reward
        print(info)
        # Convert the new game state to a tensor
        state = paddle.to_tensor(state, dtype="float32")
        if done:
            print("Game over, score: %f" % total_reward)
            break
def eval(args, num_states, num_actions):
    log_writer = LogWriter(logdir='log')
    # Fix the random seed
    paddle.seed(123)
    # Use the GPU for inference if available
    if paddle.is_compiled_with_cuda():
        paddle.set_device("gpu:0")
    # Create the game environment
    env = create_train_env(args.game)
    # Build the local model
    local_model = Model(num_states, num_actions)
    # Switch to evaluation mode
    local_model.eval()
    # Convert the initial frame to a Paddle tensor
    state = paddle.to_tensor(env.reset(), dtype="float32")
    # Force a parameter reload on the very first iteration
    done = True
    # Logging step counter
    step = 0
    # MD5 of the previously loaded model file
    old_model_file_md5 = ''
    # Total score of the current episode
    total_reward = 0
    max_reward = 0
    while True:
        # Reload the model parameters after every episode
        if done:
            try:
                model_path = "{}/model_{}.pdparams".format(args.saved_path, args.game)
                # Use the file's MD5 so each checkpoint is evaluated only once
                with open(model_path, 'rb') as f:
                    file = f.read()
                file_md5 = hashlib.md5(file).hexdigest()
                if file_md5 == old_model_file_md5:
                    continue
                else:
                    model_dict = paddle.load(model_path)
                    old_model_file_md5 = file_md5
            except:
                continue
            total_reward = 0
            local_model.load_dict(model_dict)
        # Predict action logits and the state value
        logits, value = local_model(state)
        # Pick the action with the highest probability
        policy = F.softmax(logits, axis=1)
        action = paddle.argmax(policy)[0]
        # Step the game
        state, reward, done, info = env.step(int(action))
        total_reward += reward
        # Render the game window
        if args.show_play:
            env.render()
        # Reset the game state at the end of an episode
        if done:
            step += 1
            state = env.reset()
            print('Total score: %f' % total_reward)
            log_writer.add_scalar(tag='Eval reward', value=total_reward, step=step)
            # Keep the best model so far
            if max_reward < total_reward:
                paddle.save(local_model.state_dict(),
                            "{}/model_best_{}.pdparams".format(args.saved_path, args.game))
                max_reward = total_reward
        # Convert the new game state to a tensor
        state = paddle.to_tensor(state, dtype="float32")
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='LunarLander-v2')  # 'HalfCheetah-v2'
    parser.add_argument('--hid', type=int, default=64)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.999)
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--cpu', type=int, default=1)
    parser.add_argument('--steps', type=int, default=4000)
    parser.add_argument('--epochs', type=int, default=350)
    parser.add_argument('--pretrain', type=str,
                        default='/root/lele/spinningup/spinningup/data/ppo_0715/ppo_0715_s0/pyt_save/model.pt')
    parser.add_argument('--exp_name', type=str, default='ppo_lstm_1106')
    args = parser.parse_args()

    import os
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    mpi_fork(args.cpu)  # run parallel code with MPI

    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    # from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    env_fn = lambda: create_train_env(1, 1, 'complex')
    # env_fn = SubprocVecEnv([])
    # env_fn = lambda: JoypadSpace(gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(1, 1)),
    #                              gym_super_mario_bros.actions.COMPLEX_MOVEMENT)

    ppo(env_fn, actor=userActor, critic=userCritic,  # core.MLPActorCritic, gym.make(args.env)
        ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), gamma=args.gamma,
        seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,
        logger_kwargs=logger_kwargs, clip_ratio=0.2,
        pi_lr=0.001, vf_lr=0.001, pretrain=None)  # pretrain=args.pretrain
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len))
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            hidden = (torch.zeros((1, 512), dtype=torch.float).to(device),
                      torch.zeros((1, 512), dtype=torch.float).to(device))
            n += 1

    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--fpath', '-f', type=str, default='./pretrain')
    parser.add_argument('--len', '-l', type=int, default=0)
    parser.add_argument('--episodes', '-n', type=int, default=100)
    parser.add_argument('--norender', '-nr', action='store_true')
    parser.add_argument('--itr', '-i', type=int, default=-1)
    parser.add_argument('--deterministic', '-d', action='store_true')
    args = parser.parse_args()

    # env, get_action = load_policy_and_env(args.fpath,
    #                                       args.itr if args.itr >= 0 else 'last',
    #                                       args.deterministic)
    env = create_train_env(1, 1, 'complex')
    get_action = load_pytorch_policy(args.fpath)  # itr='_50'
    run_policy(env, get_action, args.len, args.episodes, not args.norender)
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.train()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    curr_episode = 0
    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(global_model.state_dict(),
                           f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}")
            print(f"Now Process {index}. Episode {curr_episode}")
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            m = Categorical(policy)
            action = m.sample().item()
            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)
            if done:
                break

        R = torch.zeros((1, 1), dtype=torch.float)
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)
        gae = torch.zeros((1, 1), dtype=torch.float)
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
        for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy

        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        writer.add_scalar(f"Train_{index}/Loss", total_loss, curr_episode)
        optimizer.zero_grad()
        total_loss.backward()
        for local_param, global_param in zip(local_model.parameters(), global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad
        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print(f"Training process {index} terminated")
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(42 + index)
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    state = torch.from_numpy(env.reset())
    local_model = ActorCritics(num_states, num_actions)
    if opt.use_gpu and torch.cuda.is_available():
        local_model = local_model.cuda()
        state = state.cuda()
    local_model.train()
    done = True
    cur_step = 0
    cur_episode = 0
    while True:
        if save:
            if cur_episode % opt.save_interval == 0 and cur_episode > 0:
                torch.save(global_model.state_dict(),
                           f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}")
            print(f"Process {index}. Episode {cur_episode}")
        cur_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu and torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        # predict the action and interact with the environment
        for _ in range(opt.num_local_steps):
            cur_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            # sample the next action from the policy
            m = Categorical(policy)
            action = m.sample().item()
            # react
            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)
            if opt.use_gpu and torch.cuda.is_available():
                state = state.cuda()
            # end of the episode
            if cur_step > opt.num_global_steps:
                done = True
            if done:
                cur_step = 0
                state = torch.from_numpy(env.reset())
                if opt.use_gpu and torch.cuda.is_available():
                    state = state.cuda()
            # aggregate the info
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)
            if done:
                break

        # bootstrap R and compute the losses with GAE
        R = torch.zeros((1, 1), dtype=torch.float)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu and torch.cuda.is_available():
            R = R.cuda()
            gae = gae.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)
        actor_loss, critic_loss, entropy_loss = 0, 0, 0
        next_value = R
        for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy

        # backward
        total_loss = critic_loss - actor_loss - opt.beta * entropy_loss
        writer.add_scalar(f"Train_{index}/Loss", total_loss, cur_episode)
        optimizer.zero_grad()
        total_loss.backward()
        for local_param, global_param in zip(local_model.parameters(), global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad
        optimizer.step()

        if cur_episode == int(opt.num_global_steps / opt.num_local_steps):
            print(f"Training process {index} terminated")
            if save:
                end_time = timeit.default_timer()
                print(f"The code runs for {end_time - start_time} s")
            return
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(42 + index)
    if save:
        start_time = timeit.default_timer()
    if index == 0:
        # Path for the TensorBoard log
        process_log_path = "{}/process-{}".format(opt.log_path, index)
        writer = SummaryWriter(process_log_path)  # , max_queue=1000, flush_secs=10
    # Creates the training environment for this particular process
    env, num_states, num_actions = create_train_env(opt.layout, opt.num_processes_to_render, index=index)
    # local_model keeps local weights for each async process
    local_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
    # Tell the model we are going to use it for training
    local_model.train()
    # env.reset and get first state as a tensor
    state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0
    if index == 0:
        interval = 100
        # reward_hist = np.zeros(interval)
        reward_hist = deque(maxlen=100)
        # queue_rewards = queue.Queue(maxsize=interval)
        record_tag = False
    while True:
        if save:
            # Save trained model at save_interval
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(global_model.state_dict(),
                           "{}/gfootball_{}".format(opt.saved_path, opt.layout))
        if curr_episode % 10 == 0:
            print("Process {}. Episode {} ".format(index, curr_episode))
        curr_episode += 1
        episode_reward = 0
        # Synchronize thread-specific parameters theta'=theta and theta'_v=theta_v
        # (copy global params to local params after every episode)
        local_model.load_state_dict(global_model.state_dict(), strict=True)
        # Follow gradients only after 'done' (end of episode)
        if done:
            h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
            c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        # Local steps
        for _ in range(opt.num_local_steps):
            curr_step += 1
            # Model prediction from state. Returns two functions:
            # * Action prediction (policy function) -> logits (array with every action-value)
            # * Value prediction (value function) -> value (single state-value)
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            value = value.clamp(-1., 1.)
            lstm_model = False
            if lstm_model:
                # The LSTM model returns data with one more dimension
                dim = 1
            else:
                dim = 0
            # Softmax over action-values
            policy = F.softmax(logits, dim=dim)
            # Log-softmax over action-values, to get the entropy of the policy
            log_policy = F.log_softmax(logits, dim=dim)
            # print('logits.size():', logits.size())
            # print('value.size():', value.size())
            # print('log_policy.size():', log_policy.size())
            # Entropy acts as exploration rate
            entropy = -(policy * log_policy).sum(dim, keepdim=True)
            # From Async Methods for Deep RL:
            """We also found that adding the entropy of the policy π to the
            objective function improved exploration by discouraging premature
            convergence to suboptimal deterministic policies. This technique was
            originally proposed by (Williams & Peng, 1991), who found that it was
            particularly helpful on tasks requiring hierarchical behavior."""
            # We sample one action given the policy probabilities
            m = Categorical(policy)
            action = m.sample().item()
            # Perform action_t according to policy pi
            # Receive reward r_t and new state s_t+1
            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)
            episode_reward += reward
            if opt.use_gpu:
                state = state.cuda()
            # If last global step, reset episode
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())
                print("Process {:2.0f}. acumR: {} ".format(index, episode_reward))
                if opt.use_gpu:
                    state = state.cuda()
            # Save state-value, log-policy, reward and entropy of
            # every state we visit, to gradient-descent later
            values.append(value)
            if lstm_model:
                # The LSTM model returns data with one more dimension
                log_policies.append(log_policy[0, action])
            else:
                log_policies.append(log_policy[action])
            rewards.append(reward)
            entropies.append(entropy)
            if done:
                # All local steps done.
                break

        # Save history every n episodes as statistics (just from one process)
        if index == 0:
            # sample_size = 100
            # hist_idx = (curr_episode - 1) % sample_size
            # if hist_idx == 0:
            #     reward_hist = np.zeros(sample_size)
            # reward_hist[hist_idx] = episode_reward
            reward_hist.append(episode_reward)
            r_mean = np.mean(reward_hist)
            r_median = np.median(reward_hist)
            r_std = np.std(reward_hist)
            stand_median = (r_median - r_mean) / (r_std + 1e-9)
            writer.add_scalar("Process_{}/Last100Statistics_mean".format(index), r_mean, curr_episode)
            writer.add_scalar("Process_{}/Last100Statistics_median".format(index), r_median, curr_episode)
            writer.add_scalar("Process_{}/Last100Statistics_std".format(index), r_std, curr_episode)
            writer.add_scalar("Process_{}/Last100Statistics_stand_median".format(index), stand_median, curr_episode)
        # end of history saving

        # Baseline reward standardization over the episode rewards.
        # Uncomment the prints to see how the rewards change.
        # if index == 0:  # only print first agent's process
        #     print("Rewards before:", rewards)
        mean_rewards = np.mean(rewards)
        std_rewards = np.std(rewards)
        rewards = (np.asarray(rewards) - mean_rewards) / (std_rewards + 1e-9)
        # if index == 0:
        #     print("Rewards after:", rewards)

        # Initialize R/G_t: discounted reward over local steps
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)
        # Standardize this reward estimation too
        # mean_rewards = np.mean([R, rewards])
        # std_rewards = np.std([R, rewards])
        # R = (R - mean_rewards) / (std_rewards + 1e-9)
        # Keep value estimations between -1 and 1
        R = R.clamp(-1., 1.)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
        # Gradient descent over the minibatch of local steps, from last to first step
        for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
            # Generalized Advantage Estimation (GAE)
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            # Accumulate discounted reward
            R = reward + opt.gamma * R
            # Accumulate gradients wrt parameters theta'
            actor_loss = actor_loss + log_policy * gae
            # Accumulate gradients wrt parameters theta'_v
            critic_loss = critic_loss + ((R - value) ** 2) / 2.
            entropy_loss = entropy_loss + entropy

        # Clamp critic loss value if too big
        # max_critic_loss = 1. / opt.lr
        # critic_loss = critic_loss.clamp(-max_critic_loss, max_critic_loss)
        # Total process loss
        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        # Clamp loss value if too big
        # max_loss = 2 * max_critic_loss
        # total_loss = total_loss.clamp(-max_loss, max_loss)
        # Saving logs for TensorBoard
        if index == 0:
            writer.add_scalar("Process_{}/Total_Loss".format(index), total_loss, curr_episode)
            writer.add_scalar("Process_{}/Acum_Reward".format(index), episode_reward, curr_episode)
            # writer.add_scalar("actor_{}/Loss".format(index), -actor_loss, curr_episode)
            # writer.add_scalar("critic_{}/Loss".format(index), critic_loss, curr_episode)
            # writer.add_scalar("entropyxbeta_{}/Loss".format(index), opt.beta * entropy_loss, curr_episode)
        # Zero the gradients
        optimizer.zero_grad()
        # Backward pass
        total_loss.backward()
        # Perform asynchronous update of theta and theta_v
        for local_param, global_param in zip(local_model.parameters(), global_model.parameters()):
            if global_param.grad is not None:
                # Shared params. No need to copy again. Updated on optimizer.
                break
            # First update to global_param
            global_param._grad = local_param.grad
        # Step in the direction of the gradient, for the GLOBAL parameters
        optimizer.step()

        # End of training
        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if index == 0:
                writer.close()
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
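# A standalone sketch of the backward Generalized Advantage Estimation (GAE)
# recursion used in the training loops above, shown on plain Python lists so the
# update is easier to follow in isolation. The function name and the example
# numbers below are illustrative only, not part of the original code.
def compute_gae(rewards, values, next_value, gamma=0.9, tau=1.0):
    advantages = []
    gae = 0.0
    for reward, value in zip(reversed(rewards), reversed(values)):
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = reward + gamma * next_value - value
        # A_t = delta_t + gamma * tau * A_{t+1}
        gae = delta + gamma * tau * gae
        advantages.insert(0, gae)
        next_value = value
    return advantages


# Example: three local steps with a bootstrapped value of 0.5 for the state
# reached after the last step.
print(compute_gae([1.0, 0.0, 1.0], [0.2, 0.1, 0.4], 0.5))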