def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions,
                           "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        print('x pos is ', info['x_pos'])
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(opt.layout)  # ,"{}/video_{}.mp4".format(opt.output_path, opt.layout))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
def main():
    # Create the game environment
    env = create_train_env(game="SuperMarioBros-Nes")
    print(env.observation_space.shape)
    print(env.action_space.n)
    obs = env.reset()
    while True:
        # Sample a random action (an int) from the action space
        action = env.action_space.sample()
        # Step the game
        obs, reward, terminal, info = env.step(action)
        # Display the stacked frames side by side
        obs = np.squeeze(obs)
        obses = obs[0]
        for i in range(1, obs.shape[0]):
            obses = np.hstack([obses, obs[i]])
        cv2.imshow('obses', obses)
        cv2.waitKey(1)
        env.render()
        print("=" * 50)
        print("action:", action)
        print("obs shape:", obs.shape)
        print("reward:", reward)
        print("terminal:", terminal)
        print("info:", info)
        if terminal:
            obs = env.reset()
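# The demo above hstacks obs[0..3], which suggests the wrapped environment
# returns a stack of four preprocessed frames with shape (1, 4, H, W).
# Below is a minimal, self-contained sketch of that kind of preprocessing
# (grayscale, resize, frame stack); it is an illustrative assumption, not
# the actual create_train_env implementation.
import cv2
import numpy as np


def preprocess_frame(frame, size=84):
    # Convert an RGB frame to a grayscale, resized, [0, 1]-scaled array.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (size, size), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0


def stack_frames(frames):
    # Stack the last four processed frames into a (1, 4, size, size) batch,
    # matching the observation shape printed by main() above.
    return np.expand_dims(np.stack(frames[-4:], axis=0), axis=0)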
def train(opt):
    torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    global_model = ActorCritic(num_states, num_actions)
    if opt.use_gpu:
        global_model.cuda()
    global_model.share_memory()
    if opt.load_from_previous_stage:
        if opt.stage == 1:
            previous_world = opt.world - 1
            previous_stage = 4
        else:
            previous_world = opt.world
            previous_stage = opt.stage - 1
        file_ = "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, previous_world, previous_stage)
        if os.path.isfile(file_):
            global_model.load_state_dict(torch.load(file_))
    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    local_train(0, opt, global_model, optimizer, True)
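# train() relies on GlobalAdam sharing its optimizer state between worker
# processes. A common implementation in A3C-style repos looks like the sketch
# below (an assumption following the usual pattern with older PyTorch
# versions; the project's own optimizer.py may differ): the running moments
# are moved into shared memory so every worker accumulates into the same buffers.
import torch


class GlobalAdam(torch.optim.Adam):
    def __init__(self, params, lr):
        super(GlobalAdam, self).__init__(params, lr=lr)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # Place the moment estimates in shared memory so that
                # asynchronous workers update the same tensors.
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()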
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type,
                                                    f"{opt.output_path}/video_{opt.world}_{opt.stage}.mp4")
    model = ActorCritic(num_states, num_actions)
    model.load_state_dict(torch.load(f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}",
                                     map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if info["flag_get"]:
            print(f"World {opt.world} stage {opt.stage} completed")
            break
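# The test()/train() helpers above consume an `opt` namespace. A hypothetical
# get_args() along these lines shows how such a namespace is typically built;
# the flag names mirror the attributes used above, but the real script may
# define them differently.
import argparse


def get_args():
    parser = argparse.ArgumentParser("A3C Super Mario Bros evaluation")
    parser.add_argument("--world", type=int, default=1)
    parser.add_argument("--stage", type=int, default=1)
    parser.add_argument("--action_type", type=str, default="complex")
    parser.add_argument("--saved_path", type=str, default="trained_models")
    parser.add_argument("--output_path", type=str, default="output")
    return parser.parse_args()


if __name__ == "__main__":
    opt = get_args()
    test(opt)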
def train(opt):
    torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.layout)
    # global_model = ActorCritic(num_states, num_actions)
    global_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        global_model.cuda()
    global_model.share_memory()
    if opt.load_previous_weights:
        file_ = "{}/gym-pacman_{}".format(opt.saved_path, opt.layout)
        if os.path.isfile(file_):
            print("Loading previous weights for %s..." % opt.layout, end=" ")
            global_model.load_state_dict(torch.load(file_))
            print("Done.")
        else:
            print("Can't load any previous weights for %s!" % opt.layout)
    # optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    optimizer = GlobalRMSProp(global_model.parameters(), lr=opt.lr)
    processes = []
    for index in range(opt.num_processes):
        # Multiprocessing async agents
        if index == 0:
            process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer, True))
        else:
            process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer))
        process.start()
        processes.append(process)
    # Local test simulation
    # process = mp.Process(target=local_test, args=(opt.num_processes, opt, global_model))
    # process.start()
    # processes.append(process)
    for process in processes:
        process.join()
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(
        opt.world, opt.stage, opt.action_type,
        "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = ActorCritic(1, num_actions)
    if torch.cuda.is_available():
        model_dict = torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
        model.load_state_dict(model_dict['net'])
        model.cuda()
        print("episode", model_dict['curr_episode'])
        print("time", model_dict['time'])
    else:
        model_dict = torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage),
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(model_dict['net'])
        print("episode", model_dict['curr_episode'])
        print("time", model_dict['time'])
    model.eval()
    env.reset()
    tiles = SMB.get_tiles_num(env.unwrapped.ram)
    tiles = process_tiles(tiles)
    state = torch.from_numpy(tiles).unsqueeze(0).unsqueeze(0).float()
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        tiles = SMB.get_tiles_num(env.unwrapped.ram)
        tiles = process_tiles(tiles)
        state = torch.from_numpy(tiles).unsqueeze(0).unsqueeze(0).float()
        env.render()
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        # Save the model whenever the level is completed
        if info['flag_get']:
            print("############### The model is finished. Saving the model ###############")
            torch.save(local_model.state_dict(),
                       "{}/ppo_full_finished_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, opt.saved_episode))
            exit()
        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def local_test(index, opt, global_model, start_time, curr_episode):
    info = {}
    info["flag_get"] = False
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while not info["flag_get"]:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Keep the info dict so the loop condition can see "flag_get"
        state, reward, done, info = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
    if info["flag_get"]:
        print("Level completed")
        end_time = timeit.default_timer()
        config_state = {
            'net': global_model.state_dict(),
            'curr_episode': curr_episode,
            'time': end_time - start_time,
        }
        torch.save(config_state,
                   "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
        return True
    else:
        env.close()
        return False
def train(opt):
    torch.manual_seed(123)
    # Prepare log directory
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    # Prepare saved models directory
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    # Prepare multiprocessing
    mp = _mp.get_context("spawn")
    # Create a training environment just to get the number
    # of inputs and outputs of the neural network
    _, num_states, num_actions = create_train_env(opt.layout)
    # Create neural network model
    global_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        global_model.cuda()
    # Share memory with processes for optimization later on
    global_model.share_memory()
    # Load trained agent weights
    if opt.load_previous_weights:
        file_ = "{}/gym-pacman_{}".format(opt.saved_path, opt.layout)
        if os.path.isfile(file_):
            print("Loading previous weights for %s..." % opt.layout, end=" ")
            global_model.load_state_dict(torch.load(file_))
            print("Done.")
        else:
            print("Can't load any previous weights for %s! Starting from scratch..." % opt.layout)
    # Define optimizer with shared weights. See 'optimizer.py'
    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    # Create async processes
    processes = []
    for index in range(opt.num_processes):
        # Multiprocessing async agents
        if index == 0:
            # Save weights to file only from this process
            process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer, True))
        else:
            process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer))
        process.start()
        processes.append(process)
    # Local test simulation (creates another model = more memory used)
    # process = mp.Process(target=local_test, args=(opt.num_processes, opt, global_model))
    # process.start()
    # processes.append(process)
    for process in processes:
        process.join()
def eval(args, global_model, num_states, num_actions):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the network model
    local_model = PPO(num_states, num_actions)
    # Use the GPU if available
    if torch.cuda.is_available():
        local_model.cuda()
    # Switch to evaluation mode
    local_model.eval()
    # Convert the first frame to a PyTorch tensor
    state = torch.from_numpy(env.reset())
    # Sync model parameters right from the start
    done = True
    curr_step = 0
    max_reward = 0
    while True:
        # Render the game window
        if args.show_play:
            env.render()
        curr_step += 1
        # Move the state to the GPU
        if torch.cuda.is_available():
            state = state.cuda()
        # Refresh the local parameters at the start of every episode
        if done:
            local_model.load_state_dict(global_model.state_dict())
            total_reward = 0
        # Predict action logits and the state value
        logits, value = local_model(state)
        # Pick the most probable action
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the game
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Reset the game when the episode ends
        if done:
            print("Episode score: %f" % total_reward)
            curr_step = 0
            state = env.reset()
            if max_reward < total_reward:
                torch.save(local_model.state_dict(),
                           "{}/model_best_{}.pth".format(args.saved_path, args.game))
                max_reward = total_reward
        # Convert the new state for the next step
        state = torch.from_numpy(state)
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(opt.layout)  # ,"{}/video_{}.mp4".format(opt.output_path, opt.layout))
    model = AC_NN_MODEL(num_states, num_actions)
    saved_model = "{}/gym-pacman_{}".format(opt.saved_path, opt.layout)
    print("Loading saved model: {}".format(saved_model))
    if not os.path.isfile(saved_model):
        try:
            import urllib.request
            print('File not found, downloading saved model...')
            url = 'https://github.com/LecJackS/saved_models/blob/master/gym_pacman/gym-pacman_random_mnih2016-24hs?raw=true'
            file_name = "gym-pacman_random_mnih2016-24hs"
            urllib.request.urlretrieve(url, '{}/{}'.format(opt.saved_path, file_name))
            print('Download done.')
        except Exception:
            print("Something wrong happened, couldn't download model")
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    game_count = 0
    while game_count <= opt.num_games_to_play:
        if done:
            h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
            c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
            env.reset()
            game_count += 1
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
def infer(args):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the model
    model = PPO(env.observation_space.shape[0], env.action_space.n)
    # Load the trained weights
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/model_best_{}.pth".format(args.saved_path, args.game)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/model_best_{}.pth".format(args.saved_path, args.game),
                                         map_location=lambda storage, loc: storage))
    # Switch to evaluation mode
    model.eval()
    # Get the first game frame
    state = torch.from_numpy(env.reset())
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Move the state to the GPU
        if torch.cuda.is_available():
            state = state.cuda()
        # Predict action logits and the state value
        logits, value = model(state)
        # Pick the most probable action
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the game
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Convert the new state for the next step
        state = torch.from_numpy(state)
        print(info)
        # The episode is over
        if done:
            print("Game over, score: %f" % total_reward)
            break
        time.sleep(0.05)
    env.render(close=True)
    env.close()
def train(opt):
    torch.manual_seed(SEED)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    global_model = ActorCritic(num_states, num_actions)
    global_model.share_memory()
    if opt.load_from_previous_stage:
        if opt.stage == 1:
            previous_world = opt.world - 1
            previous_stage = 4
        else:
            previous_world = opt.world
            previous_stage = opt.stage - 1
        file_ = f"{opt.saved_path}/a3c_super_mario_bros_{previous_world}_{previous_stage}"
        if os.path.isfile(file_):
            global_model.load_state_dict(torch.load(file_))
    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    processes = []
    for index in range(opt.num_processes):
        if index == 0:
            process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer, True))
        else:
            process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer))
        process.start()
        processes.append(process)
    process = mp.Process(target=local_test, args=(opt.num_processes, opt, global_model))
    process.start()
    processes.append(process)
    for process in processes:
        process.join()
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        # Save the model whenever the level is completed
        if info["flag_get"]:
            torch.save(local_model.state_dict(),
                       "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, curr_step))
        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def local_test(index, opt, global_model):
    torch.manual_seed(42 + index)
    env, num_states, num_actions = create_train_env(opt.layout, index=index)
    local_model = AC_NN_MODEL(num_states, num_actions)
    # Model we are going to test (turn off dropout, no backward pass)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            # Copy global model to local model
            local_model.load_state_dict(global_model.state_dict(), strict=False)
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
                c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        # Simple estimation: clamp the value to (-1, 1)
        value = value.clamp(-1., 1.)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        # Render as seen by the NN, but with colors
        render_miniature = True
        if render_miniature:
            env.render(mode='human', id=index)
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def test(opt):
    # torch.manual_seed(123)
    if not os.path.isdir(opt.output_path):
        os.makedirs(opt.output_path)
    env, num_states, num_actions = create_train_env(1, opt, "{}/test.mp4".format(opt.output_path))
    model = ActorCritic(num_states, num_actions)
    if opt.use_gpu and torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/a3c".format(opt.resume_path)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c".format(opt.resume_path),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset(False, False, True))
    round_done, stage_done, game_done = False, False, True
    num_action = 0
    while True:
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 256), dtype=torch.float)
            c_0 = torch.zeros((1, 256), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        num_action += 1
        state, reward, round_done, stage_done, game_done = env.step(action)
        state = torch.from_numpy(state)
        if round_done or stage_done:
            state = torch.from_numpy(env.reset(round_done, stage_done, game_done))
        if game_done or num_action == opt.max_steps:
            env.make_anim()
            print("Game over")
            break
def test(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    env = create_train_env(opt.level)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        if (done and info["lives"] != 0) or info["level"] == opt.level:
            torch.save(local_model.state_dict(),
                       "{}/ppo_contra_success_{}".format(opt.saved_path, info["lives"]))
        env.render()
        actions.append(action)
        if curr_step > opt.num_max_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(1, "{}/video.mp4".format(opt.output_path))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/a3c_street_fighter".format(opt.saved_path)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c_street_fighter".format(opt.saved_path),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset(False, False, True))
    round_done, stage_done, game_done = False, False, True
    while True:
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 1024), dtype=torch.float)
            c_0 = torch.zeros((1, 1024), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, round_done, stage_done, game_done = env.step(action)
        state = torch.from_numpy(state)
        if round_done or stage_done:
            state = torch.from_numpy(env.reset(round_done, stage_done, game_done))
        if game_done:
            print("Game over")
            break
def aa_test(opt):
    torch.manual_seed(123)
    print("{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type,
                                                    "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        print("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        # time.sleep(0.1)
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    env = create_train_env(opt.zone, opt.act,
                           output_path="{}/video_{}.mp4".format(opt.output_path,
                                                                STATES["{}-{}".format(opt.zone, opt.act)]))
    model = PPO(env.observation_space.shape[0], len(ACTION_MAPPING))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/PPO_SonicTheHedgehog_{}".format(
            opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)])))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/PPO_SonicTheHedgehog_{}".format(
            opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)]),
            map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if done and info["act"] == opt.act:
            print("Map {} is completed".format(STATES["{}-{}".format(opt.zone, opt.act)]))
            break
def local_train(index, opt, global_model, optimizer, save=False): torch.manual_seed(42 + index) if save: start_time = timeit.default_timer() if index == 0: # Path for tensorboard log process_log_path = "{}/process-{}".format(opt.log_path, index) writer = SummaryWriter( process_log_path) #, max_queue=1000, flush_secs=10) # Creates training environment for this particular process env, num_states, num_actions = create_train_env(opt.layout, index=index) # local_model keeps local weights for each async process local_model = AC_NN_MODEL(num_states, num_actions) if opt.use_gpu: local_model.cuda() # Tell the model we are going to use it for training local_model.train() # env.reset and get first state state = env.reset() #state = torch.from_numpy(env.reset()) if opt.use_gpu: state = state.cuda() done = True curr_step = 0 curr_episode = 0 # Keep track of min/max Gt and Actor Loss to clamp Critic and Actor max_Gt = 3. max_AL = 1. if index == 0: interval = 100 #reward_hist = np.zeros(interval) reward_hist = deque(maxlen=100) #queue_rewards = queue.Queue(maxsize=interval) record_tag = False while True: if save: # Save trained model at save_interval if curr_episode % opt.save_interval == 0 and curr_episode > 0: torch.save( global_model.state_dict(), "{}/gym-pacman_{}".format(opt.saved_path, opt.layout)) if curr_episode % 10 == 0: print("Process {}. Episode {} ".format(index, curr_episode)) curr_episode += 1 episode_reward = 0 # Synchronize thread-specific parameters theta'=theta and theta'_v=theta_v # (copy global params to local params (after every episode)) local_model.load_state_dict(global_model.state_dict(), strict=True) # Follow gradients only after 'done' (end of episode) if done: h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float) c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float) else: h_0 = h_0.detach() c_0 = c_0.detach() if opt.use_gpu: h_0 = h_0.cuda() c_0 = c_0.cuda() log_policies = [] values = [] rewards = [] entropies = [] # Local steps for _ in range(opt.num_local_steps): curr_step += 1 # Decay max_Gt over time to adjust to present Gt scale max_Gt = max_Gt * 0.99999 # Model prediction from state. Returns two functions: # * Action prediction (Policy function) -> logits (array with every action-value) # * Value prediction (Value function) -> value (single value state-value) logits, value, h_0, c_0 = local_model(state, h_0, c_0) # Simple estimation: between(-1,1) #value = value.clamp(min_Gt, max_Gt) # Softmax over action-values policy = F.softmax(logits, dim=1) # Log-softmax over action-values, to get the entropy of the policy log_policy = F.log_softmax(logits, dim=1) #print('0. policy----------: \n', policy) #print('1. logits----------: \n', logits) #print('2. log_policy------: \n', log_policy) # Entropy acts as exploration rate entropy = -(policy * log_policy).sum(1, keepdim=True) # From Async Methods for Deep RL: """ We also found that adding the entropy of the policy π to the objective function improved exploration by discouraging premature convergence to suboptimal deterministic poli- cies. 
This technique was originally proposed by (Williams & Peng, 1991), who found that it was particularly help- ful on tasks requiring hierarchical behavior.""" # We sample one action given the policy probabilities m = Categorical(policy) action = m.sample().item() # Perform action_t according to policy pi # Receive reward r_t and new state s_t+1 state, reward, done, _ = env.step(action) reward = reward / max_Gt episode_reward += reward if opt.record and index == 0: #save animation for each four-frame input save_image(state.permute(1, 0, 2, 3), filename='./snaps/process{}-{}.png'.format( index, curr_step), nrow=1) #,normalize=True) # Preprocess state: #state = preproc_state(np_state) # state to tensor #state = torch.from_numpy(state) # Render as seen by NN, but with colors if index < opt.num_processes_to_render: env.render(mode='human') if opt.use_gpu: state = state.cuda() # If last global step, reset episode if curr_step > opt.num_global_steps: done = True if done: curr_step = 0 state = env.reset() #state = preproc_state(np_state) print("Process {:2.0f}. acumR: {} ".format( index, episode_reward)) if opt.use_gpu: state = state.cuda() # Save state-value, log-policy, reward and entropy of # every state we visit, to gradient-descent later values.append(value) log_policies.append(log_policy[0, action]) rewards.append(reward) entropies.append(entropy) if done: # All local steps done. break # Save history every n episodes as statistics (just from one process) if index == 0: reward_hist.append(episode_reward) if True: #hist_idx==sample_size-1: r_mean = np.mean(reward_hist) r_median = np.median(reward_hist) r_std = np.std(reward_hist) stand_median = (r_median - r_mean) / (r_std + 1e-9) writer.add_scalar("Process_{}/Last100_mean".format(index), r_mean, curr_episode) writer.add_scalar("Process_{}/Last100_median".format(index), r_median, curr_episode) writer.add_scalar("Process_{}/Last100_std".format(index), r_std, curr_episode) writer.add_scalar( "Process_{}/Last100_stand_median".format(index), stand_median, curr_episode) # Normalize Rewards #mean_rewards = np.mean(rewards) #std_rewards = np.std(rewards) #rewards = (rewards - mean_rewards) / (std_rewards + 1e-9) # Initialize R/G_t: Discounted reward over local steps R = torch.zeros((1, 1), dtype=torch.float) if opt.use_gpu: R = R.cuda() if not done: _, R, _, _ = local_model(state, h_0, c_0) # Simple state-value estimation: between(-30, 30) #R = R.clamp(min_Gt, max_Gt) gae = torch.zeros((1, 1), dtype=torch.float) if opt.use_gpu: gae = gae.cuda() actor_loss = 0 critic_loss = 0 entropy_loss = 0 next_value = R # Gradiend descent over minibatch of local steps, from last to first step for value, log_policy, reward, entropy in list( zip(values, log_policies, rewards, entropies))[::-1]: # Generalized Advantage Estimator (GAE) gae = gae * opt.gamma * opt.tau gae = gae + reward + opt.gamma * next_value.detach( ) - value.detach() next_value = value # Accumulate discounted reward R = reward + opt.gamma * R # For normalization/clamp max_Gt = max(max_Gt, abs(R.detach().item())) # Accumulate gradients wrt parameters theta' #print('log_policy:', log_policy) #print('gae:', gae) actor_loss = actor_loss + log_policy * gae #print('actor_loss:', actor_loss) # For normalization/clamp max_AL = max(max_AL, abs(actor_loss.detach().item())) # Accumulate gradients wrt parameters theta'_v critic_loss = critic_loss + ((R - value)**2) / 2. 
entropy_loss = entropy_loss + entropy # Update and keep track of (min_Gt, max_Gt) for Critic range # as an exponential cummulative average #max_Gt = 0.495*max_Gt + 0.505*(max(1, R.item())-max_Gt)/(curr_episode) # Total process' loss #print('actor_loss',actor_loss) #print('critic_loss',critic_loss) #print('entropy_loss',opt.beta * entropy_loss) # Make sure that max update is about 1.0 (lr * critic_loss)<1, # so updates to weights are not excesive. # ie: lr=1e-4; max critic_loss == 1/1e-4 = 1e4 = 10000 # lr*loss == 0.0001*10000 == 1 (close to 1) critic_loss = critic_loss # Normalize actor loss actor_loss = actor_loss #max_AL # 3.*actor_loss funca bien con critic_loss sin modificar #print('actor_loss final:', actor_loss) total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss # Saving logs for TensorBoard if index == 0: writer.add_scalar("Process_{}/Total_Loss".format(index), total_loss, curr_episode) writer.add_scalar("Process_{}/actor_Loss".format(index), -actor_loss, curr_episode) writer.add_scalar("Process_{}/critic_Loss".format(index), critic_loss, curr_episode) writer.add_scalar("Process_{}/entropy_Loss".format(index), -opt.beta * entropy_loss, curr_episode) writer.add_scalar("Process_{}/Acum_Reward".format(index), episode_reward, curr_episode) writer.add_scalar("Process_{}/max_Gt".format(index), max_Gt, curr_episode) writer.add_scalar("Process_{}/max_AL".format(index), max_Gt, curr_episode) writer.add_scalar("Process_{}/Gt".format(index), R, curr_episode) #writer.add_scalar("actor_{}/Loss".format(index), -actor_loss, curr_episode) #writer.add_scalar("critic_{}/Loss".format(index), critic_loss, curr_episode) #writer.add_scalar("entropyxbeta_{}/Loss".format(index), opt.beta * entropy_loss, curr_episode) # Gradientes a cero optimizer.zero_grad() # Backward pass total_loss.backward() # Perform asynchronous update of theta and theta_v for local_param, global_param in zip(local_model.parameters(), global_model.parameters()): if global_param.grad is not None: # Shared params. No need to copy again. Updated on optimizer. break # First update to global_param global_param._grad = local_param.grad # Step en la direccion del gradiente, para los parametros GLOBALES optimizer.step() # Final del training if curr_episode == int(opt.num_global_steps / opt.num_local_steps): print("Training process {} terminated".format(index)) if index == 0: writer.close() if save: end_time = timeit.default_timer() print('The code runs for %.2f s ' % (end_time - start_time)) return return
def evaluate(opt, global_model, num_states, num_actions): torch.manual_seed(123) if opt.action_type == "right": actions = RIGHT_ONLY elif opt.action_type == "simple": actions = SIMPLE_MOVEMENT else: actions = COMPLEX_MOVEMENT savefile = opt.saved_path + '/PPO_test.csv' print(savefile) title = ['Steps', 'Time', 'TotalReward', "Flag"] with open(savefile, 'w', newline='') as sfile: writer = csv.writer(sfile) writer.writerow(title) print(opt.retina_resolution) env = create_train_env(actions, mp_wrapper=False, cortex_left=opt.cortex_left, cortex_right=opt.cortex_right, retina_resolution=opt.retina_resolution, use_retina=opt.retina) local_model = PPO(num_states, num_actions) if torch.cuda.is_available(): local_model.cuda() local_model.eval() state = torch.from_numpy(env.reset()) if torch.cuda.is_available(): state = state.cuda() done = True curr_step = 0 tot_step = 0 actions = deque(maxlen=opt.max_actions) tot_reward = 0 got_flag = 0 index = 0 while True: start_time = time.time() curr_step += 1 tot_step += 1 if done: local_model.load_state_dict(global_model.state_dict()) logits, value = local_model(state) policy = F.softmax(logits, dim=1) action = torch.argmax( policy).item() # This selects the best action to take state, reward, done, info = env.step(action) # im1 = state[0, 0, :, :] # im2 = state[0, 1, :, :] # im3 = state[0, 2, :, :] # im4 = state[0, 3, :, :] # res1 = cv2.resize(im1, dsize=(370, 370), interpolation=cv2.INTER_CUBIC) # im2 = state[0, 1, :, :] # res2 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC) # im3 = state[0, 2, :, :] # res3 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC) # im4 = state[0, 3, :, :] # res4 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC) # fig=plt.figure(figsize=(8, 8)) # columns = 2 # rows = 2 # fig.add_subplot(rows, columns, 1) # plt.imshow(im1) # fig.add_subplot(rows, columns, 2) # plt.imshow(im2) # fig.add_subplot(rows, columns, 3) # plt.imshow(im3) # fig.add_subplot(rows, columns, 4) # plt.imshow(im4) # plt.show() index += 1 tot_reward += reward # Uncomment following lines if you want to save model whenever level is completed if flag_get(info): print("Evaluate: Level Completed!") got_flag = 1 done = True torch.save( local_model.state_dict(), "{}/ppo_super_mario_bros_{}".format(opt.saved_path, curr_step)) # env.render() actions.append(action) if curr_step > opt.num_global_steps or actions.count( actions[0]) == actions.maxlen: # print("Evaluate: Time's up!") done = True if done: # print("Evaluate: Done!") ep_time = time.time() - start_time data = [ tot_step, "{:.4f}".format(ep_time), "{:.2f}".format(tot_reward), got_flag ] with open(savefile, 'a', newline='') as sfile: writer = csv.writer(sfile) writer.writerows([data]) curr_step = 0 got_flag = 0 tot_reward = 0 actions.clear() # time.sleep(10) # Sleep for 10 secs state = env.reset() state = torch.from_numpy(state) if torch.cuda.is_available(): state = state.cuda()
def local_train(index, opt, global_model, optimizer, save=False): # torch.manual_seed(123 + index) if save: start_time = timeit.default_timer() # writer = SummaryWriter(opt.log_path) if not opt.saved_path: if opt.game == "Supermario": saved_path = "{}_{}_{}_{}".format(opt.game, opt.num_sequence, opt.internal_reward, opt.world, opt.stage) else: saved_path = "{}_{}".format(opt.game, opt.num_sequence) else: saved_path = opt.saved_path if opt.game == "Supermario": env, num_states, num_actions = create_train_env( opt.world, opt.stage, opt.action_type, opt.final_step) else: env, num_states, num_actions = create_train_env_atari(opt.game, saved_path, output_path=None) local_model = ActorCritic_seq(num_states, num_actions, opt.num_sequence) if opt.use_gpu: local_model.cuda() local_model.train() state = torch.from_numpy(env.reset()) if opt.use_gpu: state = state.cuda() done = True curr_step = 0 curr_episode = 0 loss_matrix = [] Cum_reward1 = [] SCORE1 = [] X1 = [] Num_interaction1 = [] if opt.game == "Supermario": env1, num_states, num_actions = create_train_env( opt.world, opt.stage, opt.action_type, opt.final_step) else: env1, num_states, num_actions = create_train_env_atari( opt.game, saved_path, output_path=None) local_model1 = ActorCritic_seq(num_states, num_actions, opt.num_sequence) if opt.use_gpu: local_model1.cuda() local_model1.eval() Cum_reward2 = [] SCORE2 = [] X2 = [] Num_interaction2 = [] if opt.game == "Supermario": env2, num_states, num_actions = create_train_env( opt.world, opt.stage, opt.action_type, opt.final_step) else: env2, num_states, num_actions = create_train_env_atari( opt.game, saved_path, output_path=None) local_model2 = ActorCritic_seq(num_states, num_actions, opt.num_sequence) if opt.use_gpu: local_model2.cuda() local_model2.eval() Cum_reward3 = [] SCORE3 = [] X3 = [] Num_interaction3 = [] if opt.game == "Supermario": env3, num_states, num_actions = create_train_env( opt.world, opt.stage, opt.action_type, opt.final_step) else: env3, num_states, num_actions = create_train_env_atari( opt.game, saved_path, output_path=None) local_model3 = ActorCritic_seq(num_states, num_actions, opt.num_sequence) if opt.use_gpu: local_model3.cuda() local_model3.eval() while True: if save: if curr_episode % opt.save_interval == 0 and curr_episode > 0: if opt.game == 'Supermario': # torch.save(global_model.state_dict(), # "{}/a3c_seq_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage)) torch.save(global_model.state_dict(), saved_path + "/trained_model") else: torch.save(global_model.state_dict(), saved_path + "/trained_model") # print("Process {}. 
Episode {}".format(index, curr_episode),done) if curr_episode % opt.log_interval == 0: if opt.game == 'Supermario': # local_model1.load_state_dict(global_model.state_dict()) # Cum_reward1,X1,Num_interaction1,x_arrive_all_pro = local_test_iter(opt,env1,local_model1,Cum_reward1,X1,Num_interaction1,save) # local_model2.load_state_dict(global_model.state_dict()) # Cum_reward2,SCORE2,X2,Num_interaction2,x_arrive_all_pro = local_test_iter(opt,env2,local_model2,Cum_reward2,SCORE2,X2,Num_interaction2,videosave=False,action_max=False,gate_max=False) local_model2.load_state_dict(global_model.state_dict()) Cum_reward2, SCORE2, X2, Num_interaction2, x_arrive_all_max = local_test_iter( opt, env2, local_model2, Cum_reward2, SCORE2, X2, Num_interaction2, videosave=False, action_max=True, gate_max=True) local_model3.load_state_dict(global_model.state_dict()) Cum_reward3, SCORE3, X3, Num_interaction3, x_arrive_actionpro_gatemax = local_test_iter( opt, env3, local_model3, Cum_reward3, SCORE3, X3, Num_interaction3, videosave=False, action_max=False, gate_max=True) print(curr_episode, x_arrive_all_max, x_arrive_actionpro_gatemax) else: local_model1.load_state_dict(global_model.state_dict()) Cum_reward1, SCORE1, X1, Num_interaction1, x_arrive_all_pro = local_test_iter( opt, env1, local_model1, Cum_reward1, SCORE1, X1, Num_interaction1, videosave=False, action_max=False, gate_max=False) print(curr_episode, x_arrive_all_pro) curr_episode += 1 local_model.load_state_dict(global_model.state_dict()) # g_0_cnt = 0 if done: g_0_ini = torch.ones((1)) h_0 = torch.zeros((1, 512), dtype=torch.float) c_0 = torch.zeros((1, 512), dtype=torch.float) g_0 = torch.zeros((1, opt.num_sequence), dtype=torch.float) cum_r = 0 g_0_cnt = 0 else: h_0 = h_0.detach() c_0 = c_0.detach() # g_0 = g_0.detach() if opt.use_gpu: h_0 = h_0.cuda() c_0 = c_0.cuda() g_0_ini = g_0_ini.cuda() g_0 = g_0.cuda() log_policies = [] log_gates = [] values = [] rewards = [] reward_internals = [] entropies = [] for aaaaa in range(opt.num_local_steps): curr_step += 1 g_pre = g_0 g_pre_cnt = g_0_cnt logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag1, gate_flag2 = local_model( state, h_0, c_0, g_0, g_0_ini) policy = F.softmax(logits, dim=1) log_policy = F.log_softmax(logits, dim=1) entropy = -(policy * log_policy).sum(1, keepdim=True) m = Categorical(policy) action = m.sample().item() state, reward, raw_reward, done, info = env.step(action) reward_internal = reward if g_0_ini == 1: log_gate = torch.zeros((), dtype=torch.float) if opt.use_gpu: log_gate = log_gate.cuda() elif gate_flag1: # log_gate = log_gate log_gate = torch.zeros((), dtype=torch.float) elif gate_flag2: # log_gate = log_gate + torch.log(1-g_pre[0,g_pre_cnt]) log_gate = torch.log(1 - g_pre[0, g_pre_cnt]) else: # log_gate = log_gate+torch.log(g_0[0,g_0_cnt-1]) log_gate = torch.log(g_0[0, g_0_cnt - 1]) if reward > 0: reward_internal = reward + opt.internal_reward g_0_ini = torch.zeros((1)) if opt.use_gpu: g_0_ini = g_0_ini.cuda() # if save: # env.render() # print(reward) # time.sleep(1) state = torch.from_numpy(state) if opt.use_gpu: state = state.cuda() if curr_step > opt.num_global_steps: done = True print('max glabal step achieve') if done: curr_step = 0 env.reset() if opt.start_initial == 'random': for i in range(opt.start_interval): state, reward, _, done, info = env.step( env.action_space.sample()) if done: env.reset() state = torch.from_numpy(state) else: state = torch.from_numpy(env.reset()) if opt.use_gpu: state = state.cuda() values.append(value) log_policies.append(log_policy[0, action]) 
log_gates.append(log_gate) rewards.append(reward) reward_internals.append(reward_internal) entropies.append(entropy) cum_r += reward if done: break # print(log_policies,log_gates) R = torch.zeros((1, 1), dtype=torch.float) if opt.use_gpu: R = R.cuda() if not done: _, R, _, _, _, _, gate_flag1, gate_flag2 = local_model( state, h_0, c_0, g_0, g_0_ini, gate_update=False) gae = torch.zeros((1, 1), dtype=torch.float) if opt.use_gpu: gae = gae.cuda() actor_loss = 0 critic_loss = 0 entropy_loss = 0 # next_value = R # for value, log_policy, log_gate, reward, reward_internal, entropy in list(zip(values, log_policies, log_gates, rewards,reward_internals, entropies))[::-1]: # gae = gae * opt.gamma * opt.tau # gae = gae + reward_internal + opt.gamma * next_value.detach() - value.detach() # next_value = value # actor_loss = actor_loss + (log_policy+log_gate) * gae # R = R * opt.gamma + reward # critic_loss = critic_loss + (R - value) ** 2 / 2 # entropy_loss = entropy_loss + entropy # estimate internal reward directly if not (gate_flag1 or gate_flag2): if R > 0: R = R + opt.internal_reward next_value = R for value, log_policy, log_gate, reward, reward_internal, entropy in list( zip(values, log_policies, log_gates, rewards, reward_internals, entropies))[::-1]: gae = gae * opt.gamma * opt.tau gae = gae + reward_internal + opt.gamma * next_value.detach( ) - value.detach() next_value = value actor_loss = actor_loss + (log_policy + log_gate) * gae R = R * opt.gamma + reward_internal critic_loss = critic_loss + (R - value)**2 / 2 entropy_loss = entropy_loss + entropy # estimate external reward # next_value = R # for value, log_policy, log_gate, reward, reward_internal, entropy in list(zip(values, log_policies, log_gates, rewards,reward_internals, entropies))[::-1]: # gae = gae * opt.gamma * opt.tau # gae = gae + reward_internal-0.01* + opt.gamma * next_value.detach() - value.detach() # next_value = value # actor_loss = actor_loss + (log_policy+log_gate) * gae # R = R * opt.gamma + reward # critic_loss = critic_loss + (R - value) ** 2 / 2 # entropy_loss = entropy_loss + entropy if opt.value_loss_coef: total_loss = -actor_loss + critic_loss * opt.value_loss_coef - opt.beta * entropy_loss else: total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss # writer.add_scalar("Train_{}/Loss".format(index), total_loss, curr_episode) optimizer.zero_grad() total_loss.backward(retain_graph=True) if opt.max_grad_norm: torch.nn.utils.clip_grad_norm_(local_model.parameters(), opt.max_grad_norm) loss_matrix.append(total_loss.detach().cpu().numpy()) if curr_episode % opt.save_interval == 0: # print('aaaaaaaaaaa',X,Cum_reward) if opt.game == 'Supermario': np.save(saved_path + "/X1{}".format(index), X1) np.save(saved_path + "/X2{}".format(index), X2) np.save(saved_path + "/X3{}".format(index), X3) np.save(saved_path + "/loss{}".format(index), loss_matrix) np.save(saved_path + "/Cum_reward1{}".format(index), Cum_reward1) np.save(saved_path + "/SCORE1{}".format(index), SCORE1) np.save(saved_path + "/Num_interaction1{}".format(index), Num_interaction1) np.save(saved_path + "/Cum_reward2{}".format(index), Cum_reward2) np.save(saved_path + "/SCORE2{}".format(index), SCORE2) np.save(saved_path + "/Num_interaction2{}".format(index), Num_interaction2) np.save(saved_path + "/Cum_reward3{}".format(index), Cum_reward3) np.save(saved_path + "/SCORE3{}".format(index), SCORE3) np.save(saved_path + "/Num_interaction3{}".format(index), Num_interaction3) for local_param, global_param in zip(local_model.parameters(), 
global_model.parameters()): if global_param.grad is not None: break global_param._grad = local_param.grad optimizer.step() if curr_episode == int(opt.num_global_steps / opt.num_local_steps): print("Training process {} terminated".format(index)) if save: end_time = timeit.default_timer() print('The code runs for %.2f s ' % (end_time - start_time)) return
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    info = {}
    info["flag_get"] = False
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
    local_model.train()
    state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0
    while True:
        if save:
            # if curr_episode % opt.save_interval == 0 and curr_episode > 0:
            #     torch.save(global_model.state_dict(),
            #                "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
            print("Process {}. Episode {}".format(index, curr_episode))
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
        log_policies = []
        values = []
        rewards = []
        entropies = []
        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            m = Categorical(policy)
            action = m.sample().item()
            state, reward, done, info = env.step(action)
            state = torch.from_numpy(state)
            if opt.use_gpu:
                state = state.cuda()
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())
                if opt.use_gpu:
                    state = state.cuda()
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)
            if done:
                break
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
        for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy
        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        writer.add_scalar("Train_{}/Loss".format(index), total_loss, curr_episode)
        optimizer.zero_grad()
        total_loss.backward()
        for local_param, global_param in zip(local_model.parameters(), global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad
        optimizer.step()
        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
        if curr_episode % opt.save_interval == 0:
            # if info["flag_get"]:
            if local_test(opt.num_processes, opt, global_model, start_time, curr_episode):
                break
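# The loss loop above accumulates the generalized advantage estimate and the
# discounted return by walking the rollout backwards. A minimal standalone
# sketch of that recursion (pure tensor math, detached from the model; the
# default gamma/tau/beta values here are assumptions) can make the update
# easier to verify in isolation:
import torch


def a3c_losses(values, log_policies, rewards, entropies, R, gamma=0.9, tau=1.0, beta=0.01):
    # values, log_policies, entropies: per-step tensors collected during the rollout
    # rewards: list of floats; R: bootstrap value of the last state (zeros if done)
    gae = torch.zeros_like(R)
    actor_loss, critic_loss, entropy_loss = 0, 0, 0
    next_value = R
    for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
        # GAE: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), discounted by gamma * tau
        gae = gae * gamma * tau + reward + gamma * next_value.detach() - value.detach()
        next_value = value
        actor_loss = actor_loss + log_policy * gae
        # n-step discounted return used as the critic target
        R = reward + gamma * R
        critic_loss = critic_loss + (R - value) ** 2 / 2
        entropy_loss = entropy_loss + entropy
    return -actor_loss + critic_loss - beta * entropy_loss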
def local_test_certain(index, opt, global_model):
    torch.manual_seed(123 + index)
    if opt.game == "Supermario":
        env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        if not opt.saved_path:
            saved_path = "{}_{}_{}_{}".format(opt.game, opt.num_sequence, opt.internal_reward, opt.lr)
        env, num_states, num_actions = create_train_env_atari(opt.game, saved_path, output_path=None)
    local_model = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    local_model.eval()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    Cum_reward = []
    X = []
    i = 0
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
                g_0_ini = torch.ones((1))
                state = torch.from_numpy(env.reset())
                cum_r = 0
                g_0 = torch.zeros((1, opt.num_sequence), dtype=torch.float)
                score = 0
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag, _ = local_model(
                state, h_0, c_0, g_0, g_0_ini, certain=True)
        # print(g_0, g_0_cnt)
        g_0_ini = torch.zeros((1))
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, raw_reward, done, info = env.step(action)
        score += raw_reward
        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        cum_r = cum_r + reward
        if done:
            i = i + 1
            curr_step = 0
            actions.clear()
            state = env.reset()
            if opt.game == "Supermario":
                x = info['x_pos']
            else:
                x = score
            print(i, 'test_certain', x)
            X.append(x)
            Cum_reward.append(cum_r)
        state = torch.from_numpy(state)
        if i % 100 == 0:
            np.save("{}/Cum_reward_test_certain".format(opt.saved_path), Cum_reward)
            np.save("{}/X_test_certain".format(opt.saved_path), X)
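# local_test_certain() periodically dumps Cum_reward_test_certain.npy and
# X_test_certain.npy. A small helper for inspecting those arrays offline is
# sketched below; the file names come from the code above, while the plotting
# choices are our own assumption.
import numpy as np
import matplotlib.pyplot as plt


def plot_certain_test(saved_path):
    cum_reward = np.load("{}/Cum_reward_test_certain.npy".format(saved_path))
    x_pos = np.load("{}/X_test_certain.npy".format(saved_path))
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(cum_reward)
    ax1.set_ylabel("cumulative reward")
    ax2.plot(x_pos)
    ax2.set_ylabel("x position / score")
    ax2.set_xlabel("evaluation episode")
    fig.tight_layout()
    plt.show()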
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
    # Path for the TensorBoard log of this process
    process_log_path = "{}/process-{}".format(opt.log_path, index)
    writer = SummaryWriter(process_log_path)  # max_queue=1000, flush_secs=10
    # Create the training environment for this particular process
    env, num_states, num_actions = create_train_env(opt.layout, index=index)
    # local_model keeps the local weights for each async process
    local_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
    # Tell the model we are going to use it for training
    local_model.train()
    # env.reset and get the first state
    state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0
    while True:
        if save:
            # Save the trained model every save_interval episodes
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(global_model.state_dict(),
                           "{}/gym-pacman_{}".format(opt.saved_path, opt.layout))
            print("Process {}. Episode {} ".format(index, curr_episode), end="\r")
        curr_episode += 1
        # Synchronize thread-specific parameters theta' = theta and theta'_v = theta_v
        # (copy global params to local params after every episode)
        local_model.load_state_dict(global_model.state_dict())
        # Reset the LSTM state only after 'done' (end of episode)
        if done:
            h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
            c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        # Local steps
        for _ in range(opt.num_local_steps):
            curr_step += 1
            # Model prediction from the state. Returns two heads:
            # * policy head -> logits (one value per action)
            # * value head  -> value  (a single state-value)
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            # Softmax over the action-values
            policy = F.softmax(logits, dim=1)
            # Log-softmax over the action-values, to get the entropy of the policy
            log_policy = F.log_softmax(logits, dim=1)
            # Entropy acts as an exploration bonus.
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            # From "Asynchronous Methods for Deep RL": "We also found that adding
            # the entropy of the policy pi to the objective function improved
            # exploration by discouraging premature convergence to suboptimal
            # deterministic policies. This technique was originally proposed by
            # (Williams & Peng, 1991), who found that it was particularly helpful
            # on tasks requiring hierarchical behavior."
            # Sample one action from the policy probabilities
            m = Categorical(policy)
            action = m.sample().item()
            # Perform action a_t according to policy pi;
            # receive reward r_t and new state s_{t+1}
            state, reward, done, _ = env.step(action)
            # Render as seen by the NN, but with colors
            if index < opt.num_processes_to_render:
                env.render(mode='human', id=index)
            # state to tensor
            state = torch.from_numpy(state)
            if opt.use_gpu:
                state = state.cuda()
            # If this was the last global step, force the end of the episode
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())
                if opt.use_gpu:
                    state = state.cuda()
            # Save the state-value, log-policy, reward and entropy of every state
            # we visit, for the gradient step later
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)
            if done:
                # All local steps done.
                break

        # Baseline reward standardization over the episode rewards
        mean_rewards = np.mean(rewards)
        std_rewards = np.std(rewards)
        rewards = (rewards - mean_rewards) / (std_rewards + 1e-9)
        # Initialize R/G_t: discounted reward over the local steps
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            # Bootstrap from the value of the last state, standardized as well
            _, R, _, _ = local_model(state, h_0, c_0)
            R = (R - mean_rewards) / (std_rewards + 1e-9)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
        # Gradient descent over the minibatch of local steps, from last to first
        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            # Generalized Advantage Estimation (GAE)
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            # Accumulate the discounted reward
            R = reward + opt.gamma * R
            # Accumulate gradients wrt parameters theta'
            actor_loss = actor_loss + log_policy * gae
            # Accumulate gradients wrt parameters theta'_v
            critic_loss = critic_loss + ((R - value) ** 2) / 2.
            entropy_loss = entropy_loss + entropy
        # Clamp the critic loss if it gets too big
        max_critic_loss = 1. / opt.lr
        critic_loss = critic_loss.clamp(-max_critic_loss, max_critic_loss)
        # Total loss for this process
        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        # Clamp the total loss if it gets too big
        max_loss = 2 * max_critic_loss
        total_loss = total_loss.clamp(-max_loss, max_loss)
        # Logs for TensorBoard
        writer.add_scalar("Total_{}/Loss".format(index), total_loss, curr_episode)
        # Reset gradients
        optimizer.zero_grad()
        # Backward pass
        total_loss.backward()
        # Perform the asynchronous update of theta and theta_v
        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                # Shared params already hold gradients; the optimizer updates them.
                break
            # First update to global_param
            global_param._grad = local_param.grad
        # Step in the gradient direction, for the GLOBAL parameters
        optimizer.step()
        # End of training
        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            writer.close()
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
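As a reading aid, the reversed loop above implements n-step discounted returns together with Generalized Advantage Estimation, with opt.tau playing the role of the GAE lambda. Written out (before the clamping applied in the code), each step t processed from last to first contributes:

\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
\hat{A}_t = \delta_t + \gamma \tau \, \hat{A}_{t+1}
R_t = r_t + \gamma R_{t+1}
\mathcal{L} = -\sum_t \log \pi(a_t \mid s_t)\, \hat{A}_t + \tfrac{1}{2} \sum_t \big(R_t - V(s_t)\big)^2 - \beta \sum_t H\big(\pi(\cdot \mid s_t)\big)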
def local_train(index, opt, global_model, global_icm, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(index + 1)
    local_model = ActorCritic(num_states, num_actions)
    local_icm = IntrinsicCuriosityModule(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
        local_icm.cuda()
    local_model.train()
    local_icm.train()
    inv_criterion = nn.CrossEntropyLoss()
    fwd_criterion = nn.MSELoss()
    state = torch.from_numpy(env.reset(False, False, True))
    if opt.use_gpu:
        state = state.cuda()
    round_done, stage_done, game_done = False, False, True
    curr_step = 0
    curr_episode = 0
    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(global_model.state_dict(),
                           "{}/a3c_street_fighter".format(opt.saved_path))
                torch.save(global_icm.state_dict(),
                           "{}/icm_street_fighter".format(opt.saved_path))
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 1024), dtype=torch.float)
            c_0 = torch.zeros((1, 1024), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        inv_losses = []
        fwd_losses = []

        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            m = Categorical(policy)
            action = m.sample().item()
            next_state, reward, round_done, stage_done, game_done = env.step(action)
            next_state = torch.from_numpy(next_state)
            if opt.use_gpu:
                next_state = next_state.cuda()
            # One-hot encode the action for the curiosity module
            action_oh = torch.zeros((1, num_actions))
            action_oh[0, action] = 1
            if opt.use_gpu:
                action_oh = action_oh.cuda()
            pred_logits, pred_phi, phi = local_icm(state, next_state, action_oh)
            # Inverse loss: predict the taken action from (s_t, s_{t+1})
            if opt.use_gpu:
                inv_loss = inv_criterion(pred_logits, torch.tensor([action]).cuda())
            else:
                inv_loss = inv_criterion(pred_logits, torch.tensor([action]))
            # Forward loss: predict phi(s_{t+1}) from phi(s_t) and the action
            fwd_loss = fwd_criterion(pred_phi, phi) / 2
            # Intrinsic (curiosity) reward is the scaled forward-model error
            intrinsic_reward = opt.eta * fwd_loss.detach()
            reward += intrinsic_reward
            if curr_step > opt.num_global_steps:
                round_done, stage_done, game_done = False, False, True
            if round_done or stage_done or game_done:
                curr_step = 0
                next_state = torch.from_numpy(env.reset(round_done, stage_done, game_done))
                if opt.use_gpu:
                    next_state = next_state.cuda()
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)
            inv_losses.append(inv_loss)
            fwd_losses.append(fwd_loss)
            state = next_state
            if round_done or stage_done or game_done:
                break

        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not (round_done or stage_done or game_done):
            _, R, _, _ = local_model(state, h_0, c_0)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        curiosity_loss = 0
        next_value = R
        for value, log_policy, reward, entropy, inv, fwd in list(
                zip(values, log_policies, rewards, entropies,
                    inv_losses, fwd_losses))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy
            curiosity_loss = curiosity_loss + (1 - opt.beta) * inv + opt.beta * fwd

        total_loss = opt.lambda_ * (-actor_loss + critic_loss
                                    - opt.sigma * entropy_loss) + curiosity_loss
        writer.add_scalar("Train_{}/Loss".format(index), total_loss, curr_episode)
        if save:
            print("Process {}. Episode {}. Loss: {}".format(index, curr_episode, total_loss))
        optimizer.zero_grad()
        total_loss.backward()

        # Asynchronously push gradients of both the policy/value net and the ICM
        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad
        for local_param, global_param in zip(local_icm.parameters(),
                                             global_icm.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
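For reference, the intrinsic reward and the combined A3C+ICM objective that the loop above accumulates can be written out as follows, where L^{inv} is the cross-entropy of the inverse model, phi is the ICM feature embedding, and the 1/2 factor and mean-squared-error convention come straight from the code:

L^{fwd}_t = \tfrac{1}{2}\, \mathrm{MSE}\big(\hat{\phi}(s_{t+1}),\, \phi(s_{t+1})\big), \qquad r^{i}_t = \eta\, L^{fwd}_t, \qquad r_t = r^{e}_t + r^{i}_t
\mathcal{L} = \lambda \Big( -\sum_t \log \pi(a_t \mid s_t)\, \hat{A}_t + \tfrac{1}{2} \sum_t (R_t - V(s_t))^2 - \sigma \sum_t H_t \Big) + \sum_t \big[ (1-\beta)\, L^{inv}_t + \beta\, L^{fwd}_t \big]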
def test(opt):
    viewer = rendering.SimpleImageViewer()
    viewer.width = 800 * 2
    viewer.height = 600 * 2  # 1920x1080
    viewer.window = pyglet.window.Window(width=viewer.width, height=viewer.height,
                                         resizable=True)
    torch.manual_seed(123)
    if opt.output_path is not None:
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type,
            "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    else:
        env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                        opt.action_type, None)
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(torch.load(
            "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load(
            "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage),
            map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    max_x_pos = 0
    max_x_pos_counter = 0
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            print('done')
            max_x_pos = 0
            max_x_pos_counter = 0
            env.reset()
            done = False
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = int(torch.argmax(policy).item())
        state, reward, done, info = env.step(action)
        rgb = env.render('rgb_array')
        state = torch.from_numpy(state)
        viewer.imshow(rgb)
        # Slow down rendering while the agent is still making progress
        if max_x_pos_counter < 50:
            time.sleep(0.06)
        # Count steps without forward progress and stop the episode when stuck
        if reward < 0:
            max_x_pos_counter += 1
        if max_x_pos_counter > 150:
            print('no progress, stopping')
            done = True
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            done = True
            # Keep a copy of the checkpoint that finished the stage
            copyfile("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage),
                     "{}/a3c_super_mario_bros_{}_{}_{}".format(opt.saved_path, info["world"],
                                                               info["stage"], random.random()))
        print(reward, COMPLEX_MOVEMENT[action])
    print('done testing')
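The test routine above expects an opt namespace with at least world, stage, action_type, saved_path and output_path. A minimal, hypothetical entry point for it (the flag names are assumptions; they only need to match the attributes read above):

import argparse

def get_args():
    parser = argparse.ArgumentParser("Evaluate a trained A3C agent on Super Mario Bros")
    parser.add_argument("--world", type=int, default=1)
    parser.add_argument("--stage", type=int, default=1)
    parser.add_argument("--action_type", type=str, default="complex")
    parser.add_argument("--saved_path", type=str, default="trained_models")
    parser.add_argument("--output_path", type=str, default=None)  # set to record an .mp4
    return parser.parse_args()

if __name__ == "__main__":
    test(get_args())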