def test(rank, args, shared_model):
    """Run an endless greedy evaluation loop against the shared model.

    At the start of every episode the local model is refreshed from
    ``shared_model``; at the end of every episode the elapsed time, total
    reward and length are printed and the loop sleeps for 60 seconds.

    Args:
        rank: Integer offset added to ``args.seed`` for the torch and
            environment seeds.
        args: Settings object; ``seed``, ``env_name`` and
            ``max_episode_length`` are read.
        shared_model: Module whose ``state_dict`` is copied into the local
            evaluation model.

    This function never returns.
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(1, env.action_space)
    model.eval()

    state = env.reset()
    state = E.process_frame42(state)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model and reset the recurrent state.
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)

        # `volatile=True` no longer disables autograd; no_grad() does.
        # Tensors produced here carry no graph, so hx/cx need no detach.
        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        # keepdim=True keeps the (1, 1) shape that action[0, 0] relies on.
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        state = E.process_frame42(state)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # End the episode when the last `maxlen` actions are all identical.
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            state = E.process_frame42(state)
            time.sleep(60)

        state = torch.from_numpy(state)
def test(n_episodes=5, name='LunarLander_ONE.pth', render=True, save_gif=False):
    """Play a pre-trained policy on LunarLander-v2 and print episode rewards.

    Args:
        n_episodes: Number of episodes to play.
        name: File name of the weights inside ``./preTrained/``.
        render: When true, render the environment after every step.
        save_gif: When true, save each frame as ``./gif/<t>.jpg``.

    The environment is closed even if an episode raises.
    """
    env = gym.make('LunarLander-v2')
    try:
        policy = ActorCritic()
        policy.load_state_dict(torch.load('./preTrained/{}'.format(name)))

        for i_episode in range(1, n_episodes + 1):
            state = env.reset()
            running_reward = 0
            # Each episode is capped at 10000 steps.
            for t in range(10000):
                action = policy(state)
                state, reward, done, _ = env.step(action)
                running_reward += reward
                if render:
                    env.render()
                if save_gif:
                    img = env.render(mode='rgb_array')
                    img = Image.fromarray(img)
                    img.save('./gif/{}.jpg'.format(t))
                if done:
                    break
            print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    finally:
        env.close()
def test(rank, args, shared_model, counter):
    """Greedy evaluation worker that reports progress after every episode.

    Args:
        rank: Integer offset added to ``args.seed`` for seeding.
        args: Settings object; ``seed``, ``env_name``, ``max_steps`` and
            ``max_episode_length`` are read.
        shared_model: Module whose weights are copied into the local model
            whenever an episode starts.
        counter: Shared step counter; only ``counter.value`` is read.

    Once an episode boundary is reached with ``counter.value`` above
    ``args.max_steps``, ``test_final`` and ``save_model`` are called and the
    process exits.
    """
    seed = args.seed + rank
    torch.manual_seed(seed)

    env = create_atari_env(args.env_name)
    env.seed(seed)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = torch.from_numpy(env.reset())
    reward_sum = 0
    episode_length = 0
    done = True
    start_time = time.time()

    # Sliding window of recent actions, used to detect a stuck agent.
    recent_actions = deque(maxlen=100)

    while True:
        episode_length += 1

        if done:
            # Episode boundary: pull the newest weights.
            model.load_state_dict(shared_model.state_dict())
            if counter.value > args.max_steps:
                test_final(shared_model, env, args)
                save_model(shared_model, args)
                exit()

        with torch.no_grad():
            value, logit = model(state.unsqueeze(0))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        chosen = action[0, 0]

        state, reward, done, _ = env.step(chosen)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # A full window of identical actions ends the episode.
        recent_actions.append(chosen)
        if recent_actions.count(recent_actions[0]) == recent_actions.maxlen:
            done = True

        if done:
            summary = (
                "Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}"
            )
            print(summary.format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value,
                counter.value / (time.time() - start_time),
                reward_sum,
                episode_length))
            reward_sum = 0
            episode_length = 0
            recent_actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
def test(rank, params, shared_model):
    """Run an endless greedy evaluation loop with video recording enabled.

    Args:
        rank: Integer offset added to ``params.seed`` for seeding.
        params: Settings object; ``seed`` and ``env_name`` are read.
        shared_model: Module whose weights are copied into the local model
            at the start of every episode.

    After each episode the elapsed time, reward and length are printed and
    the loop sleeps for one minute. This function never returns.
    """
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()  # this model is only evaluated, never trained
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            # Sync with the shared model and reset the recurrent state.
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        # `volatile=True` no longer disables autograd; no_grad() does.
        # Tensors produced here carry no graph, so hx/cx need no detach.
        with torch.no_grad():
            value, action_value, (hx, cx) = model(
                (state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(action_value, dim=-1)
        # No exploration: play the most probable action. keepdim=True keeps
        # the (1, 1) shape that action[0, 0] relies on.
        action = prob.max(1, keepdim=True)[1].numpy()
        # NOTE: no maximum episode length is enforced in this loop.
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            # One-minute break before the next evaluation episode.
            time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, args, shared_model, counter):
    """Greedy evaluation worker for the custom ``Env`` environment.

    Args:
        rank: Integer offset added to ``args.seed`` for seeding.
        args: Settings object; ``seed`` and ``max_episode_length`` are read.
        shared_model: Module whose weights are saved once to ``t.pkl`` and
            copied into the local model at every episode start.
        counter: Shared step counter; only ``counter.value`` is read.

    This function never returns.
    """
    seed = args.seed + rank
    torch.manual_seed(seed)
    torch.save(shared_model.state_dict(), 't.pkl')

    env = Env(seed)
    model = ActorCritic(1, env.action_space)
    model.eval()

    state = torch.from_numpy(env.reset())
    reward_sum = 0
    episode_length = 0
    done = True
    start_time = time.time()

    # Sliding window of recent actions, used to detect a stuck agent.
    recent_actions = deque(maxlen=500)

    while True:
        episode_length += 1

        if done:
            # Episode boundary: pull the newest weights.
            model.load_state_dict(shared_model.state_dict())

        with torch.no_grad():
            net_input = state.unsqueeze(0).type(torch.FloatTensor)
            value, logit = model(net_input)
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        print(action)
        chosen = action[0, 0]

        # This environment's step() returns three values.
        state, reward, done = env.step(chosen)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # A full window of identical actions ends the episode.
        recent_actions.append(chosen)
        if recent_actions.count(recent_actions[0]) == recent_actions.maxlen:
            done = True

        if done:
            summary = "Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}"
            print(summary.format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value,
                counter.value / (time.time() - start_time),
                reward_sum,
                episode_length))
            reward_sum = 0
            episode_length = 0
            recent_actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
def run(args):
    """Build the model and optimiser, then train and/or test per ``args``.

    Args:
        args: Settings object. Reads ``lr``, ``alpha``, ``weight_decay``,
            ``momentum``, ``load_fp``, ``train``, ``num_procs``, ``verbose``,
            ``save_fp`` and ``test``. ``num_procs == -1`` is replaced in
            place by ``mp.cpu_count()``.

    Checkpoints written here contain only ``model_state_dict``. Loading
    therefore restores the optimiser state only when the checkpoint
    actually provides ``optimizer_state_dict``.
    """
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(),
                        lr=args.lr,
                        alpha=args.alpha,
                        eps=1e-8,
                        weight_decay=args.weight_decay,
                        momentum=args.momentum,
                        centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        # The save path below does not store the optimiser state, so an
        # unconditional lookup would raise KeyError on our own checkpoints.
        optimizer_state = checkpoint.get('optimizer_state_dict')
        if optimizer_state is not None:
            opt.load_state_dict(optimizer_state)

    if args.train:
        start = time.time()

        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()

        processes = []
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            # Only the model weights are persisted.
            torch.save({'model_state_dict': model.state_dict()},
                       args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
def test(shared_model, render=0):
    """Run an endless greedy evaluation loop, optionally rendering frames.

    Args:
        shared_model: Module whose weights are copied into the local model
            at the start of every episode.
        render: When equal to 1, render each step and sleep 0.03 seconds.

    Episodes are capped at 10000 steps. After each episode a summary is
    printed and the loop sleeps for 60 seconds. This function never
    returns.
    """
    env = create_atari_env(args.rom)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model and reset the recurrent state.
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256).type(FloatTensor)
            hx = torch.zeros(1, 256).type(FloatTensor)

        # `volatile=True` no longer disables autograd; no_grad() does.
        # Tensors produced here carry no graph, so hx/cx need no detach.
        with torch.no_grad():
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).type(FloatTensor), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render == 1:
            env.render()
            time.sleep(0.03)
        done = done or episode_length >= 10000
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".
                  format(get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
def test(rank, params, shared_model):
    """Run an endless greedy evaluation loop with video recording enabled.

    Args:
        rank: Integer offset added to ``params.seed`` for seeding.
        params: Settings object; ``seed`` and ``env_name`` are read.
        shared_model: Module whose weights are copied into the local model
            at the start of every episode.

    After each episode the elapsed time, reward and length are printed and
    the loop sleeps for one minute. This function never returns.
    """
    torch.manual_seed(params.seed + rank)  # seed offset by rank
    env = create_atari_env(params.env_name, video=True)  # environment with video
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()  # the model is not trained here
    state = env.reset()  # the observation arrives as a numpy array
    state = torch.from_numpy(state)  # convert it to a torch tensor
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            # Synchronise with the shared model, as in training mode, and
            # reset the recurrent state.
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        # `volatile=True` no longer disables autograd; no_grad() does.
        # Tensors produced here carry no graph, so hx/cx need no detach.
        with torch.no_grad():
            value, action_value, (hx, cx) = model(
                (state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(action_value, dim=-1)
        # The test agent does not explore; it plays the best action directly.
        # keepdim=True keeps the (1, 1) shape that action[0, 0] relies on.
        action = prob.max(1, keepdim=True)[1].numpy()
        # NOTE: no maximum episode length is enforced in this loop.
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            # Print the result at the end of every episode.
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)  # wait one minute for the other agents
        state = torch.from_numpy(state)  # build the new state and continue
def test(rank, args, model_path, all_cooked_time, all_cooked_bw, all_vp_time,
         all_vp_unit, num):
    """Evaluate a saved model for 50000 steps and log every step.

    Args:
        rank: Integer offset added to ``args.seed`` for seeding.
        args: Settings object passed to ``Environment``; ``seed`` is read.
        model_path: Path of the state dict loaded into the model.
        all_cooked_time, all_cooked_bw, all_vp_time, all_vp_unit: Forwarded
            unchanged to ``Environment``.
        num: Suffix of the log file ``result-1/log-<num>.txt``.

    Each step is printed and appended to the log file. The file is closed
    on normal completion and also if a step raises.
    """
    torch.manual_seed(args.seed + rank)
    env = Environment(args, all_cooked_time, all_cooked_bw, all_vp_time,
                      all_vp_unit, random_seed=args.seed + rank)

    model = ActorCritic()
    model.load_state_dict(torch.load(model_path))
    model.eval()

    state = env.reset()
    state_time = time.time()
    episode_length = 0

    # The context manager guarantees the log is flushed and closed on error.
    with open('result-1/log-' + str(num) + '.txt', 'w') as log:
        while True:
            episode_length += 1
            state = torch.FloatTensor(state)
            with torch.no_grad():
                logit, value = model(state.view(-1, 11, 8))
            prob = F.softmax(logit, dim=1)
            _, action = torch.max(prob, 1)

            # The info tuple rebinds `action` and `reward`; the inner
            # values are the ones reported below.
            state, reward, done, (action, vp_quality, ad_quality, out_quality,
                                  rebuf, cv, blank_ratio, reward,
                                  real_vp_bitrate, smooth) \
                = env.step(action.numpy()[0])

            print("Time {}, action {}, ({},{},{}), bitrate {:.3f}, rebuf {:.3f}, cv {:.3f}, smooth {:.3f}, reward {:.3f}, episode {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - state_time)),
                action, vp_quality, ad_quality, out_quality, real_vp_bitrate,
                rebuf, cv, smooth, reward, episode_length))
            log.write('action: ' + str(action) + ' (' + str(vp_quality) + ','
                      + str(ad_quality) + ',' + str(out_quality)
                      + ') rebuf: ' + str(rebuf) + ' cv: ' + str(cv)
                      + ' bitrate: ' + str(real_vp_bitrate)
                      + ' smooth: ' + str(smooth)
                      + ' reward: ' + str(reward)
                      + ' episode: ' + str(episode_length) + '\n')

            if done:
                state = env.reset()
            if episode_length == 50000:
                break
def test(rank, params, shared_model):
    """Run an endless greedy evaluation loop and record episode statistics.

    Args:
        rank: Integer offset added to ``params.seed`` for seeding.
        params: Settings object; ``seed`` and ``env_name`` are read.
        shared_model: Module whose weights are copied into the local model
            at the start of every episode.

    At every episode start the local model is first saved to ``brain.pkl``
    (before being refreshed). At every episode end the reward and length
    are appended to ``Statistics.txt`` and printed, then the loop sleeps
    for 60 seconds. This function never returns.
    """
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            # Saved before syncing, so the file holds the weights used for
            # the episode that just ended.
            save(model, 'brain.pkl')
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        # `volatile=True` no longer disables autograd; no_grad() does.
        # Tensors produced here carry no graph, so hx/cx need no detach.
        with torch.no_grad():
            value, action_value, (hx, cx) = model(
                (state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(action_value, dim=-1)
        action = prob.max(1)[1].numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            # The context manager closes the file even if the write fails.
            with open("Statistics.txt", 'a') as f:
                f.write(str(reward_sum) + " " + str(episode_length) + "\n")
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
class A2C:
    """Advantage actor-critic agent wrapping an ``ActorCritic`` network."""

    def __init__(self, state_dim, action_dim, cfg):
        """Build the network and its Adam optimiser.

        Args:
            state_dim: Forwarded to ``ActorCritic``.
            action_dim: Forwarded to ``ActorCritic``.
            cfg: Settings object; ``gamma``, ``hidden_dim``, ``device``,
                ``lr`` and ``env`` are read.
        """
        self.gamma = cfg.gamma
        self.model = ActorCritic(state_dim, action_dim,
                                 hidden_dim=cfg.hidden_dim).to(cfg.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr)
        self.device = cfg.device
        self.loss = 0
        self.env = cfg.env

    def choose_action(self, state):
        """Sample an action for a single state.

        Returns:
            Tuple ``(action, value, dist)``: the sampled action as a Python
            number, plus the value and distribution returned by the model.
        """
        state = torch.as_tensor(state, dtype=torch.float32,
                                device=self.device).unsqueeze(0)
        dist, value = self.model(state)
        action = dist.sample().item()
        return action, value, dist

    def update(self, values, next_values, step_rewards, log_probs, mask_dones,
               entropy):
        """Run one optimisation step from the data of one rollout.

        All sequence arguments are indexed per step and must have the same
        length. Each element must hold a single value.

        Args:
            values: Critic outputs for the visited states (with grad).
            next_values: Critic outputs for the successor states.
            step_rewards: Per-step rewards.
            log_probs: Log-probabilities of the taken actions (with grad).
            mask_dones: Per-step multipliers applied to the bootstrapped
                value in the TD target.
            entropy: Entropy bonus term, weighted by 0.001 in the loss.

        The TD target and the advantage are treated as constants. The loss
        terms themselves stay tensors so that ``backward()`` reaches the
        network parameters; converting them to Python or NumPy scalars
        would drop the policy and value gradients. An empty rollout is a
        no-op.
        """
        if len(step_rewards) == 0:
            return

        actor_terms = []
        critic_terms = []
        for reward, value, next_value, log_prob, mask in zip(
                step_rewards, values, next_values, log_probs, mask_dones):
            # Bootstrapped target, detached from the graph by .item().
            expected_value = (reward.item()
                              + self.gamma * next_value.squeeze().item()
                              * mask.squeeze().item())
            value = value.squeeze()
            advantage = expected_value - value.item()
            actor_terms.append(-advantage * log_prob.squeeze())
            # Squared error against the constant target.
            critic_terms.append((value - expected_value) ** 2)

        actor_loss = torch.stack(actor_terms).mean()
        critic_loss = torch.stack(critic_terms).mean()
        self.loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def save(self, path):
        """Save the model weights to ``<path>/<env>actor_critic.pt``."""
        model_checkpoint = os.path.join(path, self.env + 'actor_critic.pt')
        torch.save(self.model.state_dict(), model_checkpoint)
        print('Model Saved!')

    def load(self, path):
        """Load the model weights from ``<path>/<env>actor_critic.pt``."""
        model_checkpoint = os.path.join(path, self.env + 'actor_critic.pt')
        # map_location lets a checkpoint saved on another device be loaded.
        self.model.load_state_dict(
            torch.load(model_checkpoint, map_location=self.device))
        print('Model Loaded!')
def main():
    """Load the saved network and play five rendered episodes.

    Reads the module-level ``args`` (``env_name``, ``save_path``) and
    ``device``. Prints the observation and action sizes, then one score
    line per episode.
    """
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n
    print('state size:', obs_dim)
    print('action size:', n_actions)

    net = ActorCritic(obs_dim, n_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))
    net.to(device)
    net.eval()

    running_score = 0
    steps = 0

    for e in range(5):
        score = 0
        # Batch of one observation on the target device.
        state = torch.Tensor(env.reset()).to(device).unsqueeze(0)

        done = False
        while not done:
            env.render()
            steps += 1

            policy, value = net(state)
            action = get_action(policy, n_actions)

            observation, reward, done, _ = env.step(action)
            score += reward
            state = torch.Tensor(observation).to(device).unsqueeze(0)

        print('{} episode | score: {:.2f}'.format(e, score))
def test(rank, params, shared_model):
    """Run an endless greedy evaluation loop with video recording enabled.

    Args:
        rank: Integer offset added to ``params.seed`` for seeding.
        params: Settings object; ``seed`` and ``env_name`` are read.
        shared_model: Module whose weights are copied into the local model
            at the start of every episode.

    After each episode the elapsed time, reward and length are printed and
    the loop sleeps for 60 seconds. This function never returns.
    """
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            # Sync with the shared model and reset the recurrent state.
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        # `volatile=True` no longer disables autograd; no_grad() does.
        # Tensors produced here carry no graph, so hx/cx need no detach.
        with torch.no_grad():
            value, action_value, (hx, cx) = model(
                (state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(action_value, dim=-1)
        # keepdim=True keeps the (1, 1) shape that action[0, 0] relies on.
        action = prob.max(1, keepdim=True)[1].numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            # 60-second break before the next evaluation episode.
            time.sleep(60)
        state = torch.from_numpy(state)
def local_test(index, opt, global_model):
    """Play the environment greedily with a local copy of the global model.

    Args:
        index: Integer added to 123 to form the torch seed.
        opt: Settings object; ``world``, ``stage``, ``action_type``,
            ``max_actions`` and ``num_global_steps`` are read.
        global_model: Module whose weights are copied into the local model
            at the start of every episode.

    Every step is rendered. This function never returns.
    """
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()

    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    # Window of recent actions, used to detect a stuck agent.
    recent_actions = deque(maxlen=opt.max_actions)

    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        with torch.no_grad():
            if not done:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            else:
                # Fresh recurrent state at the start of an episode.
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)

            logits, value, h_0, c_0 = local_model(state, h_0, c_0)

        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()

        recent_actions.append(action)
        window_is_uniform = (
            recent_actions.count(recent_actions[0]) == recent_actions.maxlen)
        if curr_step > opt.num_global_steps or window_is_uniform:
            done = True

        if done:
            curr_step = 0
            recent_actions.clear()
            state = env.reset()

        state = torch.from_numpy(state)
class Agent(mp.Process):
    """Worker process that trains a local actor-critic against a global one."""

    def __init__(self, global_actor_critic, optimizer, input_dims, nb_actions,
                 gamma, lr, name, global_ep_index, env_id):
        """Create the local network and the environment for this worker.

        ``name`` is formatted as ``w<NN>`` and ``global_ep_index`` is the
        shared episode counter. ``lr`` is accepted but not used here.
        """
        super().__init__()
        self.local_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = "w%02i" % name
        self.episode_index = global_ep_index
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def _push_gradients_and_sync(self, done):
        """Apply local gradients to the global net, then pull its weights."""
        loss = self.local_actor_critic.calc_loss(done)
        self.optimizer.zero_grad()
        loss.backward()
        param_pairs = zip(self.local_actor_critic.parameters(),
                          self.global_actor_critic.parameters())
        for local_param, global_param in param_pairs:
            global_param._grad = local_param.grad
        self.optimizer.step()
        self.local_actor_critic.load_state_dict(
            self.global_actor_critic.state_dict())
        self.local_actor_critic.clear_memory()

    def run(self):
        """Play episodes until the shared episode counter reaches EPISODES."""
        t_step = 1
        while self.episode_index.value < EPISODES:
            observation = self.env.reset()
            score = 0
            self.local_actor_critic.clear_memory()

            done = False
            while not done:
                action = self.local_actor_critic.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                score += reward
                self.local_actor_critic.remember(observation, action, reward)

                # Update every T_MAX steps and at the end of an episode.
                if done or (t_step % T_MAX) == 0:
                    self._push_gradients_and_sync(done)

                t_step += 1
                observation = next_observation

            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value,
                  'reward %.1f' % score)
class Agent:
    """Reversi player that picks the highest-scoring legal move."""

    def __init__(self):
        """Load the network from ``models/good.pt`` onto the CPU."""
        self.net = ActorCritic()
        self.net.load_state_dict(
            torch.load('models/good.pt', map_location='cpu'))
        self.net.eval()
        # Freeze the weights so no forward pass through this network records
        # gradients. This replaces entering `torch.no_grad()` without ever
        # leaving it, which disabled autograd for the whole thread.
        for param in self.net.parameters():
            param.requires_grad_(False)

    def brain(self, reversi: Reversi, who: int) -> Coordinate:
        """Return the ``(row, column)`` of the chosen move.

        Args:
            reversi: Current game; ``reversi.good[y][x]`` marks legal cells.
            who: Player to move (not used by this implementation).
        """
        with torch.no_grad():
            state = torch.Tensor(getBoardState(reversi)).unsqueeze(0)
            policy = self.net(state)[1][0]
        # Enforce legality: illegal cells are zeroed, legal ones nudged up.
        for y, x in itertools.product(range(SIZE), repeat=2):
            if not reversi.good[y][x]:
                policy[y * SIZE + x] = 0.
            else:
                policy[y * SIZE + x] += 1e-8  # avoid an all-zero policy
        action = policy.max(dim=-1).indices.item()
        return (action // SIZE, action % SIZE)
def load_checkpoint(filepath):
    """Restore the shared model from its checkpoint and return it in eval mode.

    The paths come from the module-level ``params``
    (``file_path_shared_model`` and ``file_path_shared_model_test``). Both
    checkpoints are loaded, each with its ``SharedAdam`` optimiser state,
    but only the first model is returned.

    NOTE(review): ``filepath`` is not used, and the second model is built
    and then discarded. Confirm whether either is intentional.
    """

    def _restore(checkpoint_path):
        """Build a model and optimiser, then load both from a checkpoint."""
        net = ActorCritic(len(state), params.output_space)
        net_optimizer = my_optim.SharedAdam(net.parameters(), lr=params.lr)
        checkpoint = torch.load(checkpoint_path)
        net.load_state_dict(checkpoint['state_dict'])
        net_optimizer.load_state_dict(checkpoint['optimizer'])
        net.eval()
        return net

    model = _restore(params.file_path_shared_model)
    _restore(params.file_path_shared_model_test)
    return model
def train(rank, args, shared_model, counter, lock, optimizer=None):
    """Asynchronous actor-critic training worker with episode reward logging.

    Args:
        rank: Integer offset added to ``args.seed`` for seeding.
        args: Settings object; ``seed``, ``env_name``, ``lr``, ``num_steps``,
            ``max_episode_length``, ``gamma``, ``tau``, ``entropy_coef``,
            ``value_loss_coef`` and ``max_grad_norm`` are read.
        shared_model: Module that receives the gradients and supplies the
            weights at the start of every rollout.
        counter: Shared step counter, incremented once per step.
        lock: Lock guarding ``counter``.
        optimizer: Optimiser over ``shared_model``'s parameters. When
            ``None`` a private Adam optimiser is created.

    The mean episode reward is printed once per window of 25 finished
    episodes. This function never returns.
    """
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    avg_rew_win_size = 25
    avg_rew = 0
    avg_rew_cnt = 0

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            # Truncate back-propagation through time at rollout boundaries.
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)  # clip the training reward

            # Store plain ints so the window comparison is ordinary integer
            # equality rather than tensor equality.
            actions.append(action.item())
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            with lock:
                counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                # Count the finished episode before testing the window, so
                # each average covers exactly `avg_rew_win_size` episodes.
                # Checking first reported a single episode divided by 25.
                avg_rew_cnt = avg_rew_cnt + 1
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(
                        avg_rew / avg_rew_win_size))
                    avg_rew = 0
                print("Time {}, episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap from the critic unless the episode ended.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    """Training worker alternating on-policy rollouts and replay updates.

    Args:
        rank: Integer offset added to ``args.seed`` for seeding.
        args: Settings object; ``seed``, ``use_cuda``, ``env``,
            ``hidden_size``, ``on_policy``, ``memory_capacity``,
            ``num_processes``, ``max_episode_length``, ``T_max``, ``t_max``,
            ``reward_clip``, ``replay_start``, ``replay_ratio`` and
            ``batch_size`` are read.
        T: Shared counter exposing ``value()`` and ``increment()``.
        shared_model: Module whose weights are copied into the local model
            before every rollout segment.
        shared_average_model: Module evaluated alongside the local model to
            produce the average policy.
        optimiser: Forwarded to ``_train``.

    The loop runs while ``T.value() <= args.T_max``; the environment is
    closed afterwards.
    """
    seed = args.seed + rank
    torch.manual_seed(seed)
    if args.use_cuda:
        torch.cuda.manual_seed(seed)

    env = gym.make(args.env)
    env.seed(seed)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # 0 selects the first GPU, -1 is CPU
    use_gpu = gpu_id >= 0
    if use_gpu:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # ---- On-policy episode loop ----
        while True:
            # Sync with shared model at least every t_max steps
            if use_gpu:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            t_start = t  # starting timestep of this segment

            if not done:
                # Truncated back-propagation through time
                hx = hx.detach()
                cx = cx.detach()
            else:
                # New episode: reset hidden states, environment and flags.
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if use_gpu:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)
                state = state_to_tensor(env.reset())
                if use_gpu:
                    state = state.cuda()
                done, episode_length = False, 0

            # Lists of outputs for training
            policies, Qs, Vs = [], [], []
            actions, rewards, average_policies = [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # The average model is fed a CPU copy of the state.
                to_avg_state = state.cpu() if use_gpu else state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))

                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if use_gpu:
                    next_state = next_state.cuda()
                # Optionally clamp rewards to [-1, 1]
                reward = min(max(reward, -1), 1) if args.reward_clip else reward
                # Stop episodes at a max length
                done = done or episode_length >= args.max_episode_length
                episode_length += 1

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward, policy.detach())

                # Save outputs for online training
                action_entry = torch.LongTensor([[action]])
                reward_entry = torch.Tensor([[reward]])
                policies.append(policy)
                Qs.append(Q)
                Vs.append(V)
                actions.append(action_entry)
                rewards.append(reward_entry)
                average_policies.append(average_policy)

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Last value is a target only, so its graph is cut.
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)
                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy (model outputs moved to CPU first)
            if use_gpu:
                Qs = [q.cpu() for q in Qs]
                Vs = [v.cpu() for v in Vs]
                policies = [p.cpu() for p in policies]
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # ---- Off-policy training once enough experience is collected ----
        if not args.on_policy and len(memory) >= args.replay_start:
            # Number of replay updates is drawn from the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Batch of (truncated) episode trajectories
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if use_gpu:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions = [], [], [], []
                rewards, old_policies, average_policies = [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    current = trajectories[i]
                    following = trajectories[i + 1]

                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state for trajectory in current), 0)
                    action = torch.LongTensor(
                        [trajectory.action for trajectory in current]
                    ).unsqueeze(1)
                    reward = torch.Tensor(
                        [trajectory.reward for trajectory in current]
                    ).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy for trajectory in current), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    # NOTE(review): unlike the on-policy path, `state` is not
                    # moved to the CPU here before the average model is
                    # called. Confirm this works when use_cuda is set.
                    average_policy, _, _, (avg_hx, avg_cx) = \
                        shared_average_model(state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    policies.append(policy)
                    Qs.append(Q)
                    Vs.append(V)
                    actions.append(action)
                    rewards.append(reward)
                    average_policies.append(average_policy)
                    old_policies.append(old_policy)

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state for trajectory in following), 0)
                    done = torch.Tensor(
                        [trajectory.action is None for trajectory in following]
                    ).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if use_gpu:
                    Qs = [q.cpu() for q in Qs]
                    Vs = [v.cpu() for v in Vs]
                    policies = [p.cpu() for p in policies]
                _train(args, T, model, shared_model, shared_average_model,
                       optimiser, policies, Qs, Vs, actions, rewards, Qret,
                       average_policies, old_policies=old_policies)

        # Force a fresh episode on the next outer iteration.
        done = True

    env.close()
def train(rank, shared_model, optimizer):
    """Run one actor-critic training worker; loops forever.

    :param rank: worker-ID; offsets the environment seed and tags log lines
        and checkpoint file names
    :param shared_model: model to sync between workers; its weights are
        copied into the local model before every rollout and it receives the
        local gradients through ``ensure_shared_grads``
    :param optimizer: optimizer stepped after every rollout; its state dict
        is stored in each checkpoint
    :return: never returns
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20
    max_episode_length = 10000
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0
    checkpoint_n = 20

    env = create_atari_env(romname)
    env.seed(SEED + rank)
    state = env.reset()
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                     requires_grad=False)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    t = 0
    done = True
    episodes = 0
    reward_sum = 0   # accumulated over the current 10-episode logging window
    reward_sum1 = 0  # accumulated over the current checkpoint window
    start_time = time.time()
    best_reward = -999
    cx = hx = None

    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:
            # need to reset LSTM cell's input
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            # basically this is to detach from previous comp graph
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):
            t += 1
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            # detach -- so the backprop will NOT go through multinomial()
            action = prob.multinomial().detach()
            log_prob = log_prob.gather(1, action)
            action = action.data[0, 0]

            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward
            done = done or t >= max_episode_length

            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1

                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss",
                        time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".
                          format(time_str, rank, episodes,
                                 reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    # Decide "best" afresh for every checkpoint. The flag used
                    # to be sticky: once set it stayed 1, so every later
                    # checkpoint was also saved as the best one.
                    isbest = 1 if best_reward < ave_reward else 0
                    if isbest:
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".
                          format(get_elapsed_time_str(), rank, episodes,
                                 ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict(),
                                     }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(
                torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                requires_grad=False)
            # Clip the reward to [-1, 1] for the loss only; the logged sums
            # above use the raw reward.
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # We reach here because either
        # i) an episode ends, such as game over
        # ii) we have explored certain steps into the future and now it is
        #     time to look back and summarise the rollout
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)
        else:
            # Bootstrap the return from the value of the last reached state.
            value, _, _ = model((state, (hx, cx)))
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]  # type: Variable
            critic_loss += 0.5 * advantage.pow(2)

            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
if __name__ == '__main__':
    # Build the shared model and optimizer, then optionally play back or
    # resume from a checkpoint file given on the command line.
    env = create_atari_env(args.rom)
    # torch.manual_seed(SEED)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    # print (shared_model.conv1._parameters['weight'].data.is_cuda)
    optimizer = SharedAdam(shared_model.parameters(), lr=0.0001)
    optimizer.share_memory()

    if args.play:
        if os.path.isfile(args.play):
            print("=> loading checkpoint '{}'".format(args.play))
            checkpoint = torch.load(args.play)
            # args.start_epoch = checkpoint['epoch']
            # best_prec1 = checkpoint['best_prec1']
            shared_model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            # Report the file that was actually loaded (args.play); this
            # message used to print args.resume.
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.play, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.play))

        test(shared_model, render=1)  # let it play the game
        exit(0)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # NOTE(review): the loaded checkpoint is not applied to the model
            # or optimizer in the code visible here.
            checkpoint = torch.load(args.resume)
            # args.start_epoch = checkpoint['epoch']
            # best_prec1 = checkpoint['best_prec1']
def test(rank, args, shared_model):
    """Play greedily with a copy of the shared model; loops forever.

    Whenever an episode has just ended (and once at start-up) the shared
    weights are copied into the local model and written to
    ``model-a3c-aux/model-<args.model_name>.pth``. Action indices below
    ``model.n_real_acts`` are sent to the environment once; higher indices
    are executed as a run of action 0.

    :param rank: worker id, added to ``args.seed`` for the torch seed
    :param args: run settings (``seed``, ``env_name``, ``num_skips``,
        ``model_name``, ``testing``, ``max_episode_length``)
    :param shared_model: model whose state dict is copied and saved
    """
    torch.manual_seed(args.seed + rank)

    env = WrapEnv(args.env_name)
    model = ActorCritic(4, env.num_actions, args.num_skips)
    model.eval()

    def initial_stack(first_frame):
        # Start an episode with the first observation repeated four times.
        return np.concatenate([first_frame] * 4, axis=0)

    def zeroed_stats():
        # One counter per real action followed by one per auxiliary action.
        return [0] * (model.n_real_acts + model.n_aux_acts)

    state = torch.from_numpy(initial_stack(env.reset()))

    reward_sum = 0
    done = True
    action_stat = zeroed_stats()
    start_time = time.time()
    episode_length = 0
    save_dir = 'model-a3c-aux'

    for _ in itertools.count(1):
        # Sync with the shared model and checkpoint it.
        if done:
            model.load_state_dict(shared_model.state_dict())
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            save_path = 'model-a3c-aux/model-{}.pth'.format(args.model_name)
            torch.save(shared_model.state_dict(), save_path)
            print('saved model')

        net_input = Variable(state.unsqueeze(0), volatile=True)
        _, logit = model(net_input)
        prob = F.softmax(logit)
        greedy = prob.max(1)[1].data.numpy()
        action_np = greedy[0, 0]
        action_stat[action_np] += 1

        if action_np >= model.n_real_acts:
            # Auxiliary action: repeat action 0 several times.
            state = state.numpy()
            repeats = action_np - model.n_real_acts + 2
            for _ in range(repeats):
                # instead of random perform NOOP=0
                state_new, rew, done, info = env.step(0)
                if args.testing:
                    print('episode', episode_length, 'no_op action',
                          action_np, 'lives', info['ale.lives'])
                    # env.render()
                state = np.append(state[1:, :, :], state_new, axis=0)
                done = done or episode_length >= args.max_episode_length
                reward_sum += rew
                episode_length += 1
                if done:
                    break
        else:
            # Real action: a single environment step.
            state_new, reward, done, info = env.step(action_np)
            if args.testing:
                print('episode', episode_length, 'normal action',
                      action_np, 'lives', info['ale.lives'])
                env.render()
            older_frames = state.numpy()[1:, :, :]
            state = np.append(older_frames, state_new, axis=0)
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            episode_length += 1

        if done:
            elapsed = time.gmtime(time.time() - start_time)
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", elapsed),
                reward_sum, episode_length))
            real_stats = action_stat[:model.n_real_acts]
            aux_stats = action_stat[model.n_real_acts:]
            print("actions stats real {}, aux {}".format(real_stats, aux_stats))

            reward_sum = 0
            episode_length = 0
            state = initial_stack(env.reset())
            action_stat = zeroed_stats()
            if not args.testing:
                time.sleep(60)

        state = torch.from_numpy(state)
def test(rank, args, shared_model):
    """Play greedily with a copy of the shared model; loops forever.

    Whenever an episode has just ended (and once at start-up) the shared
    weights are copied into the local model and written to
    ``models-a3c/model-<args.model_name>.pth``. With ``args.testing`` set,
    every step is printed and rendered, and the atom distribution is plotted
    on every 100th step.

    :param rank: worker id, used to offset the seeds
    :param args: run settings (``seed``, ``env_name``, ``model_name``,
        ``num_atoms``, ``gamma``, ``testing``, ``max_episode_length``,
        ``num_processes``)
    :param shared_model: model whose state dict is copied and saved
    """
    base_seed = args.seed + rank
    torch.manual_seed(base_seed)
    env = create_atari_env(args.env_name)
    env.seed(base_seed)

    if not os.path.exists('models-a3c'):
        os.makedirs('models-a3c')
    path = 'models-a3c/model-{}.pth'.format(args.model_name)
    print('saving directory is', path)

    model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma)
    model.eval()

    first_frame = env.reset()
    state = torch.from_numpy(np.concatenate([first_frame] * 4, axis=0))

    reward_sum = 0
    done = True
    action_stat = [0] * model.num_outputs
    start_time = time.time()
    episode_length = 0

    # The counter advances once per environment step.
    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            torch.save(shared_model.state_dict(), path)
            print('saved model')

        net_input = Variable(state.unsqueeze(0), volatile=True)
        atoms_logit, logit = model(net_input)
        prob = F.softmax(logit)
        action_np = prob.max(1)[1].data.numpy()[0, 0]
        action_stat[action_np] += 1

        state_new, reward, done, info = env.step(action_np)
        _dead = is_dead(info)  # result is not used below

        if args.testing:
            atoms_prob = F.softmax(atoms_logit)
            value = model.get_v(atoms_prob, batch=False)
            atoms_prob = atoms_prob.squeeze().data.numpy()
            print('episode', episode_length, 'normal action', action_np,
                  'lives', info['ale.lives'], 'value', value)
            env.render()
            if ep_counter % 100 == 0:
                plt.plot(model.z, atoms_prob)
                plt.title('average v is {}'.format(value))
                plt.show()

        # Drop the oldest frame and append the newest one.
        older_frames = state.numpy()[1:, :, :]
        state = np.append(older_frames, state_new, axis=0)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        episode_length += 1

        if done:
            elapsed = time.gmtime(time.time() - start_time)
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", elapsed),
                reward_sum, episode_length))
            print("actions stats real {}".format(
                action_stat[:model.num_outputs]))

            reward_sum = 0
            episode_length = 0
            first_frame = env.reset()
            env.seed(base_seed + (args.num_processes + 1) * ep_counter)
            state = np.concatenate([first_frame] * 4, axis=0)
            action_stat = [0] * model.num_outputs
            if not args.testing:
                time.sleep(60)

        state = torch.from_numpy(state)
def test(args, shared_model):
    """Repeatedly evaluate the shared model and save a snapshot; loops forever.

    Each outer iteration copies the shared weights into a local model, plays
    videos until more than ``env.traces_len * 2`` have finished (or one video
    ends with a negative total reward), plots the average per-video reward
    to visdom and saves the local weights to ``ali-v1/actor.pt-<n>``.

    :param args: settings; ``s_gop_info`` and ``s_gop_len`` give the view
        applied to the state before it is fed to the model
    :param shared_model: model whose state dict is copied before every round
    """
    action_map = _set_action_map()
    env = FixedEnvWrap()
    # time.sleep(10)
    model = ActorCritic()
    model.load_state_dict(shared_model.state_dict())
    model.eval()

    state = env.reset()
    vis = visdom.Visdom(env='final')
    line_plot = vis.line(Y=np.array([0]),
                         opts=dict(xlabel='testing count',
                                   ylabel='average reward',
                                   title='ali-v1'))
    start = time.time()
    vis_count = 0

    while True:
        video_count = 1  # 1-based index of the video currently playing
        reward_all_sum = 0
        reward_all = 0
        reward_all_ave = 0
        reward_gop = 0
        action = 3
        last_action = 3

        # update model before testing all trace files
        # time.sleep(5)
        print('load updated model')
        model.load_state_dict(shared_model.state_dict())

        while True:
            # get the reward for one gop
            while True:
                _, done, decision_flag = env.step_gop(action)
                if decision_flag or done:
                    reward_gop = env.get_reward_gop()
                    state = env.get_state_gop()
                    break

            # print('testing')
            # get action from model
            last_action = action
            with torch.no_grad():
                state = torch.FloatTensor(state)
                logit, _ = model(
                    state.view(-1, args.s_gop_info, args.s_gop_len))
                prob = F.softmax(logit, dim=1)
                _, action = torch.max(prob, 1)
                action = action.data.numpy()[0]

            # Only the commented-out debug print below uses these values.
            bitrate, target_buffer = action_map[last_action]
            # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop))

            if done:
                print("video count %d, reward is %.5f" % (video_count, reward_all))
                # reward_all_sum += reward_all / 100
                reward_all_sum += reward_all
                video_count += 1

                if reward_all < 0:
                    print('bad model ! just break this loop')
                    reward_all_ave = 0
                    break
                if video_count > env.traces_len * 2:
                    # video_count was just advanced to the NEXT video, so the
                    # number of finished videos is video_count - 1. Dividing
                    # by video_count under-reported the average.
                    reward_all_ave = reward_all_sum / (video_count - 1)
                    break

                action = 3
                last_action = 3
                reward_all = 0

            reward_all += reward_gop

        # update the figure of average reward of all testing files
        vis_count += 1
        reward_all_ave = max(reward_all_ave, 0)
        vis.line(Y=np.array([reward_all_ave]), X=np.array([vis_count]),
                 win=line_plot, update='append')
        path = 'ali-v1/actor.pt-' + str(vis_count)
        torch.save(model.state_dict(), path)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
        print("average reward of traces are: ", reward_all_ave)
        print('saved one model in epoch:', vis_count)
def train(rank, params, shared_model, optimizer):
    """Run one asynchronous actor-critic training worker; loops forever.

    Each iteration copies the shared weights into a local model, collects up
    to ``params.num_steps`` transitions, builds the policy and value losses
    with generalized advantage estimation, and pushes the local gradients to
    the shared model before stepping the optimizer.

    :param rank: worker index, used to shift the seeds so workers differ
    :param params: settings (``seed``, ``env_name``, ``num_steps``,
        ``max_episode_length``, ``gamma``, ``tau``)
    :param shared_model: model shared between workers
    :param optimizer: optimizer stepped after every rollout
    """
    torch.manual_seed(params.seed + rank)  # shift the seed per worker
    env = create_atari_env(params.env_name)
    env.seed(params.seed + rank)  # align the environment seed with the agent
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    state = env.reset()
    state = torch.from_numpy(state)
    done = True  # forces an LSTM state reset on the first iteration
    episode_length = 0

    while True:
        # Synchronize with the shared model before exploring.
        model.load_state_dict(shared_model.state_dict())
        if done:
            # New episode: start from zeroed cell and hidden states.
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            # Same episode: keep the values but cut the old graph.
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []      # V(S) for every visited state
        log_probs = []   # log-probability of every chosen action
        rewards = []     # clamped rewards
        entropies = []   # policy entropy at every step

        for step in range(params.num_steps):
            # Count environment steps, not rollouts, so that
            # max_episode_length is compared against the episode's real
            # length. The counter used to advance once per rollout.
            episode_length += 1

            value, action_values, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(action_values)
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)  # H(p) = -sum_x p(x) log p(x)
            entropies.append(entropy)

            action = prob.multinomial().data  # random draw from the policy
            log_prob = log_prob.gather(1, Variable(action))
            values.append(value)
            log_probs.append(log_prob)

            state, reward, done, _ = env.step(action.numpy())
            # Cut the episode short when it lasts too long.
            done = (done or episode_length >= params.max_episode_length)
            reward = max(min(reward, 1), -1)  # clamp to [-1, +1]

            if done:
                episode_length = 0
                state = env.reset()
            state = torch.from_numpy(state)
            rewards.append(reward)

            if done:
                break  # go straight to the update

        # Cumulative reward: zero at an episode end, otherwise bootstrapped
        # from the value of the last reached state.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))

        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)  # generalized advantage estimate

        # Walk the rollout backwards in time.
        for i in reversed(range(len(rewards))):
            # R = r_i + gamma * r_(i+1) + ... + gamma^k * V(last_state)
            R = params.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # temporal difference: r_i + gamma * V(s_(i+1)) - V(s_i)
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            # gae_i = gae_(i+1) * gamma * tau + TD_i
            gae = gae * params.gamma * params.tau + TD
            policy_loss = policy_loss - log_probs[i] * Variable(gae) \
                - 0.01 * entropies[i]

        optimizer.zero_grad()
        # The value loss is weighted by 0.5 relative to the policy loss.
        (policy_loss + 0.5 * value_loss).backward()
        # Rescale the gradients so their overall norm is at most 40.
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        # Hand the local gradients to the shared model, then step.
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, T, shared_model):
    """Periodically evaluate the shared model with greedy actions.

    While the global counter ``T`` is at most ``args.T_max``, an evaluation
    of ``args.evaluation_episodes`` episodes is run each time the counter
    has advanced by ``args.evaluation_interval`` since the previous one.
    After each evaluation the averages are printed, the rewards plotted and
    the local weights saved to ``model.pth``. With ``args.evaluate`` set,
    the function returns after the first evaluation is printed.

    The environment is closed on every exit path, including the early
    return and any exception.

    :param rank: worker index, added to ``args.seed`` for seeding
    :param args: run settings
    :param T: global step counter exposing ``value()``
    :param shared_model: model whose weights are copied at each episode start
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    try:
        model = ActorCritic(env.observation_space, env.action_space,
                            args.hidden_size)
        model.eval()

        can_test = True  # Test flag
        t_start = 1  # Test step counter to check against global counter
        rewards, steps = [], []  # Rewards and steps for plotting
        step_width = str(len(str(args.T_max)))  # Max num. of digits for logging steps
        done = True  # Start new episode

        while T.value() <= args.T_max:
            if can_test:
                t_start = T.value()  # Reset counter

                # Evaluate over several episodes and average results
                avg_rewards, avg_episode_lengths = [], []
                for _ in range(args.evaluation_episodes):
                    while True:
                        # Reset or pass on hidden state
                        if done:
                            # Sync with shared model every episode
                            model.load_state_dict(shared_model.state_dict())
                            hx = Variable(torch.zeros(1, args.hidden_size),
                                          volatile=True)
                            cx = Variable(torch.zeros(1, args.hidden_size),
                                          volatile=True)
                            # Reset environment and done flag
                            state = state_to_tensor(env.reset())
                            done, episode_length = False, 0
                            reward_sum = 0

                        # Optionally render validation states
                        if args.render:
                            env.render()

                        # Calculate policy; detaching breaks the graph for
                        # memory efficiency
                        policy, _, _, (hx, cx) = model(
                            Variable(state, volatile=True),
                            (hx.detach(), cx.detach()))

                        # Choose action greedily
                        action = policy.max(1)[1].data[0, 0]

                        # Step
                        state, reward, done, _ = env.step(action)
                        state = state_to_tensor(state)
                        reward_sum += reward
                        # Stop episodes at a max length
                        done = done or episode_length >= args.max_episode_length
                        episode_length += 1  # Increase episode counter

                        # Log and reset statistics at the end of every episode
                        if done:
                            avg_rewards.append(reward_sum)
                            avg_episode_lengths.append(episode_length)
                            break

                print(('[{}] Step: {:<' + step_width +
                       '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                    datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
                    t_start,
                    sum(avg_rewards) / args.evaluation_episodes,
                    sum(avg_episode_lengths) / args.evaluation_episodes))

                if args.evaluate:
                    # The finally clause below closes the environment.
                    return

                rewards.append(avg_rewards)  # Keep all evaluations
                steps.append(t_start)
                plot_line(steps, rewards)  # Plot rewards
                torch.save(model.state_dict(), 'model.pth')  # Save model params
                can_test = False  # Finish testing
            else:
                if T.value() - t_start >= args.evaluation_interval:
                    can_test = True

            time.sleep(0.001)  # Check if available to test every millisecond
    finally:
        env.close()
def train(rank, params, shared_model, optimizer):
    """Run one asynchronous actor-critic training worker; loops forever.

    Each iteration copies the shared weights into a local model, collects up
    to ``params.num_steps`` transitions, computes the policy and value
    losses with generalized advantage estimation, and pushes the local
    gradients to the shared model before stepping the optimizer.

    :param rank: worker index, added to ``params.seed`` for seeding
    :param params: settings (``seed``, ``env_name``, ``num_steps``,
        ``max_episode_length``, ``gamma``, ``tau``)
    :param shared_model: model shared between workers
    :param optimizer: optimizer stepped after every rollout
    """
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name)  # getting the environment
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0

    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:
            # New episode: zeroed LSTM state.
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            # Keep the values but detach from the previous graph.
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(params.num_steps):
            # Count environment steps, not rollouts, so the
            # max_episode_length cut-off refers to the real episode length.
            episode_length += 1

            value, action_values, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(action_values)
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))
            values.append(value)
            log_probs.append(log_prob)

            # env.step yields (state, reward, done, info), as the sibling
            # loops in this file unpack it; the info entry is unused.
            state, reward, done, _ = env.step(action.numpy())
            done = (done or episode_length >= params.max_episode_length)
            reward = max(min(reward, 1), -1)  # clamp to [-1, 1]

            if done:
                episode_length = 0
                state = env.reset()
            state = torch.from_numpy(state)
            rewards.append(reward)

            if done:
                break

        # Bootstrap from the last state's value unless the episode ended.
        R = torch.zeros(1, 1)
        if not done:
            # Fixed: this call was written `model.((...))`, a syntax error.
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))

        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD
            policy_loss = policy_loss - log_probs[i] * Variable(gae) \
                - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, share_model, counter, lock):
    """Run one asynchronous actor-critic training worker; loops forever.

    The worker owns a local model and an Adam optimizer built over the
    shared model's parameters. Every environment step increments
    ``counter.value`` while holding ``lock``.

    :param rank: worker index, added to ``args.seed`` for seeding
    :param args: settings (``seed``, ``env``, ``lr``, ``num_steps``,
        ``max_episode_length``, ``gamma``, ``tau``, ``entropy_coef``,
        ``value_loss_coef``, ``max_grad_norm``)
    :param share_model: model shared between workers
    :param counter: shared step counter exposing ``value``
    :param lock: lock guarding ``counter``
    """
    worker_seed = args.seed + rank
    torch.manual_seed(worker_seed)

    env = create_atari_env(args.env)
    env.seed(worker_seed)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    optimizer = optim.Adam(share_model.parameters(), lr=args.lr)
    model.train()

    state = torch.FloatTensor(env.reset())
    done = True
    # reward_sum = 0
    episode_length = 0

    while True:
        # Pull the latest shared weights.
        model.load_state_dict(share_model.state_dict())

        # Fresh recurrent state for a new episode, detached copy otherwise.
        if not done:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        else:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))

        values, log_probs, rewards, entropies = [], [], [], []

        for step in range(args.num_steps):
            episode_length += 1

            net_input = (Variable(state.unsqueeze(0)), (hx, cx))
            value, logit, (hx, cx) = model(net_input)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropies.append(-(log_prob * prob).sum(1, keepdim=True))

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            # print('reward', reward)
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            # reward_sum += reward
            # print(reward)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()
            state = torch.FloatTensor(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                # print('rank: ', rank)
                # print('reward: ', reward_sum)
                # reward_sum = 0
                break

        # Return estimate: zero at an episode end, else the critic's value.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))
        R = Variable(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        last = len(rewards) - 1
        for i in range(last, -1, -1):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized advantage estimation
            delta_t = rewards[i] + args.gamma * values[i + 1].data \
                - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * Variable(gae) \
                - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        total_loss = policy_loss + args.value_loss_coef * value_loss
        total_loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, share_model)
        optimizer.step()
def train(rank, args, shared_model, optimizer=None):
    """Run one asynchronous actor-critic training worker; loops forever.

    Each iteration copies the shared weights into a local model, collects up
    to ``args.num_steps`` transitions, computes the policy and value losses
    with generalized advantage estimation, and pushes the local gradients to
    the shared model before stepping the optimizer.

    :param rank: worker index, added to ``args.seed`` for seeding
    :param args: settings (``seed``, ``env_name``, ``lr``, ``num_steps``,
        ``max_episode_length``, ``gamma``, ``tau``)
    :param shared_model: model shared between workers
    :param optimizer: optimizer to step; when ``None`` an Adam optimizer
        over the shared model's parameters is created for this worker
    """
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            # New episode: zeroed LSTM state.
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            # Keep the values but detach from the previous graph.
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            # Count environment steps, not rollouts, so the
            # max_episode_length cut-off refers to the real episode length.
            # The counter used to advance once per rollout.
            episode_length += 1

            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)  # clamp to [-1, 1]

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap from the last state's value unless the episode ended.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, shared_model, counter, loggers, kill):
    """Evaluate the shared model with greedy actions until told to stop.

    The loop ends when ``kill`` is set, when ``steps.value`` exceeds
    ``args.max_episode_steps``, or when more than ``max_episodes`` episodes
    have finished. At each episode end the results are sent to ``loggers``
    (if truthy), a summary is printed, the function sleeps for
    ``args.eval_interval`` seconds and the shared weights are reloaded.

    :param rank: worker index, added to ``args.seed`` for seeding
    :param args: run settings
    :param shared_model: model whose state dict is copied into the local one
    :param counter: 3-tuple unpacked as ``(counter, steps, max_episodes)``
    :param loggers: mapping with 'test_reward', 'video' and 'test_time'
        callables, or a falsy value to disable logging
    :param kill: event-like object; set here when any exception occurs
    :raises Exception: re-raises whatever the loop body raised, after
        setting ``kill``
    """
    # The tuple argument is split into the shared objects used below.
    counter, steps, max_episodes = counter
    torch.manual_seed(args.seed + rank)

    env = create_vizdoom_env(args.config_path, args.test_scenario_path)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.spaces[0].shape[0], env.action_space, args.topology)

    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    start_time = time.time()

    # Initial recurrent state passed to the model: two pairs of zero tensors
    # (presumably hidden/cell pairs for two LSTM layers -- TODO confirm).
    hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)), (torch.zeros(1, 256), torch.zeros(1, 256)))
    # a quick hack to prevent the agent from stucking
    # (only cleared below; the code that filled it is commented out)
    actions = deque(maxlen=100)
    episode_length = 0
    episode_counter = 0
    obs_index = 0
    obs_history = []
    pose_history = []
    goal_loc = env.goal()
    model.load_state_dict(shared_model.state_dict())

    while not kill.is_set():
        if steps.value > args.max_episode_steps:
            break
        if episode_counter > max_episodes:
            break

        try:
            # NOTE(review): reassigned on every loop iteration, so the
            # 'test_time' value logged below spans only the latest iteration
            # rather than the whole episode -- confirm this is intended.
            episode_start_time = time.time()
            episode_length += 1

            value, logit, _, _, hidden = model((state_to_torch(state), hidden))
            prob = F.softmax(logit)
            # Greedy action: index of the largest probability.
            action = prob.max(1, keepdim=True)[1].data.numpy()

            # Repeat the chosen action for up to four single-step calls.
            for i in range(4):
                state, reward, done, _ = env.step(action[0, 0], steps=1)
                reward_sum += reward
                if done:
                    break
                else:
                    # Move axis 0 of the first state component to the end and
                    # scale by 255 into uint8.
                    obs_frame = (np.moveaxis(state[0], 0, -1) * 255).astype(
                        np.uint8)
                    # NOTE(review): obs_history is a list only until the first
                    # episode ends; afterwards it is an ndarray written by
                    # index and never reset, so a later episode with more
                    # frames than the first would index past its end --
                    # verify.
                    if isinstance(obs_history, list):
                        obs_history.append(obs_frame)
                    else:
                        obs_history[obs_index, :, :, :] = obs_frame
                    obs_index += 1
                    pose_history.append(env.pose())

            # a quick hack to prevent the agent from stucking
            # actions.append(action[0, 0])
            # if actions.count(actions[0]) == actions.maxlen:
            #     done = True

            if done:
                if isinstance(obs_history, list):
                    obs_history = np.array(obs_history)

                if loggers:
                    loggers['test_reward'](env.game.get_total_reward(), episode_counter)
                    loggers['video'](video(env.wad, env.current_map, goal_loc, obs_history, pose_history), episode_counter)
                    loggers['test_time'](time.time() - episode_start_time, episode_counter)

                print(
                    "Time {}, num episodes {}, FPS {:.0f}, episode reward {}, episode length {}".
                    format(
                        time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                        counter.value, counter.value / (time.time() - start_time),
                        reward_sum, episode_length))

                # Reset per-episode state; obs_history is deliberately left
                # as is (see the review note above).
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()
                obs_index = 0
                pose_history = []
                goal_loc = env.goal()
                hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)), (torch.zeros(1, 256), torch.zeros(1, 256)))
                # Wait before the next evaluation, then pick up new weights.
                time.sleep(args.eval_interval)
                model.load_state_dict(shared_model.state_dict())
                episode_counter += 1
        except Exception as err:
            # Signal the other users of the kill event before propagating.
            kill.set()
            raise err