def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0

    for e in range(3000):
        done = False
        memory = Memory()

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        loss = QNet.train_model(net, memory.sample(), optimizer)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)
        if running_score > goal_score:
            break
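The loop above depends on a Memory helper whose definition is not shown here. A minimal sketch of what it might look like, assuming push stores one transition and sample returns the whole episode grouped column-wise (the layout is an assumption, not the original implementation):

from collections import namedtuple

# Assumed transition layout; the original Memory class is not shown in this section.
Transition = namedtuple('Transition',
                        ('state', 'next_state', 'action', 'reward', 'mask'))

class Memory:
    def __init__(self):
        self.memory = []

    def push(self, state, next_state, action, reward, mask):
        # Store a single transition.
        self.memory.append(Transition(state, next_state, action, reward, mask))

    def sample(self):
        # Return the whole stored episode, grouped column-wise.
        return Transition(*zip(*self.memory))

    def __len__(self):
        return len(self.memory)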
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNet(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.1):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # For normal DQN
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # For double DQN: pick the greedy action with the local network,
        # evaluate it with the target network.
        next_actions = self.qnetwork_local(next_states).detach().argmax(dim=1, keepdim=True)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, next_actions)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
def main(): env = gym.make(env_name) env.seed(500) torch.manual_seed(500) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.n print('state size:', num_inputs) print('action size:', num_actions) online_net = QNet(num_inputs, num_actions) target_net = QNet(num_inputs, num_actions) target_net.load_state_dict(online_net.state_dict()) online_net.share_memory() target_net.share_memory() optimizer = SharedAdam(online_net.parameters(), lr=lr) global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue() writer = SummaryWriter('logs') online_net.to(device) target_net.to(device) online_net.train() target_net.train() workers = [ Worker(online_net, target_net, optimizer, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count()) ] [w.start() for w in workers] res = [] while True: r = res_queue.get() if r is not None: res.append(r) [ep, ep_r, loss] = r writer.add_scalar('log/score', float(ep_r), ep) writer.add_scalar('log/loss', float(loss), ep) else: break [w.join() for w in workers]
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### For a given input it outputs both pi(a|s) and V(s)
    ### V has a single output; the advantage function is computed at training time
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0
    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])

    memory = Memory()
    for e in range(10000):
        done = False
        ### Transitions are accumulated across episodes and trained on in
        ### chunks below, rather than keeping one episode's memory at a time

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: each action's value is converted directly into a
            ### probability and the action is sampled from that distribution
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            transition = [state, next_state, action, reward, mask]
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % 16 == 0:
            ### Every 16 episodes, train on everything accumulated so far
            loss, loss_policy, loss_value = QNet.train_model(
                net, optimizer, memory.sample())
            ### Reset the memory
            memory = Memory()

            df.loc[e, "steps"] = running_score
            df.loc[e, "loss_policy"] = loss_policy
            df.loc[e, "loss_value"] = loss_value
            print(
                "Ep {0:04d}: score: {1:02d}, loss_policy: {2}, loss_value: {3}"
                .format(e, int(running_score), loss_policy, loss_value))

        if running_score > goal_score:
            break

    df.to_csv("loss.csv")
def train(render): online_net = QNet(h=84, w=84, outputs=36) online_net.load_state_dict(torch.load('saved/online_net.pt')) target_net = QNet(h=84, w=84, outputs=36) update_target_model(online_net, target_net) optimizer = optim.Adam(online_net.parameters(), lr=lr) online_net.to(device) target_net.to(device) online_net.train() target_net.train() memory = Memory(replay_memory_capacity) memory = torch.load('saved/model_memory.pt') epsilon = 0.1 steps = 0 beta = beta_start loss = 0 for e in range(100000): #level = random.choice(LEVEL_SET) level = 'Level01' env = make_retro(game=env_name, state=level, use_restricted_actions=retro.Actions.DISCRETE) done = False total_reward = 0.0 state = env.reset() state = torch.Tensor(state).to(device).permute(2, 0, 1) #state = state.view(state.size()[0], -1) state = state.unsqueeze(0) while not done: steps += 1 action = get_action(state.to(device), target_net, epsilon, env) if render: env.render() next_state, reward, done, info = env.step(action) next_state = torch.Tensor(next_state).permute(2, 0, 1) #next_state = next_state.view(next_state.size()[0], -1) next_state = next_state.unsqueeze(0) total_reward += reward mask = 0 if done else 1 action_one_hot = torch.zeros(36) action_one_hot[action] = 1 reward = torch.tensor([info['score']]).to(device) memory.push(state, next_state, action_one_hot, reward, mask) state = next_state if len(memory) > initial_exploration: epsilon -= 0.00005 epsilon = max(epsilon, 0.02) beta += 0.00005 beta = min(1, beta) batch, weights = memory.sample(batch_size, online_net, target_net, beta) loss = QNet.train_model(online_net, target_net, optimizer, batch, weights) if steps % update_target == 0: update_target_model(online_net, target_net) if e % 1 == 0: print('{} episode | Total Reward: {}'.format(e, total_reward)) torch.save(online_net.state_dict(), 'saved/online_net.pt') torch.save(memory, 'saved/model_memory.pt') env.close()
def main(L, mouse_initial_indices, rewardlist, actions_list): if mouse_initial_indices is None: all_possible_starting_positions = np.array([*np.where(L == 1)]).T scores = [0] best_scores = [0] env = deepcopy(L) torch.manual_seed(2020) num_inputs = 2 + 1 num_actions = 4 print('state size:', num_inputs) print('action size:', num_actions) online_net = QNet(num_inputs, num_actions) target_net = QNet(num_inputs, num_actions) update_target_model(online_net, target_net) optimizer = optim.Adam(online_net.parameters(), lr=lr) # writer = SummaryWriter('logs') online_net.to(device) target_net.to(device) online_net.train() target_net.train() memory = Memory(replay_memory_capacity) running_score = 0 epsilon = 1.0 steps = 0 loss = 0 inint = mouse_initial_indices best_score = 0 number_episode = 1000 for e in range(number_episode): if inint is None: mouse_initial_indices = all_possible_starting_positions[ np.random.choice(range(len(all_possible_starting_positions)))] done = False env = deepcopy(L) eaubue = 0. score = 0 state = np.array(mouse_initial_indices) state = torch.Tensor(state).to(device) state = state.unsqueeze(0) while not done: steps += 1 action = get_action(state, target_net, epsilon, env, eaubue=eaubue) newstate = state + torch.Tensor(np.array( actions_list[action])).to(device) if env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist())] != 0: next_state = newstate new_eaubue = eaubue reward = rewardlist[env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist())]] if env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist())] == 2: done = True if env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist() )] == 4: #if the mouse is in the water env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist() )] = 5 #there is no more water new_eaubue = 1. else: next_state = state reward = rewardlist[0] new_eaubue = eaubue mask = 0 if done else 1 action_one_hot = np.zeros(4) action_one_hot[action] = 1 memory.push( torch.cat(( state, torch.tensor(eaubue).unsqueeze(0).unsqueeze(0).to(device)), 1), torch.cat((next_state, torch.tensor(new_eaubue).unsqueeze( 0).unsqueeze(0).to(device)), 1), action_one_hot, reward, mask) score += reward state = next_state eaubue = new_eaubue if steps > initial_exploration: epsilon -= 0.00005 epsilon = max(epsilon, 0.1) batch = memory.sample(batch_size) loss = QNet.train_model(online_net, target_net, optimizer, batch) if steps % update_target == 0: update_target_model(online_net, target_net) # print("OK") if score > 35: print(score) running_score = 0.99 * running_score + 0.01 * score # running_score=score scores.append(running_score) best_scores.append( score if score > best_scores[-1] else best_scores[-1]) if e % log_interval == 0: print( '{} episode | score: {:.2f} | best score: {:.2f} | epsilon: {:.2f}' .format(e, running_score, best_score, epsilon)) # writer.add_scalar('log/score', float(running_score), e) # writer.add_scalar('log/loss', float(loss), e) if score > best_score: best_score = score torch.save(online_net.state_dict(), "./qlearning_model") if running_score > goal_score: break return number_episode, scores, best_scores
class Agent(): def __init__(self, args, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.per = args.per self.dueling = args.dueling self.buffer_size = args.buffer_size self.batch_size = args.batch_size self.gamma = args.gamma self.tau = args.tau self.lr = args.learning_rate self.update_freq = args.update_every # Q-Network if self.dueling: self.local_qnet = DuelingQNet(state_size, action_size, seed).to(device) self.target_qnet = DuelingQNet(state_size, action_size, seed).to(device) else: self.local_qnet = QNet(state_size, action_size, seed).to(device) self.target_qnet = QNet(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.local_qnet.parameters(), lr=self.lr) # Replay Memory if self.per: self.memory = PrioritizedReplayMemory(args, self.buffer_size) else: self.memory = ReplayMemory(action_size, self.buffer_size, self.batch_size, seed) self.t_step = 0 # init time step for updating every UPDATE_EVERY steps def step(self, state, action, reward, next_state, done): if self.per: self.memory.append(state, action, reward, next_state, done) else: self.memory.add(state, action, reward, next_state, done) # save experience to replay memory. # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_freq if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: if self.dueling: self.learn_DDQN(self.gamma) else: self.learn(self.gamma) def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.local_qnet.eval() with torch.no_grad(): action_values = self.local_qnet(state) self.local_qnet.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, gamma): if self.per: idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample( self.batch_size) else: states, actions, rewards, next_states, dones = self.memory.sample() # Get max predicted Q values for next states from target model Q_targets_next = self.target_qnet(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.local_qnet(states).gather(1, actions) # Compute loss - element-wise mean squared error # Now loss is a Tensor of shape (1,) # loss.item() gets the scalar value held in the loss. 
loss = F.mse_loss(Q_expected, Q_targets) # Minimize loss self.optimizer.zero_grad() if self.per: (weights * loss).mean().backward( ) # Backpropagate importance-weighted minibatch loss else: loss.backward() self.optimizer.step() if self.per: errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy()) self.memory.update_priorities(idxs, errors) # Update target network self.soft_update(self.local_qnet, self.target_qnet, self.tau) def learn_DDQN(self, gamma): if self.per: idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample( self.batch_size) else: states, actions, rewards, next_states, dones = self.memory.sample() # Get index of maximum value for next state from Q_expected Q_argmax = self.local_qnet(next_states).detach() _, a_prime = Q_argmax.max(1) # Get max predicted Q values for next states from target model Q_targets_next = self.target_qnet(next_states).detach().gather( 1, a_prime.unsqueeze(1)) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.local_qnet(states).gather(1, actions) # Compute loss # Now loss is a Tensor of shape (1,) # loss.item() gets the scalar value held in the loss. loss = F.mse_loss(Q_expected, Q_targets) # Minimize loss self.optimizer.zero_grad() if self.per: (weights * loss).mean().backward( ) # Backpropagate importance-weighted minibatch loss else: loss.backward() self.optimizer.step() if self.per: errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy()) self.memory.update_priorities(idxs, errors) # Update target network self.soft_update(self.local_qnet, self.target_qnet, self.tau) def soft_update(self, local_model, target_model, tau): # θ_target = τ*θ_local + (1 - τ)*θ_target for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
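In the learn methods above, F.mse_loss uses its default 'mean' reduction, so the result is already a batch-averaged scalar and multiplying it by the importance-sampling weights does not weight individual samples. A per-sample variant is sketched below; it assumes Q_expected, Q_targets, and weights are all shaped (batch_size, 1) and is offered as a sketch, not the repository's implementation:

# Hedged sketch: per-sample importance-weighted loss for prioritized replay.
elementwise_loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
loss = (weights * elementwise_loss).mean()

self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()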
render_map = False

num_inputs = env.observation_space.shape
num_actions = len(env.action_names[0])
print('state size:', num_inputs)
print('action size:', num_actions)

model = QNet(num_actions)
model.apply(weights_init)
target_model = QNet(num_actions)
update_target_model(model, target_model)

model.train()
target_model.train()

optimizer = optim.Adam(model.parameters(), lr=hp.lr, weight_decay=hp.l2_rate)
memory = Memory(100000)

if render_map:
    root, canvas = init_map()

steps = 0
scores = []
epsilon = 1.0

for episode in range(hp.num_episodes):
    state = env.reset()
    state = pre_process(state)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (84, 84, 4))
def main(): if not (os.path.isdir("logs")): os.makedirs("logs") if (args.entropy and args.boltzmann): raise ValueError("Entropy as well as Boltzmann set.") print(args) working_dir = "logs/" + args.dir if not (os.path.isdir(working_dir)): os.mkdir(working_dir) env = QubeSwingupEnv(use_simulator=True) num_inputs = env.observation_space.shape[0] num_actions = NUMBER_OF_ACTIONS print('state size:', num_inputs) print('action size:', num_actions) online_net = QNet(num_inputs, num_actions) target_net = QNet(num_inputs, num_actions) update_target_model(online_net, target_net) optimizer = optim.Adam(online_net.parameters(), lr=lr) writer = SummaryWriter(working_dir) online_net.to(device) target_net.to(device) online_net.train() target_net.train() memory = Memory_With_TDError(replay_memory_capacity) running_score = 0 epsilon = 1.0 steps = 0 beta = beta_start loss = 0 training_started = False best_running_score = -1000 for e in range(args.e): done = False score = 0 state = env.reset() state = torch.Tensor(state).to(device) state = state.unsqueeze(0) start_time = time.time() while not done: steps += 1 action = get_action(state, target_net, epsilon, use_entropy=args.entropy, use_boltzmann=args.boltzmann) next_state, reward, done, info = env.step( get_continuous_action(action)) reward = give_me_reward(info["alpha"], info["theta"]) next_state = torch.Tensor(next_state).to(device) next_state = next_state.unsqueeze(0) mask = 0 if done else 1 action_one_hot = np.zeros(NUMBER_OF_ACTIONS) action_one_hot[action] = 1 memory.push(state, next_state, action_one_hot, reward, mask) score += reward state = next_state if steps > initial_exploration: if not training_started: print("---------------- training started ---------------") training_started = True epsilon -= 0.000005 epsilon = max(epsilon, 0.1) beta += 0.000005 beta = min(1, beta) batch, weights = memory.sample(batch_size, online_net, target_net, beta) loss = QNet.train_model(online_net, target_net, optimizer, batch, weights, device) if steps % update_target == 0: update_target_model(online_net, target_net) end_time = time.time() running_score = 0.99 * running_score + 0.01 * score if e % log_interval == 0: print( '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'. format(e, running_score, epsilon, beta)) writer.add_scalar('log/score', float(running_score), e) writer.add_scalar('log/loss', float(loss), e) if running_score > best_running_score and args.save: torch.save(online_net.state_dict(), working_dir + "/best_model.pth") best_running_score = running_score
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    ### The network's input/output sizes depend on the environment
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    ### Move each network to CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### Both networks start in training mode
    online_net.train()
    target_net.train()

    ### Initial setup before training
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### Action selection is done with target_net
            action = get_action(state, target_net, epsilon, env)

            ### Observe the next state and collect the reward
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            if e % 10 == 0:
                print(next_state, action, reward)

            ### Rewritten from the one-liner for readability
            if done:
                mask = 0
            else:
                mask = 1

            ### Record the transition in memory
            action_one_hot = np.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            ### The reward is basically -1
            score += reward  ### Only used to track how many steps the episode lasted
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                ### Train online_net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer, batch)

                ### Periodically overwrite target_net with online_net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 200.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
        if running_score > goal_score:
            break
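The loop above calls update_target_model and get_action, neither of which is defined in this section. A minimal sketch of both, assuming QNet's forward pass returns Q-values and that update_target_model performs a hard copy (both assumptions, not the original code):

# Hedged sketch of the two helpers assumed by the DQN loop above.
def update_target_model(online_net, target_net):
    # Hard copy of the online weights into the target network.
    target_net.load_state_dict(online_net.state_dict())

def get_action(state, target_net, epsilon, env):
    # Epsilon-greedy action selection from the target network's Q-values.
    if np.random.rand() <= epsilon:
        return env.action_space.sample()
    with torch.no_grad():
        return target_net(state).argmax(dim=1).item()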
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    ### The network's input/output sizes depend on the environment
    # num_inputs = env.observation_space.shape[0]
    num_inputs = 1024
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    ### Move each network to CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### Both networks start in training mode
    online_net.train()
    target_net.train()

    ### Pretrained model for feature extraction
    # pre_model = models.resnet50(pretrained=True)
    # pre_model.fc = nn.Identity()
    pre_model = models.squeezenet1_0(pretrained=True)
    pre_model.classifier = nn.AdaptiveAvgPool2d((1, 1))
    pre_model.to(device)

    def state_to_feature(state):
        state_img = render_cv2img(state[0], state[2])
        state_img = cv2.resize(state_img, (224, 224))[:, :, 0]
        state_img = state_img.reshape((1, 224, 224))
        state_img_rgb = np.zeros((1, 3, 224, 224))
        state_img_rgb[:, 0] = state_img
        state_img_rgb[:, 1] = state_img
        state_img_rgb[:, 2] = state_img
        state_img_rgb_tensor = torch.Tensor(state_img_rgb).to(device)
        state_feature = pre_model(state_img_rgb_tensor)
        return state_feature

    ### Where the memory is stored (work in progress)
    memory_dir = "memory/"
    memory = Memory(replay_memory_capacity, memory_dir)

    ### Initial setup before training
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False

        score = 0
        ### state = [position, velocity, angle, angular velocity]
        state = env.reset()  ### e.g. [-0.01517264  0.02423424  0.02480018 -0.04009749]
        ### state = [[feature vector extracted by the pretrained model]]
        state = state_to_feature(state)
        ### Having no information from the previous timestep makes the task hard,
        ### so keep the previous state as well; starting it equal to state is fine
        previous_state = state

        while not done:
            steps += 1

            ### Action selection is done with target_net
            previous_present_state = torch.cat((previous_state, state), 1)
            action = get_action(previous_present_state, target_net, epsilon, env)

            ### Observe the next state and collect the reward
            next_state, reward, done, _ = env.step(action)
            next_state = state_to_feature(next_state)
            present_next_state = torch.cat((state, next_state), 1)

            ### Rewritten from the one-liner for readability
            if done:
                mask = 0
            else:
                mask = 1

            if (done and (score != 499)):  ### Episode ended before reaching step 499
                reward = -1
            else:
                pass  ### The reward is basically 1

            ### Record the transition in memory
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(previous_present_state, present_next_state,
                        action_one_hot, reward, mask)

            ### The reward is basically 1
            score += reward  ### Only used to track how many steps the episode lasted

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                ### Train online_net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer, batch)

                ### Periodically overwrite target_net with online_net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

            ### Move to the next step
            previous_state = state
            state = next_state

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
        if running_score > goal_score:
            break
train_loader = DataLoader(train_data, shuffle=True, batch_size=args.batch, num_workers=8, pin_memory=True) #create the loader for the validation set val_data = HdrVdpDataset(val_data, args.data, args.group, bPrecompGroup = args.groupprecomp) val_loader = DataLoader(val_data, shuffle=False, batch_size=args.batch, num_workers=8, pin_memory=True) #create the loader for the testing set test_data = HdrVdpDataset(test_data, args.data, args.group, bPrecompGroup = args.groupprecomp) test_loader = DataLoader(test_data, shuffle=False, batch_size=args.batch, num_workers=8, pin_memory=True) #create the model if(torch.cuda.is_available()): model = QNet().cuda() else: model = QNet() #create the optmizer optimizer = Adam(model.parameters(), lr=args.lr) scheduler = ReduceLROnPlateau(optimizer, patience=15, factor=0.5, verbose=True) log = pd.DataFrame() #training loop best_mse = None a_t = [] a_v = [] a_te = [] start_epoch = 1 if args.resume: ckpt_dir_r = os.path.join(args.resume, 'ckpt') ckpts = glob2.glob(os.path.join(ckpt_dir_r, '*.pth')) assert ckpts, "No checkpoints to resume from!"
# Build environment env = make_atari('PongNoFrameskip-v4', stack=2) env = wrap_pytorch(env) number_actions = env.action_space.n replay_buffer = ReplayBuffer(replay_memory_size) # Separate target net & policy net input_shape = env.reset().shape current_net = QNet(input_shape, number_actions).to(device) target_net = QNet(input_shape, number_actions).to(device) # with older weights target_net.load_state_dict(current_net.state_dict()) target_net.eval() optimizer = opt_algorithm(current_net.parameters(), lr=learning_rate) n_episode = 1 episode_return = 0 best_return = 0 returns = [] state = env.reset() for i in count(): # env.render() eps = get_epsilon(i) action = select_action(state, current_net, eps, number_action=number_actions) next_state, reward, done, _ = env.step(action) replay_buffer.push(state, action, reward, next_state, done)
class PPO(Algorithm): def __init__(self, *largs, **kwargs): super(PPO, self).__init__(*largs, **kwargs) self.pi_net = PiNet(self.ns, self.na, distribution='Normal', bounded=False, agent='ppo').to(self.device) self.v_net = QNet(self.ns, 0, agent='ppo').to(self.device) self.optimizer_v = torch.optim.Adam(self.v_net.parameters(), lr=self.lr_q, betas=(0.9, 0.999), weight_decay=self.weight_decay_q) self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(), lr=self.lr_p, betas=(0.9, 0.999), weight_decay=self.weight_decay_p) def play(self, env, evaluate=False): with torch.no_grad(): a = self.pi_net(env.s, evaluate=evaluate) if not (self.env_steps >= self.warmup_steps or evaluate): a = None state = env(a) state['logp'] = self.pi_net.log_prob(state['a']).detach() return state def episodic_training(self, train_results, tail): episode = self.replay_buffer.get_tail(tail) sl = episode['s'] sl = list(torch.chunk(sl, int((len(sl) / self.batch) + 1))) s, r, t, e = [episode[k] for k in ['s', 'r', 't', 'e']] v = [] for s in sl: v.append(self.v_net(s)) v.append(torch.zeros_like(v[0][:1])) v = torch.cat(v).detach() v1, v2 = v[:-1], v[1:] adv, v_target = generalized_advantage_estimation( r, t, e, v1, v2, self.gamma, self.lambda_gae, norm=self.norm_rewards) episode['adv'] = adv episode['v_target'] = v_target if self.batch_ppo: n = self.steps_per_episode * self.batch indices = torch.randperm(tail * max(1, n // tail + 1)) % tail indices = indices[:n].unsqueeze(1).view(self.steps_per_episode, self.batch) samples = {k: v[indices] for k, v in episode.items()} iterator_pi = iter_dict(samples) iterator_v = iter_dict(samples) else: iterator_pi = itertools.repeat(episode, self.steps_per_episode) iterator_v = itertools.repeat(episode, self.steps_per_episode) for i, sample in enumerate(iterator_pi): s, a, r, t, stag, adv, v_target, log_pi_old = [ sample[k] for k in ['s', 'a', 'r', 't', 'stag', 'adv', 'v_target', 'logp'] ] self.pi_net(s) log_pi = self.pi_net.log_prob(a) ratio = torch.exp((log_pi - log_pi_old).sum(dim=1)) clip_adv = torch.clamp(ratio, 1 - self.eps_ppo, 1 + self.eps_ppo) * adv loss_p = -(torch.min(ratio * adv, clip_adv)).mean() approx_kl = -float((log_pi - log_pi_old).sum(dim=1).mean()) ent = float(self.pi_net.entropy().sum(dim=1).mean()) if approx_kl > self.target_kl: train_results['scalar']['pi_opt_rounds'].append(i) break clipped = ratio.gt(1 + self.eps_ppo) | ratio.lt(1 - self.eps_ppo) clipfrac = float( torch.as_tensor(clipped, dtype=torch.float32).mean()) self.optimizer_p.zero_grad() loss_p.backward() if self.clip_p: nn.utils.clip_grad_norm(self.pi_net.parameters(), self.clip_p) self.optimizer_p.step() train_results['scalar']['loss_p'].append(float(loss_p)) train_results['scalar']['approx_kl'].append(approx_kl) train_results['scalar']['ent'].append(ent) train_results['scalar']['clipfrac'].append(clipfrac) for sample in iterator_v: s, a, r, t, stag, adv, v_target, log_pi_old = [ sample[k] for k in ['s', 'a', 'r', 't', 'stag', 'adv', 'v_target', 'logp'] ] v = self.v_net(s) loss_v = F.mse_loss(v, v_target, reduction='mean') self.optimizer_v.zero_grad() loss_v.backward() if self.clip_q: nn.utils.clip_grad_norm(self.v_net.parameters(), self.clip_q) self.optimizer_v.step() train_results['scalar']['loss_v'].append(float(loss_v)) return train_results
def main():
    ### Initialize the environment
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(10000):
        done = False
        ### Memory is cleared every episode (so effectively there is no experience replay)
        memory = Memory()

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: each action's value is converted directly into a
            ### probability and the action is sampled from that distribution
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        ### Train on the whole episode at once
        ### memory.sample does not pick transitions at random; it returns the
        ### entire memory of the episode
        loss = QNet.train_model(net, optimizer, memory.sample())

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
        if running_score > goal_score:
            break
class Agent(): """Agent definition for interacting with environment""" def __init__(self, state_size, action_size, seed): """ Params ====== state_size (int): state dimension action_size (int): action dimension seed (int): random seed for replicating experiment """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.QNet_local = QNet(state_size, action_size, seed).to(device) self.QNet_target = QNet(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.QNet_local.parameters(), lr=LR) self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Add current experience to replay memory self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Get favored action Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.QNet_local.eval() with torch.no_grad(): action_values = self.QNet_local(state) self.QNet_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Perform learning on experiences Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences Q_targets_next = self.QNet_target(next_states).detach().max( 1)[0].unsqueeze(1) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.QNet_local(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.QNet_local, self.QNet_target, TAU) def soft_update(self, local_model, target_model, tau): """ θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): model to copy weights from target_model (PyTorch model): copy to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(net, target_net)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()

    memory = Memory(10000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(state)
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            memory.push(state, next_state, action, reward, mask)
            score += reward
            state = next_state

            if steps > args.initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                train_model(net, target_net, optimizer, batch, args.batch_size)

                # Sync the target network every args.update_target steps
                if steps % args.update_target == 0:
                    update_target_model(net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds the goal score, so training ends')
            break
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### For a given input it outputs both pi(a|s) and Q(s, a)
    ### The two heads share the same dimensions and unit counts
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0
    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])

    for e in range(10000):
        done = False
        ### Learn one step at a time, without keeping even one episode of memory

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        lp = []
        lv = []
        while not done:
            steps += 1

            ### No epsilon: each action's value is converted directly into a
            ### probability and the action is sampled from that distribution
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]

            score += reward
            state = next_state

            ### At every step, learn only from that step's transition
            loss, loss_policy, loss_value = QNet.train_model(net, optimizer, transition)
            # loss = QNet.train_model(net, optimizer, transition)
            lp.append(loss_policy.item())
            lv.append(loss_value.item())

        lp = np.asarray(lp[:-1]).sum() / (len(lp) - 1)
        lv = np.asarray(lv[:-1]).sum() / (len(lv) - 1)
        print("Ep {0:04d}: {1} step, loss_policy: {2}, loss_value: {3}".format(
            e, steps - steps_before, lp, lv))
        # print("Ep {0:04d}: {1} step".format(e, steps - steps_before))

        df.loc[e, "steps"] = steps - steps_before
        df.loc[e, "loss_policy"] = lp
        df.loc[e, "loss_value"] = lv

        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
        if running_score > goal_score:
            break

    df.to_csv("loss.csv")
class Learner: def __init__(self, n_actors, shared_dict, device='cuda:0'): # params self.gamma = 0.99 self.alpha = 0.6 self.bootstrap_steps = 3 self.initial_exploration = 50000 self.priority_epsilon = 1e-6 self.device = device self.n_epochs = 0 self.n_actors = n_actors # path self.memory_path = os.path.join('./', 'logs', 'memory') # memory self.burn_in_length = 10 self.learning_length = 10 self.sequence_length = self.burn_in_length + self.learning_length self.memory_size = 500000 self.batch_size = 8 self.memory_load_interval = 20 self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps) # net self.shared_dict = shared_dict self.net_save_interval = 100 self.target_update_interval = 1000 self.net = QNet(self.device).to(self.device) self.target_net = QNet(self.device).to(self.device) self.target_net.load_state_dict(self.net.state_dict()) self.save_model() self.optim = optim.RMSprop(self.net.parameters(), lr=0.00025 / 4.0, alpha=0.95, eps=1.5e-7, centered=True) def run(self): while True: if self.replay_memory.size > self.initial_exploration: self.train() if self.n_epochs % 100 == 0: print('trained', self.n_epochs, 'epochs') self.interval() def train(self): batch, seq_index, index = self.replay_memory.sample(self.device) self.net.set_state(batch['hs'], batch['cs']) self.target_net.set_state(batch['target_hs'], batch['target_cs']) ### burn-in step ### state = batch['state'][:self.burn_in_length] next_state = batch['next_state'][:self.burn_in_length] with torch.no_grad(): _ = self.net(state) _ = self.target_net(next_state) ### learning step ### state = batch['state'][self.burn_in_length:] next_state = batch['next_state'][self.burn_in_length:] # q_value q_value = self.net(state).gather(1, batch['action'].view(-1, 1)) # target q_value with torch.no_grad(): next_action = torch.argmax(self.net(next_state), 1).view(-1, 1) next_q_value = self.target_net(next_state).gather(1, next_action) target_q_value = batch["reward"].view( -1, 1) + (self.gamma**self.bootstrap_steps) * next_q_value * ( 1 - batch['done'].view(-1, 1)) # update self.optim.zero_grad() loss = torch.mean(0.5 * (q_value - target_q_value)**2) loss.backward() self.optim.step() priority = (np.abs( (q_value - target_q_value).detach().cpu().numpy()).reshape(-1) + self.priority_epsilon)**self.alpha self.replay_memory.update_priority( index[self.burn_in_length:].reshape(-1), priority) self.replay_memory.update_sequence_priority(seq_index, True) def interval(self): self.n_epochs += 1 if self.n_epochs % self.target_update_interval == 0: self.target_net.load_state_dict(self.net.state_dict()) if self.n_epochs % self.net_save_interval == 0: self.save_model() if self.n_epochs % self.memory_load_interval == 0: for i in range(self.n_actors): self.replay_memory.load(self.memory_path, i) def save_model(self): self.shared_dict['net_state'] = deepcopy(self.net).cpu().state_dict() self.shared_dict['target_net_state'] = deepcopy( self.target_net).cpu().state_dict()
def main(): # cartpole test if (cartpole_test): envs_fun = [lambda: gym.make('CartPole-v0')] envs_fun = np.tile(envs_fun, 3) envs = ShmemVecEnv(envs_fun) dummy_env = envs_fun[0]() else: INPUT_FILE = '../data/05f2a901.json' with open(INPUT_FILE, 'r') as f: puzzle = json.load(f) envs_fun = [ lambda: gym.make('arc-v0', input=task['input'], output=task['output'], need_ui=need_ui) for task in puzzle['train'] ] #pdb.set_trace() envs_fun = envs_fun[0:1] envs = ShmemVecEnv(envs_fun) dummy_env = envs_fun[0]() env_num = len(envs_fun) torch.manual_seed(500) num_inputs = dummy_env.observation_space.shape[0] num_actions = dummy_env.action_space.n print('state size:', num_inputs) print('action size:', num_actions) online_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode) target_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode) if (evalution_mode): online_net = torch.load('../result/arc0.model') target_net = torch.load('../result/arc0.model') update_target_model(online_net, target_net) optimizer = optim.Adam(online_net.parameters(), lr=lr) writer = SummaryWriter('logs') online_net.to(device) target_net.to(device) online_net.train() target_net.train() memory = Memory(replay_memory_capacity) score = 0 epsilon = 1.0 steps = 0 loss = 0 states = envs.reset() try: while True: if (need_ui): envs.render() steps += 1 global initial_exploration if (initial_exploration > 0): initial_exploration -= 1 actions = [] for state in states: state = torch.Tensor(state).to(device) state = state.unsqueeze(0) action = get_action(state, target_net, 0 if evalution_mode else epsilon, dummy_env) if (evalution_mode): print(action) actions.append(action) next_states, rewards, dones, info = envs.step(actions) #print(rewards) masks = np.zeros(envs.num_envs) for i in range(envs.num_envs): masks[i] = 0 if dones[i] else 1 for i in range(envs.num_envs): #print(rewards[i]) action_one_hot = np.zeros(dummy_env.action_space.n) action_one_hot[actions[i]] = 1 memory.push(states[i], next_states[i], action_one_hot, rewards[i], masks[i]) #score += reward states = next_states if not evalution_mode and steps > initial_exploration: epsilon -= 0.00003 epsilon = max(epsilon, 0.1) batch = memory.sample(batch_size) loss = QNet.train_model(online_net, target_net, optimizer, batch, device) if steps % update_target == 0: update_target_model(online_net, target_net) if (steps > 1028): states = envs.reset() steps = 0 print( 'new epsisode ------------------------------------------') except KeyboardInterrupt: print('save model') torch.save(target_net, '../result/arc.model') sys.exit(0)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()

    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward, mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                # Sync the target network every args.update_target steps
                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)
            history = next_history

        if e % args.log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'
                .format(e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('score exceeds the goal score, so training ends')
            break
class QTDAgent(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 learning_rate=0.001,
                 reward_decay=0.99,
                 e_greedy=0.9):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.lr = learning_rate
        self.gamma = reward_decay  # named according to the parameters in the formulation
        self.epsilon = e_greedy
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.EPS_DECAY = 30000  # this decay is too slow
        # TODO: figure out the relationship between the decay and the total number of steps,
        # and try to use a better strategy for it.
        use_cuda = torch.cuda.is_available()
        self.LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
        self.FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
        self.model = QNet(self.state_dim, self.action_dim).cuda() if use_cuda else QNet(
            self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        # self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10000, gamma=0.5)
        # The learning rate would decrease by a factor of gamma every 10000 steps.
        util.weights_init(self.model)

    def sbc(self, v, volatile=False):
        return Variable(self.FloatTensor((np.expand_dims(v, 0).tolist())),
                        volatile=volatile)

    def get_actions(self, state):
        action = self.model(self.sbc(state, volatile=True))
        return action

    def select_action(self, state, steps_done):
        util.adjust_learning_rate(self.optimizer,
                                  self.lr,
                                  steps_done,
                                  10000,
                                  lr_decay=0.2)
        # global steps_done
        sample = random.random()
        esp_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            np.exp(-1. * steps_done / self.EPS_DECAY)
        if sample > esp_threshold:
            actions = self.get_actions(state)
            action = actions.data.max(1)[1].view(1, 1)
            return action
        else:
            return self.LongTensor([[random.randrange(self.action_dim)]])

    def update(self, pending):
        # def update(self, s, a, r, s_, a_, done=False):
        pending_len = len(pending)
        loss = 0
        while (pending_len):
            pending_len = pending_len - 1
            [s, a, r, s_, a_, done] = pending[pending_len]
            if (done == True):
                expect_state_action_value = r
            else:
                non_final_next_states = self.model(self.sbc(s_, volatile=True))
                expect_state_action_value = r + self.gamma * non_final_next_states.max(
                    1)[0]
                expect_state_action_value.volatile = False
            # expect_state_action_value = r + self.gamma * self.model(Variable(
            #     torch.from_numpy(np.expand_dims(s_, 0).astype('float32')))).max(1)[0]
            state_action_value = self.model(self.sbc(s))[0, a]
            loss += 0.5 * (state_action_value - expect_state_action_value).pow(2)

        self.optimizer.zero_grad()
        loss.backward()
        # for param in self.model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def save_model(self, path):
        torch.save(self.model.state_dict(), '{}QTDAgent.pt'.format(path))
        # torch.save(self.target_critic.state_dict(), '{}/critic.pt'.format(path))
        print('Models saved successfully')

    def load_model(self, name):
        self.model.load_state_dict(name)
class SAC(object): def __init__(self, input_size, action_size, gamma, tau, alpha, hidden_size, lr, device): self.gamma, self.tau, self.alpha = gamma, tau, alpha self.lr, self.device = lr, device self.policy = Actor(input_size, hidden_size, action_size).to(self.device) self.critic = QNet(input_size, hidden_size, action_size).to(self.device) self.policy_optim = torch.optim.Adam(self.policy.parameters(), lr=self.lr) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.lr) self.critic_target = copy.deepcopy(self.critic) self.critic_target.requires_grad_(False) @torch.no_grad() def select_action(self, obs, sample=True): obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0) policy = self.policy(obs) action = Cat(raw_base=policy).sample(onehot=False, sample=sample) action = action.cpu().numpy()[0] return action def update_parameters(self, batch): obs, act, rew, done, obs_next = batch obs = torch.FloatTensor(obs).to(self.device) act = torch.LongTensor(act).unsqueeze(-1).to(self.device) rew = torch.FloatTensor(rew).unsqueeze(-1).to(self.device) done = torch.BoolTensor(done).unsqueeze(-1).to(self.device) obs_next = torch.FloatTensor(obs_next).to(self.device) with torch.no_grad(): next_policy = Cat(raw_base=self.policy(obs_next)) next_q = torch.min(*self.critic_target(obs_next)) next_eval = (next_policy.probs * next_q).sum(dim=-1, keepdim=True) next_entr = -(next_policy.probs * next_policy.logits).sum( dim=-1, keepdim=True) next_v = (next_eval + self.alpha * next_entr).masked_fill(done, 0.) q_targ = rew + self.gamma * next_v self.critic_optim.zero_grad() q1, q2 = self.critic(obs) q_pred = torch.min(q1, q2).detach() q1, q2 = q1.gather(dim=-1, index=act), q2.gather(dim=-1, index=act) critic_loss = (q1 - q_targ).pow(2.).mul(0.5) + ( q2 - q_targ).pow(2.).mul(0.5) critic_loss = critic_loss.mean() critic_loss.backward() self.critic_optim.step() with torch.no_grad(): critic_loss = (torch.min(q1, q2) - q_targ).pow(2.).mul(0.5).mean() self.policy_optim.zero_grad() policy = Cat(raw_base=self.policy(obs)) policy_entr = -(policy.probs.detach() * policy.logits).sum(dim=-1).mean() policy_eval = (policy.probs * q_pred).sum(dim=-1).mean() policy_loss = self.alpha * policy_entr - policy_eval policy_loss.backward() self.policy_optim.step() soft_update(self.critic_target, self.critic, self.tau) loss_info = { 'critic_loss': critic_loss.item(), 'policy_loss': policy_loss.item(), 'policy_entr': policy_entr.item() } return loss_info
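update_parameters above ends with a call to soft_update, which is not defined in this section. A minimal sketch matching the call signature soft_update(target, source, tau), assuming standard Polyak averaging:

# Hedged sketch of the soft_update helper used by the SAC class above.
def soft_update(target_net, source_net, tau):
    for target_param, source_param in zip(target_net.parameters(),
                                          source_net.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)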
def main(): env = gym.make(args.env_name) env.seed(500) torch.manual_seed(500) state_size = env.observation_space.shape[0] action_size = env.action_space.n print('state size:', state_size) print('action size:', action_size) q_net = QNet(state_size, action_size, args) target_q_net = QNet(state_size, action_size, args) optimizer = optim.Adam(q_net.parameters(), lr=0.001) update_target_model(q_net, target_q_net) writer = SummaryWriter(args.logdir) replay_buffer = deque(maxlen=10000) running_score = 0 steps = 0 for episode in range(args.max_iter_num): done = False score = 0 state = env.reset() state = np.reshape(state, [1, state_size]) while not done: if args.render: env.render() steps += 1 q_values = q_net(torch.Tensor(state)) action = get_action(q_values, action_size, args.epsilon) next_state, reward, done, _ = env.step(action) next_state = np.reshape(next_state, [1, state_size]) reward = reward if not done or score == 499 else -1 mask = 0 if done else 1 replay_buffer.append((state, action, reward, next_state, mask)) state = next_state score += reward if steps > args.initial_exploration: args.epsilon -= args.epsilon_decay args.epsilon = max(args.epsilon, 0.1) mini_batch = random.sample(replay_buffer, args.batch_size) q_net.train(), target_q_net.train() train_model(q_net, target_q_net, optimizer, mini_batch) if steps % args.update_target == 0: update_target_model(q_net, target_q_net) score = score if score == 500.0 else score + 1 running_score = 0.99 * running_score + 0.01 * score if episode % args.log_interval == 0: print( '{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format( episode, running_score, args.epsilon)) writer.add_scalar('log/score', float(score), episode) if running_score > args.goal_score: if not os.path.isdir(args.save_path): os.makedirs(args.save_path) ckpt_path = args.save_path + 'model.pth.tar' torch.save(q_net.state_dict(), ckpt_path) print('Running score exceeds 400. So end') break
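The train_model step called above is not shown. A sketch of what it might do, assuming each replay entry is the (state, action, reward, next_state, mask) tuple pushed in the loop and that a discount factor is available as args.gamma (an assumption):

# Hedged sketch of train_model for the deque-based replay buffer above.
def train_model(q_net, target_q_net, optimizer, mini_batch):
    states, actions, rewards, next_states, masks = zip(*mini_batch)

    states = torch.Tensor(np.vstack(states))
    next_states = torch.Tensor(np.vstack(next_states))
    actions = torch.LongTensor(actions).unsqueeze(1)
    rewards = torch.Tensor(rewards).unsqueeze(1)
    masks = torch.Tensor(masks).unsqueeze(1)

    q_values = q_net(states).gather(1, actions)
    with torch.no_grad():
        next_q = target_q_net(next_states).max(1)[0].unsqueeze(1)
        targets = rewards + masks * args.gamma * next_q  # args.gamma is assumed

    loss = F.mse_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()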
class Off_policy(Algo): def __init__(self): super(Off_policy, self).__init__() self.memory = Replay_buffer(capacity=p.exploitory_policy_memory_size) self.exploratory_policy = GaussianPolicy( self.state_space, self.action_space).to(self.device) self.exploratory_Q = QNet(self.state_space, self.action_space).to(self.device) self.exploratory_Q_target = QNet(self.state_space, self.action_space).to(self.device) self.exploratory_policy_optim = Adam( self.exploratory_policy.parameters(), lr=p.lr) self.exploratory_Q_optim = Adam(self.exploratory_Q.parameters(), lr=p.lr) self.target_update(self.exploratory_policy, self.exploitory_policy, 1.0) self.kl_normalizer = Normalizer(1) self.ex_rewards_normalizer = Normalizer(1) def start(self): total_numsteps = 0 for episode in itertools.count(1): episode_rewards = 0.0 episode_steps = 0 done = False state = self.env.reset() while not done: episode_steps += 1 if p.random_steps > total_numsteps: action = self.env.action_space.sample() else: norm_state = self.obs_normalizer.normalize(state) action = self.select_action(norm_state, self.exploratory_policy) if len(self.memory) > p.exploitory_batch_size and len( self.memory) > p.exploratory_batch_size: for i in range(p.exploitory_policy_updates_per_steps): qf1_loss, qf2_loss, policy_loss, alpha_loss, alpha, ex_reward_model_loss = self.update_exploitory_policy( self.memory) if episode % p.exploitory_target_update_interval == 0: self.target_update(self.exploitory_Q_target, self.exploitory_Q, p.tau) for i in range(p.exploratory_policy_updates_per_steps): ex_qf1_loss, ex_qf2_loss, ex_policy_loss, divergence_loss = self.update_exploratory_policy( self.memory) if episode % p.exploratory_target_update_interval == 0: self.target_update(self.exploratory_Q_target, self.exploratory_Q, p.tau) next_state, reward, done, _ = self.env.step(action) total_numsteps += 1 episode_rewards += reward # Ignore the done signal if it comes from hitting the time horizon. 
                mask = 1.0 if episode_steps == self.env._max_episode_steps else float(not done)
                self.memory.push((state, action, reward, next_state, mask))
                self.obs_normalizer.update(state)
                state = next_state

            if episode % p.test_freq == 0:
                average_rewards, average_episode_steps = self.test_current_policy()
                try:
                    data = {
                        'average_rewards': average_rewards,
                        'total_numsteps': total_numsteps,
                        'average_episode_steps': average_episode_steps,
                        'qf1_loss': qf1_loss,
                        'qf2_loss': qf2_loss,
                        'exploitory_policy_loss': policy_loss,
                        'alpha_loss': alpha_loss,
                        'alpha_value': alpha,
                        'ex_qf1_loss': ex_qf1_loss,
                        'ex_qf2_loss': ex_qf2_loss,
                        'ex_policy_loss': ex_policy_loss,
                        'ex_reward_model_loss': ex_reward_model_loss,
                        'divergence_loss': divergence_loss
                    }
                    self.log(data)
                except UnboundLocalError:
                    # The losses only exist once the first update has run.
                    pass

            if total_numsteps > p.max_numsteps:
                self.env.close()
                self.writer.close()
                break

    def update_exploratory_policy(self, memory):
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            p.exploitory_batch_size)
        state_batch = self.obs_normalizer.normalize(state_batch)
        next_state_batch = self.obs_normalizer.normalize(next_state_batch)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            # Intrinsic (exploration) rewards, normalized by their running statistics.
            ex_rewards = self.ex_reward_model.get_reward(state_batch, next_state_batch)
            ex_rewards = ex_rewards.unsqueeze(1).cpu().numpy()
            ex_reward_batch = self.ex_rewards_normalizer.normalize(ex_rewards)
            self.ex_rewards_normalizer.update(ex_rewards)
            ex_reward_batch = torch.FloatTensor(ex_reward_batch).to(self.device)

            ex_next_state_action, ex_next_state_log_pi, _ = self.exploratory_policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.exploratory_Q_target(
                next_state_batch, ex_next_state_action)

            # Divergence between exploratory and exploitory policies, estimated from the
            # clamped log-probability difference of the sampled action.
            # (An earlier variant computed this analytically with
            # torch.distributions.kl_divergence between the two policy Gaussians.)
            ex_next_state_log_prob = torch.clamp(
                self.exploratory_policy.get_logprob(next_state_batch, ex_next_state_action),
                min=p.log_std_min, max=p.log_std_max)
            next_state_log_prob = torch.clamp(
                self.exploitory_policy.get_logprob(next_state_batch, ex_next_state_action),
                min=p.log_std_min, max=p.log_std_max)
            kl_div = (ex_next_state_log_prob - next_state_log_prob).mean(1).unsqueeze(1)

            min_qf_next_target = p.ex_alpha * (
                torch.min(qf1_next_target, qf2_next_target) -
                (p.alpha * ex_next_state_log_pi)) - kl_div
            next_q_value = ex_reward_batch + mask_batch * p.gamma * min_qf_next_target

        # Exploratory critic update
        qf1, qf2 = self.exploratory_Q(state_batch, action_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.exploratory_Q_optim.zero_grad()
        qf_loss.backward()
        self.exploratory_Q_optim.step()

        # Exploratory actor update
        ex_pi, ex_log_pi, _ = self.exploratory_policy.sample(state_batch)
        qf1_pi, qf2_pi = self.exploratory_Q(state_batch, ex_pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        ex_state_log_prob = torch.clamp(
            self.exploratory_policy.get_logprob(state_batch, ex_pi),
            min=p.log_std_min, max=p.log_std_max)
        with torch.no_grad():
            state_log_prob = torch.clamp(
                self.exploitory_policy.get_logprob(state_batch, ex_pi),
                min=p.log_std_min, max=p.log_std_max)
        kl_div = (ex_state_log_prob - state_log_prob).mean(1).unsqueeze(1)

        policy_loss = (p.ex_alpha * ((p.alpha * ex_log_pi) - min_qf_pi) + kl_div).mean()

        self.exploratory_policy_optim.zero_grad()
        policy_loss.backward()
        self.exploratory_policy_optim.step()

        # Automatic temperature tuning for the exploratory policy
        ex_alpha_loss = torch.Tensor([0.0])
        if settings.automatic_ex_entropy_tuning:
            ex_alpha_loss = -(self.ex_log_alpha *
                              (ex_log_pi + self.ex_target_entropy).detach()).mean()
            self.ex_alpha_optim.zero_grad()
            ex_alpha_loss.backward()
            self.ex_alpha_optim.step()
            p.ex_alpha = self.ex_log_alpha.exp().item()

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), kl_div.mean().item()
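Both update methods lean on running-statistics normalizers (self.obs_normalizer, self.ex_rewards_normalizer) whose definition is not part of this excerpt. A minimal sketch of such a helper, assuming only the update/normalize interface used above, might look like the following; the class name RunningNormalizer and its internals are illustrative, not the original implementation.

import numpy as np

class RunningNormalizer:
    """Tracks a running mean/variance and normalizes inputs with them."""

    def __init__(self, size, eps=1e-8):
        self.mean = np.zeros(size, dtype=np.float64)
        self.var = np.ones(size, dtype=np.float64)
        self.count = eps  # avoids division by zero before the first update

    def update(self, x):
        x = np.atleast_2d(np.asarray(x, dtype=np.float64))
        batch_mean, batch_var, batch_count = x.mean(0), x.var(0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        # Parallel mean/variance update (Chan et al.)
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total

    def normalize(self, x):
        return (np.asarray(x) - self.mean) / np.sqrt(self.var + 1e-8)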
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    # Only two of the observation dimensions are fed to the network,
    # which makes the task partially observable.
    num_inputs = 2
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    for e in range(30000):
        done = False
        state_series = deque(maxlen=sequence_length)
        next_state_series = deque(maxlen=sequence_length)
        score = 0
        state = env.reset()
        state = state_to_partial_observability(state)
        state = torch.Tensor(state).to(device)

        while not done:
            steps += 1
            state_series.append(state)
            action = get_action(state_series, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)
            next_state = state_to_partial_observability(next_state)
            next_state = torch.Tensor(next_state).to(device)
            # Keep the next-state sequence aligned with the state sequence.
            next_state_series.append(next_state)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            if len(state_series) >= sequence_length:
                memory.push(state_series, next_state_series, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer, batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        if running_score == 0:
            running_score = score
        else:
            running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
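This DRQN-style main() assumes two helpers that are defined elsewhere: state_to_partial_observability and update_target_model. A plausible sketch, assuming CartPole observations and that partial observability comes from hiding the two velocity components (consistent with num_inputs = 2):

def state_to_partial_observability(state):
    # Keep only cart position and pole angle; hiding the velocities turns the
    # task into a POMDP that the recurrent Q-network has to solve from history.
    return state[[0, 2]]

def update_target_model(online_net, target_net):
    # Hard copy of the online weights into the target network.
    target_net.load_state_dict(online_net.state_dict())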
class Learner:
    def __init__(self, n_actors, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # paths
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model', 'target_net.pt')

        # memory
        self.memory_size = 500000
        self.batch_size = 128
        self.memory_load_interval = 10
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_save_interval = 50
        self.target_update_interval = 1000
        self.net = QNet(self.net_path, self.device).to(self.device)
        self.target_net = QNet(self.target_net_path, self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.net.save()
        self.target_net.save()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)

    def run(self):
        while True:
            if self.replay_memory.size > self.initial_exploration:
                self.train()
            # Housekeeping (target sync, checkpointing, loading actor memories) runs
            # every iteration; the replay memory can only fill through interval().
            self.interval()

    def train(self):
        batch, index, weights = self.replay_memory.sample(self.device)
        # (importance-sampling weights are returned but not applied to the loss here)

        # q_value of the taken actions
        q_value = self.net(batch['state'])
        q_value = q_value.gather(1, batch['action'])

        # target q_value: the online net selects the action, the target net evaluates it
        with torch.no_grad():
            next_action = torch.argmax(self.net(batch['next_state']), 1).view(-1, 1)
            next_q_value = self.target_net(batch['next_state']).gather(1, next_action)
        target_q_value = batch['reward'] + (
            self.gamma**self.bootstrap_steps) * next_q_value * (1 - batch['done'])

        # update
        self.optim.zero_grad()
        loss = torch.mean(0.5 * (q_value - target_q_value)**2)
        loss.backward()
        self.optim.step()

        # new priorities from the absolute TD errors
        priority = (np.abs(
            (q_value - target_q_value).detach().cpu().numpy()).reshape(-1) +
                    self.priority_epsilon)**self.alpha
        self.replay_memory.update_priority(index, priority)

    def interval(self):
        self.n_epochs += 1
        if self.n_epochs % self.target_update_interval == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if self.n_epochs % self.net_save_interval == 0:
            self.net.save()
            self.target_net.save()
        if self.n_epochs % self.memory_load_interval == 0:
            for i in range(self.n_actors):
                self.replay_memory.load(self.memory_path, i)
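Because the learner discounts the bootstrap term by gamma ** self.bootstrap_steps, batch['reward'] presumably already holds a 3-step return computed on the actor side. A sketch of that computation, assuming a plain list of per-step rewards; the helper name n_step_reward is illustrative, not part of the original code.

def n_step_reward(rewards, gamma=0.99, n=3):
    # Discounted sum r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1};
    # the learner then bootstraps with gamma**n from the state n steps later.
    return sum((gamma ** i) * r for i, r in enumerate(rewards[:n]))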
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_action(state, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net, target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer, batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.format(
                e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
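In this loop, beta anneals the importance-sampling correction of prioritized replay toward 1. Inside Memory_With_TDError.sample the per-sample weights are presumably formed along these lines; this is a sketch under the usual PER formula, with hypothetical helper and argument names.

import numpy as np

def sampling_weights(td_errors, batch_indices, beta, alpha=0.6, eps=1e-6):
    # P(i) proportional to |delta_i|^alpha; w_i = (N * P(i))^(-beta), normalized by max w.
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    n = len(td_errors)
    weights = (n * probs[batch_indices]) ** (-beta)
    return weights / weights.max()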
class Algo:
    def __init__(self):
        # Create the environment
        self.env = gym.make(settings.env_name)
        self.env.seed(settings.seed)
        self.env.action_space.seed(settings.seed)
        self.state_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]
        self.obs_normalizer = Normalizer(self.state_space)
        self.device = torch.device(settings.device)
        self.writer = SummaryWriter(
            'runs/' + settings.env_name + "_" + settings.algo +
            '_{}_{}_{}'.format(p.alpha, p.ex_alpha, settings.seed))

        # Initialize the common (exploitory) networks and their optimizers
        self.exploitory_policy = GaussianPolicy(self.state_space,
                                                self.action_space).to(self.device)
        self.exploitory_Q = QNet(self.state_space, self.action_space).to(self.device)
        self.exploitory_Q_target = QNet(self.state_space, self.action_space).to(self.device)
        self.exploitory_policy_optim = Adam(self.exploitory_policy.parameters(), lr=p.lr)
        self.exploitory_Q_optim = Adam(self.exploitory_Q.parameters(), lr=p.lr)
        self.target_update(self.exploitory_Q_target, self.exploitory_Q, 1.0)

        p.alpha = torch.Tensor([p.alpha]).to(self.device)
        if settings.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=p.lr)

        if settings.automatic_ex_entropy_tuning:
            self.ex_target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(self.device)).item()
            self.ex_log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            # Optimize the exploratory temperature's own log-alpha.
            self.ex_alpha_optim = Adam([self.ex_log_alpha], lr=p.lr)

        if settings.reward_model == 'novelty':
            self.ex_reward_model = Novelty(self.state_space, self.device)

    def target_update(self, target, source, tau=p.tau):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def update_exploitory_policy(self, memory):
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            p.exploitory_batch_size)
        state_batch = self.obs_normalizer.normalize(state_batch)
        next_state_batch = self.obs_normalizer.normalize(next_state_batch)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.exploitory_policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.exploitory_Q_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target, qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * min_qf_next_target

        # Critic update
        qf1, qf2 = self.exploitory_Q(state_batch, action_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.exploitory_Q_optim.zero_grad()
        qf_loss.backward()
        self.exploitory_Q_optim.step()

        # Actor update
        pi, log_pi, _ = self.exploitory_policy.sample(state_batch)
        qf1_pi, qf2_pi = self.exploitory_Q(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.exploitory_policy_optim.zero_grad()
        policy_loss.backward()
        self.exploitory_policy_optim.step()

        # Automatic temperature tuning
        alpha_loss = torch.Tensor([0.0])
        if settings.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            p.alpha = self.log_alpha.exp().item()

        ex_reward_model_loss = self.ex_reward_model.update(memory)

        return (qf1_loss.item(), qf2_loss.item(), policy_loss.item(),
                alpha_loss.item(), p.alpha, ex_reward_model_loss)

    def test_current_policy(self):
        avg_reward = 0
        avg_steps = 0
        avg_ex_rewards = 0
        for episode in range(p.testing_episodes):
            episode_steps = 0
            state = self.env.reset()
            episode_rewards = 0
            episode_ex_rewards = 0
            done = False
            while not done:
                episode_steps += 1
                norm_state = self.obs_normalizer.normalize(state)
                action = self.select_action(norm_state,
                                             self.exploitory_policy,
                                             evaluate=True)
                next_state, reward, done, _ = self.env.step(action)
                episode_rewards += reward
                state = next_state
            avg_reward += episode_rewards
            avg_ex_rewards += episode_ex_rewards
            avg_steps += episode_steps

        avg_reward = avg_reward / p.testing_episodes
        avg_ex_rewards = avg_ex_rewards / p.testing_episodes
        avg_steps = avg_steps / p.testing_episodes
        return avg_reward, avg_steps

    def select_action(self, state, policy, evaluate=False):
        with torch.no_grad():
            try:
                state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
                if evaluate is False:
                    action, log_prob, _ = policy.sample(state)
                else:
                    _, log_prob, action = policy.sample(state)
                return action.cpu().numpy()[0]
            except:
                # state is already a tensor on the correct device
                state = state.unsqueeze(0)
                if evaluate is False:
                    action, log_prob, _ = policy.sample(state)
                else:
                    _, log_prob, action = policy.sample(state)
                return action

    def log(self, data):
        for key in data.keys():
            if key != "total_numsteps":
                self.writer.add_scalar(key.split('_')[-1] + "/" + key, data[key],
                                       data['total_numsteps'])
        print("Total number of Steps: {} \t Average reward per episode: {}".format(
            data['total_numsteps'], round(data['average_rewards'], 1)))

    def start(self):
        raise NotImplementedError
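Two pieces referenced by Algo sit outside this excerpt: the Novelty intrinsic-reward model (used through get_reward/update) and a concrete subclass that implements start(). The following is a hypothetical sketch of a forward-prediction novelty model exposing that interface, using next-state prediction error as the intrinsic reward; it illustrates the assumed API, not the original implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

class Novelty:
    def __init__(self, state_space, device, lr=3e-4):
        self.device = device
        self.predictor = nn.Sequential(
            nn.Linear(state_space, 128), nn.ReLU(),
            nn.Linear(128, state_space)).to(device)
        self.optim = Adam(self.predictor.parameters(), lr=lr)

    def get_reward(self, state, next_state):
        # Intrinsic reward = per-sample prediction error of the next state.
        with torch.no_grad():
            pred = self.predictor(state)
            return ((pred - next_state) ** 2).mean(dim=1)

    def update(self, memory, batch_size=256):
        # Train the predictor on replayed transitions; its loss is logged as
        # ex_reward_model_loss in the training loop above.
        state, _, _, next_state, _ = memory.sample(batch_size)
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        loss = F.mse_loss(self.predictor(state), next_state)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        return loss.item()

A concrete subclass would then implement start() with the training loop shown earlier in this section: collect transitions, call update_exploitory_policy and update_exploratory_policy, and pass the resulting losses to self.log.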