def train(policy, save_name, load_count=0, summarize=True, load_path=None, log_path='./logs'):
    # Minigrid maze env
    env_name = "MiniGrid-BlockMaze-v0"

    def make_env(env_name):
        return lambda: gym_minigrid.wrappers.PadImgObsWrapper(gym.make(env_name))

    envs = [make_env(env_name) for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
                                        ac_space, policy, summarize)
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)

        dones = [False for _ in range(N_ENVS)]
        nbatch = N_ENVS * N_STEPS

        episode_rewards = np.zeros((N_ENVS, ))
        final_rewards = np.zeros((N_ENVS, ))

        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
            # mb stands for minibatch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []

            for n in range(N_STEPS):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)
                # print(obs[0:3, :, :, 0])

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            # batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discount/bootstrap off the value fn
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0], GAMMA)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, GAMMA)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update, summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % LOG_INTERVAL == 0 or update == 1:
                print('%i): %.4f, %.4f, %.4f' % (update, policy_loss, value_loss, policy_entropy))
                print(final_rewards.mean())

            if update % SAVE_INTERVAL == 0:
                print('Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
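# The A2C loops in this file call discount_with_dones, which is not defined in these fragments.
# A minimal sketch of that helper, assuming it mirrors the OpenAI baselines version
# (discounted returns that are cut off at episode boundaries):
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    # walk the rollout backwards, zeroing the running return whenever an episode ended
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]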
max_frames = 12000
max_steps = 500
frame_idx = 0
rewards = []
batch_size = 128

while frame_idx < max_frames:
    state = envs.reset()
    ou_noise.reset()
    episode_reward = 0

    for step in range(max_steps):
        action = policy_net.get_action(state)
        action = ou_noise.get_action(action, step)
        # print(action)
        next_state, reward, done, _ = envs.step(action)

        replay_buffer.push(state, action, reward, next_state, done)
        if len(replay_buffer) > batch_size:
            ddpg_update(batch_size)

        state = next_state
        episode_reward += reward
        frame_idx += 1

        if frame_idx % max(1000 * NUM_PROCESS, max_steps + 1) == 0:
            if rewards:
                print(frame_idx, rewards[-1])
            # plot(frame_idx, rewards)

torch.save(policy_net.state_dict(), "DDPG_original_pendulum_weight.pth")
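# The DDPG loop above relies on an ou_noise object with reset() and get_action(action, step).
# A minimal Ornstein-Uhlenbeck noise sketch with that interface; the parameter values and the
# Pendulum action bounds (-2, 2) are assumptions, not taken from the original code.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_dim
        self.reset()

    def reset(self):
        # restart the noise process at the mean
        self.state = np.ones(self.action_dim) * self.mu

    def get_action(self, action, t=0):
        # one Euler step of the OU process
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        # anneal sigma so exploration shrinks over the episode
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + self.state, -2.0, 2.0)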
    # prevent the policy from becoming exactly 0 or 1, which helps exploration;
    # add 1.e-10 inside the log to avoid log(0), which gives nan
    entropy = -(new_probs * torch.log(old_probs + 1.e-10) +
                (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))

    return torch.mean(clipped_surrogate.add(entropy.mul(beta)))


model = ActorCritic().to(device)  # returns dist, v
if args.load_weight:
    model.load_state_dict(
        torch.load(f'PongDeterministic-v4_{load_weight_n}.pth'))
optimizer = optim.Adam(model.parameters(), lr=lr)

f1 = envs.reset()
f2 = envs.step([0] * num_envs)

if __name__ == "__main__":
    while not early_stop and frame_idx < max_frames:
        frame_idx += 1
        print(frame_idx)
        if frame_idx % 100 == 0:
            num_steps += args.additional_num_step

        log_probs, states, actions, rewards, next_state, masks, values = collect_trajectories(
            envs, model, num_steps)

        scores = np.asarray(rewards).sum(axis=0)
        scores_list.append(scores.mean())
        print("Mean:", scores.mean(), "\nRaw:", scores)

        # stop if any of the trajectories is done
        # we want all the lists to be rectangular
while frame_idx < max_frames and not early_stop:
    i_update += 1

    values = []
    obs = []
    acs = []
    rewards = []
    masks = []
    entropy = 0

    for current_step in range(num_steps):
        # print("  Current Step: {0}".format(current_step))
        ac = ppo.get_action(ob)
        next_ob, _, done, _ = envs.step(ac)

        # the reward comes from the discriminator, not from the environment
        reward = [discriminator.get_reward(np.concatenate([ob, ac], axis=1))]
        # f.write(str(reward) + '\n')
        # print(reward)
        value = ppo.get_value(ob)

        values.append(value)
        rewards.append(reward)   # [:, np.newaxis])
        masks.append(1 - done)   # [:, np.newaxis])
        obs.append(ob)
        acs.append(ac)

        ob = next_ob
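# discriminator.get_reward above is not shown in this fragment. In GAIL-style training the
# reward is usually derived from the discriminator output D(s, a); the sketch below is one
# common choice, r = -log(1 - D(s, a)), with a hypothetical network architecture.
import torch
import torch.nn as nn

class Discriminator(nn.Module):
    def __init__(self, state_action_dim, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_action_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1), nn.Sigmoid())

    def forward(self, state_action):
        # probability that (s, a) comes from the expert data
        return self.net(state_action)

    def get_reward(self, state_action):
        with torch.no_grad():
            d = self.forward(torch.FloatTensor(state_action))
            # surrogate reward that grows as the policy fools the discriminator
            return -torch.log(1.0 - d + 1e-8).squeeze(-1).numpy()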
# actor1 acts in all parallel envs
action_p1 = agent1.act(make_cuda(state)).squeeze(1).cpu().numpy()
# actor2 acts in all parallel envs
action_p2 = agent2.act(make_cuda(state)).squeeze(1).cpu().numpy()

# separate actions
action_tuples = []
for i in range(num_envs):
    actions = []
    actions.append(action_p1[i])  # player1
    actions.append(action_p2[i])  # player2
    action_tuples.append(actions)

# pass actions to environments
next_observation, reward, finished, _ = envs.step(action_tuples)

# separate rewards
reward1 = []
reward2 = []
for i in range(num_envs):
    reward1.append(reward[i][player0])  # player1
    reward2.append(reward[i][player1])  # player2
reward1 = torch.FloatTensor(reward1).unsqueeze(1)  # player1
reward2 = torch.FloatTensor(reward2).unsqueeze(1)  # player2

episode_rewards1 += reward1  # player1
episode_rewards2 += reward2  # player2

finished_masks = torch.FloatTensor(1 - np.array(finished)).unsqueeze(1)
all_rewards = []
all_losses = []

state = envs.reset()
state = torch.FloatTensor(np.float32(state))
rollout.states[0].copy_(state)

episode_rewards = torch.zeros(num_envs, 1)
final_rewards = torch.zeros(num_envs, 1)

for i_update in range(num_batch):
    for step in range(num_steps):
        action = actor_critic.act(Variable(state))

        next_state, reward, done, _ = envs.step(action.squeeze(1).cpu().data.numpy())
        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward

        masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        if USE_CUDA:
            masks = masks.cuda()

        state = torch.FloatTensor(np.float32(next_state))
        rollout.insert(step, state, action.data, reward, masks)

    _, next_value = actor_critic(
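# The rollout object used above (rollout.states[0].copy_(...), rollout.insert(...)) is assumed
# to be a fixed-length rollout buffer. A minimal sketch consistent with those two calls; any
# field beyond states/actions/rewards/masks is an assumption.
import torch

class RolloutStorage:
    def __init__(self, num_steps, num_envs, obs_shape):
        self.states = torch.zeros(num_steps + 1, num_envs, *obs_shape)
        self.actions = torch.zeros(num_steps, num_envs, 1).long()
        self.rewards = torch.zeros(num_steps, num_envs, 1)
        self.masks = torch.ones(num_steps + 1, num_envs, 1)

    def insert(self, step, state, action, reward, mask):
        # store the transition produced at `step`; the resulting state goes to slot step + 1
        self.states[step + 1].copy_(state)
        self.actions[step].copy_(action)
        self.rewards[step].copy_(reward)
        self.masks[step + 1].copy_(mask)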
envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])

net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
agent = Model(net, 2).to(device)
solver = optim.Adam(agent.parameters())
memory = Memory(replay_memory_capacity)

eps = 1.0
duration = []
frame_count = 0
lifespan = [[0] for _ in range(num_envs)]
s_gotten = None

while frame_count < max_frame:
    s = envs.reset() if s_gotten is None else s_gotten
    preprocessed_s = torch.FloatTensor(s)
    a = agent.response(preprocessed_s, eps)
    s_gotten, r, done, _ = envs.step(a)

    for i in range(num_envs):
        lifespan[i][-1] += 1
        if done[i]:
            if lifespan[i][-1] < 500:
                r[i] = PENALTY
                memory.push(s[i], a[i], r[i], s_gotten[i], done[i])
            duration.append(lifespan[i][-1])
            lifespan[i].append(0)
        if lifespan[i][-1] > 0:  # skip transitions from episodes that ended at the 500-step cap
            memory.push(s[i], a[i], r[i], s_gotten[i], done[i])

    if frame_count > initial_exploration:
        eps -= 0.00005
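# Memory above is a plain replay buffer; a minimal sketch matching the push(...) call in the
# loop. The sample() signature is an assumption added for completeness.
import random
from collections import deque

class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniform sampling of past transitions, returned column-wise
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(list, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)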
obs_gotten = None

while frame_count < max_frame:
    cache = {'obs': [], 'acts': [], 'rews': [], 'dones': []}
    probs_cache = {'mu': [], 'sig': []}

    for _ in range(n_steps):
        obs = envs.reset() if obs_gotten is None else obs_gotten
        obs_in = torch.FloatTensor(obs).to(device)

        mu, sig = actor(obs_in)
        with torch.no_grad():
            a = Normal(mu, sig).sample()
            a.clamp_(-2.0 + 1e-7, 2.0 - 1e-7)

        obs_gotten, rews, dones, _ = envs.step(a)

        for i in range(num_envs):
            rewards[i][-1] += rews[i]
            if dones[i]:
                global_rewards.append(rewards[i][-1])
                rewards[i].append(0.)

        cache['obs'].append(obs)
        cache['acts'].append(a)
        cache['rews'].append(rews * 0.1)
        cache['dones'].append(dones)
        probs_cache['mu'].append(mu)
        probs_cache['sig'].append(sig)
log_probs = []
values = []
states = []
actions = []
rewards = []
masks = []
entropy = 0

for _ in range(num_steps):
    state = torch.FloatTensor(state).to(device)
    dist, value = model(state)
    functional.reset_net(model)

    action = dist.sample()
    next_state, reward, done, _ = envs.step(torch.max(action, 1)[1].cpu().numpy())

    log_prob = dist.log_prob(action)
    entropy += dist.entropy().mean()

    log_probs.append(log_prob)
    values.append(value)
    rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
    masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

    states.append(state)
    actions.append(action)

    state = next_state
    step_idx += 1
def train(env_fn=None, spectrum=False, a2c_arch=None, nenvs=16, nsteps=100,
          max_iters=1e6, gamma=0.99, pg_coeff=1.0, vf_coeff=0.5, ent_coeff=0.01,
          max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, log_interval=100,
          summarize=True, load_path=None, log_path=None, cpu_cores=1):
    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set a fixed random seed for the environments
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        actor_critic = ActorCritic(sess, a2c_arch, ob_space, ac_space, pg_coeff,
                                   vf_coeff, ent_coeff, max_grad_norm, lr, alpha,
                                   epsilon, summarize)

        load_count = 0
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (-1, nw, nh, nc)

        dones = [False for _ in range(nenvs)]
        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        print('a2c Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))

        for i in tqdm(range(load_count + 1, int(max_iters) + 1), ascii=True, desc='ActorCritic'):
            # Create the minibatch lists
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_depth = [], [], [], [], [], []
            total_reward = 0

            for n in range(nsteps):
                # Get the actions and values from the actor critic; neglogp is unused here
                actions, values, neglogp = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, info = envs.step(actions)
                total_reward += np.sum(rewards)

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)
                mb_depth.append(
                    np.array([info_item['scramble_depth'] for info_item in info]))

            mb_dones.append(dones)

            # Convert batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
            mb_depth = np.asarray(mb_depth, dtype=np.int32).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # Discount/bootstrap off the value function
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0], gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            # Flatten the whole minibatch
            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()
            mb_depth = mb_depth.flatten()

            # Save the information to tensorboard
            if summarize:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, mb_depth, i, summary_op)
                writer.add_summary(summary, i)
            else:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, mb_depth, i)

            if i % log_interval == 0:
                actor_critic.save(log_path, i)

        actor_critic.save(log_path, 'final')
        print('a2c model is finished training')
class Ppo:
    def __init__(self, numOfEnvs):
        self.testRewards = []
        # self.num_envs = 16
        # self.num_envs = numOfEnvs
        self.num_envs = 6
        self.env_name = "Pendulum-v0"
        self.env = gym.make(self.env_name)

        self.envs = [self.make_env() for i in range(self.num_envs)]
        self.envs = SubprocVecEnv(self.envs)

        self.num_inputs = self.envs.observation_space.shape[0]
        self.num_outputs = self.envs.action_space.shape[0]

        # Hyper params:
        self.hidden_size = 256
        self.lr = 3e-3

        self.model = ActorCritic(self.num_inputs, self.num_outputs, self.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    def make_env(self):
        def _thunk():
            env = gym.make(self.env_name)
            return env
        return _thunk

    # def compute_gae(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    def compute_gae(self, next_value, rewards, masks, values, g, t):
        gamma = float(g)
        tau = float(t)
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], \
                  returns[rand_ids, :], advantage[rand_ids, :]

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs,
                   returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.model(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        return loss

    def plot(self, frame_idx, rewards):
        clear_output(True)
        plt.figure(figsize=(20, 5))
        plt.subplot(131)
        plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
        plt.plot(rewards)
        plt.show()
        # plt.savefig("{0}/{1}_rewardGraph.png".format(saveGraphPath, frame_idx))

    def test_env(self, vis=False):
        state = self.env.reset()
        if vis:
            self.env.render()
        done = False
        total_reward = 0
        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = self.model(state)
            next_state, reward, done, _ = self.env.step(dist.sample().cpu().numpy()[0])
            state = next_state
            if vis:
                self.env.render()
            total_reward += reward
        return total_reward

    def main(self, inputVals):
        gam = inputVals[0]
        lam = inputVals[1]
        print("Gam: ", gam)
        print("Lam: ", lam)

        num_inputs = self.envs.observation_space.shape[0]
        num_outputs = self.envs.action_space.shape[0]

        # Hyper params:
        # hidden_size = 256
        # lr = 3e-3
        num_steps = 20
        mini_batch_size = 5
        ppo_epochs = 4
        threshold_reward = -200

        # model = a.ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
        # optimizer = optim.Adam(self.model.parameters(), lr=lr)

        max_frames = 12000
        # max_frames = 2000
        frame_idx = 0
        self.test_rewards = []

        state = self.envs.reset()
        early_stop = False

        while frame_idx < max_frames and not early_stop:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = 0

            for _ in range(num_steps):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)

                action = dist.sample()
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                if frame_idx % 1000 == 0:
                    test_reward = np.mean([self.test_env() for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    self.plot(frame_idx, self.test_rewards)
                    if test_reward > threshold_reward:
                        early_stop = True
                    print("rewards: ", test_reward)

            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values, gam, lam)

            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values

            lastLoss = self.ppo_update(ppo_epochs, mini_batch_size, states, actions,
                                       log_probs, returns, advantage)
            # print("loss: ", [lastLoss])
            # re = rewards[-1].cpu()
            # print("RE: ", np.asarray(re))
            # return np.asarray(re)

        return lastLoss.item()
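# Hypothetical usage of the Ppo class above: instantiate it and run one full training pass
# for a given (gamma, lambda) pair; the argument values are illustrative only.
if __name__ == "__main__":
    ppo = Ppo(numOfEnvs=6)
    final_loss = ppo.main([0.99, 0.95])
    print("final loss:", final_loss)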
# Store the current state in rollouts, the object used for the advantage (n-step) update,
# as its first observation
rollouts.observations[0].copy_(current_obs)

# Main loop
for j in tqdm(range(NUM_UPDATES)):
    # Repeat for each step of the advantage (n-step) horizon
    for step in range(NUM_ADVANCED_STEP):
        # Decide the action
        with torch.no_grad():
            action = actor_critic.act(rollouts.observations[step])

        cpu_actions = action.squeeze(1).cpu().numpy()  # convert the tensor to a NumPy array

        # Run one step in all parallel envs; the returned obs has shape (16, 1, 84, 84)
        obs, reward, done, info = envs.step(cpu_actions)

        # Convert the rewards to a tensor and add them to the running episode reward,
        # reshaping from (16,) to (16, 1)
        reward = np.expand_dims(np.stack(reward), 1)
        reward = torch.from_numpy(reward).float()
        episode_rewards += reward

        # For each process: 0 if done is True, 1 if done is False
        masks = torch.FloatTensor(
            [[0.0] if done_ else [1.0] for done_ in done])

        # Update the total reward of the last episode:
        # multiply by 0 (reset) when done is True and by 1 when it is False,
        # then add episode_rewards where done is True and 0 where it is False
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
state = envs.reset()

while frame_idx < max_frames:
    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        state = next_state
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            # plot(frame_idx, test_rewards)
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    model = ActorCritic(num_inputs, num_outputs, hidden_size, hd2_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 10000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        print(f'\rframe: {frame_idx}\t loss: {loss}', end='')

        if frame_idx % 100 == 0:
            rewards, scores = map(list, zip(*(test_env(model, False) for _ in range(10))))
            avg_rewards = np.mean(rewards)
            avg_scores = np.mean(scores)
            print(f'\rframe: {frame_idx}\t avg_rewards: {avg_rewards:.2f}\t avg_scores: {avg_scores:.2f}\t loss: {loss}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # run a few rendered evaluation episodes once training finishes
    [test_env(model, True) for _ in range(10)]
    envs.close()
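# compute_returns in main() is assumed to be the usual masked n-step discounted return that
# bootstraps from next_value; the gamma default below is an assumption.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    # accumulate the discounted return backwards, cutting it at episode boundaries via masks
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns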
duration = []
frame_count = 0
lifespan = [[0] for _ in range(num_envs)]
s_gotten = None

while frame_count * n_step < max_frame:
    obs_l, acts_l, rews_l, dones_l, probs_l = [], [], [], [], []
    accept_sample = [True for _ in range(num_envs)]

    for _ in range(n_step):
        obs = envs.reset() if s_gotten is None else s_gotten
        obs_in = torch.FloatTensor(obs).to(device)

        prob = actor(obs_in)
        with torch.no_grad():
            a = prob.multinomial(num_samples=1)

        s_gotten, rews, dones, _ = envs.step(a.view(-1).numpy())

        for i in range(num_envs):
            lifespan[i][-1] += 1
            if dones[i]:
                if lifespan[i][-1] < 500:
                    rews[i] = PENALTY
                else:  # episode ended at the 500-step cap
                    accept_sample[i] = False
                print(lifespan[i][-1], critic(obs_in[[i], :]).view(-1).item())
                duration.append(lifespan[i][-1])
                lifespan[i].append(0)

        obs_l.append(obs)
        acts_l.append(a)
while not d:
    print('-------------------------------------------------')
    print('Current Observation')
    envs.render(0)
    time.sleep(0.1)

    a, v, neg = actor_critic.act(obs, stochastic=True)
    print('')
    print('action: ', actions[a[0]])
    print('value: ', v)
    print('neglogp: ', neg)
    print('pd: ')
    for ac, pd in zip(actions, actor_critic.step_model.logits(obs)[0][0]):
        print('\t', ac, pd)

    obs, r, d, sbo = envs.step(a)
    print('r: ', r)
    envs.render(0)
    time.sleep(0.1)

    if not d:
        im = plt.imshow(cube_gym.onehotToRGB(obs[0]))
        ims.append([im])
    else:
        print('DONE')
        im = plt.imshow(cube_gym.onehotToRGB(sbo[0]))
        ims.append([im])

    d = d[0]

print(r)
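# The ims list above collects one artist per frame; a hedged follow-up that turns those
# frames into an animation. The figure handle and output filename are assumptions.
import matplotlib.animation as animation
import matplotlib.pyplot as plt

ani = animation.ArtistAnimation(plt.gcf(), ims, interval=500, blit=True, repeat_delay=1000)
ani.save('rollout.gif', writer='pillow')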