# NOTE: the project-local imports below (settings, the hyperparameter module
# p, Envs, ReplayBuffer, Policy, QNetwork, Logger) are assumed module paths;
# adjust them to this repository's actual layout.
import torch
import torch.nn.functional as F
from torch.optim import Adam

import settings
import parameters as p
from envs import Envs
from logger import Logger
from memory import ReplayBuffer
from networks import Policy, QNetwork


class Trainer:

    def __init__(self):
        # Prepare the parallel environments, replay buffer and networks.
        self.envs = Envs()
        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)

        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        # tau=1.0 hard-copies the critic weights into the target network.
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()

    def start(self):
        self.total_numsteps = 0
        if settings.mode == "train":
            self.add_random_steps()
            names = torch.FloatTensor(
                [i for i, _ in enumerate(settings.env_names)]).to(self.device)
            while self.total_numsteps < p.max_numsteps:
                self.run_test()
                leg_starts, states = self.envs.reset()
                for step in range(p._max_episode_steps):
                    self.total_numsteps += 1
                    actions = self.select_action(leg_starts, states, names)
                    next_states, rewards, dones = self.envs.step(actions)
                    self.memory.push(names, leg_starts, states, next_states,
                                     actions, rewards, dones)
                    # Environments that finished are reset in place.
                    states = self.envs.reset_dones(next_states, dones)
                    c1_loss, c2_loss, policy_loss = self.update_nets()
                    if (self.total_numsteps % 10) == 0:
                        self.logger.show_update(self.total_numsteps)
                torch.save(self.policy.state_dict(),
                           "policy_seed_{}".format(settings.seed))
        else:
            print("Seed: {}".format(settings.seed))
            self.run_test()

    def run_test(self):
        if settings.mode == "test":
            print("\nTesting current policy")
        leg_starts, states = self.envs.reset()
        # done_filter masks out rewards from environments that have already
        # finished; epsd_rewards accumulates per-environment episode returns.
        done_filter = torch.FloatTensor(
            [1.0] * len(settings.env_names)).to(self.device)
        epsd_rewards = torch.FloatTensor(
            [0.0] * len(settings.env_names)).to(self.device)
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        for step in range(p._max_episode_steps):
            actions = self.select_action(leg_starts, states, names,
                                         evaluate=True)
            next_states, rewards, dones = self.envs.step(actions)
            epsd_rewards += done_filter * rewards
            done_filter *= (dones != 1).float()
            states = next_states
        self.logger.add_rewards(len(names), epsd_rewards, self.total_numsteps)
        self.logger.save()

    def add_random_steps(self):
        # Seed the replay buffer with random actions before training starts.
        print("Adding random steps")
        leg_starts, states = self.envs.reset()
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        while len(self.memory) <= p.batch_size * 10:
            actions = self.envs.sample_actions()
            next_states, rewards, dones = self.envs.step(actions)
            self.memory.push(names, leg_starts, states, next_states, actions,
                             rewards, dones)
            states = self.envs.reset_dones(next_states, dones)

    def select_action(self, leg_starts, states, names, evaluate=False):
        # Sample a stochastic action during training; use the deterministic
        # mean action during evaluation.
        with torch.no_grad():
            if not evaluate:
                actions, _, _ = self.policy.sample(leg_starts, states, names)
            else:
                _, _, actions = self.policy.sample(leg_starts, states, names)
            return actions.cpu()

    def parameter_update(self, tau=p.tau):
        # Polyak-average the critic weights into the target critic.
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_nets(self):
        (names_batch, leg_starts_batch, state_batch, action_batch,
         reward_batch, next_state_batch, mask_batch) = self.memory.sample()
        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)

        # Build the soft Bellman target without tracking gradients.
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                leg_starts_batch, next_state_batch, names_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                leg_starts_batch, next_state_batch, next_state_action,
                names_batch)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * min_qf_next_target

        # Critic update: both Q-networks regress onto the same target.
        qf1, qf2 = self.critic(leg_starts_batch, state_batch, action_batch,
                               names_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Policy update: maximise the entropy-regularised minimum Q-value.
        pi, log_pi, _ = self.policy.sample(leg_starts_batch, state_batch,
                                           names_batch)
        qf1_pi, qf2_pi = self.critic(leg_starts_batch, state_batch, pi,
                                     names_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # Soft-update the target critic after each gradient step.
        self.parameter_update()
        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()
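# A minimal entry point for the trainer above; this runner block is a sketch,
# assuming `settings.mode` ("train" or "test") is configured before launch:

if __name__ == "__main__":
    trainer = Trainer()
    trainer.start()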
# Load World
# ----------
import gym
import numpy as np
import torch

LEFT = 5
RIGHT = 4

env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

# Load Agent
# ----------
from agent import Policy
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = Policy().to(device)
optimizer = optim.Adam(agent.parameters(), lr=1e-4)

# Load Parallel Environment
# -------------------------
from pong_utils import parallelEnv, preprocess_batch

envs = parallelEnv('PongDeterministic-v4', n=4, seed=12345)


def collect_trajectories(envs, agent, tmax=200, nrand=5):
    '''Collect trajectories of multiple agents of a parallelized environment.'''
    n = len(envs.ps)
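    # The rest of this function is truncated in the original listing; the
    # body below is a sketch of a plausible completion, assuming the
    # two-frame preprocessing and the RIGHT/LEFT convention defined above,
    # and that `agent` outputs the probability of moving RIGHT.
    state_list, reward_list, prob_list, action_list = [], [], [], []

    envs.reset()
    # Start all parallel games with FIRE, then take a few random steps so
    # the environments are decorrelated.
    envs.step([1] * n)
    for _ in range(nrand):
        fr1, re1, _, _ = envs.step(np.random.choice([RIGHT, LEFT], n))
        fr2, re2, _, _ = envs.step([0] * n)

    for t in range(tmax):
        # Stack two consecutive frames so the policy can infer ball velocity.
        batch_input = preprocess_batch([fr1, fr2])
        probs = agent(batch_input).squeeze().cpu().detach().numpy()
        action = np.where(np.random.rand(n) < probs, RIGHT, LEFT)
        probs = np.where(action == RIGHT, probs, 1.0 - probs)

        # Advance two frames per decision (the second step is a no-op).
        fr1, re1, is_done, _ = envs.step(action)
        fr2, re2, is_done, _ = envs.step([0] * n)

        state_list.append(batch_input)
        reward_list.append(re1 + re2)
        prob_list.append(probs)
        action_list.append(action)

        if is_done.any():
            break

    return prob_list, state_list, action_list, reward_list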
# NOTE: assumed imports for this excerpt: Policy, Runner and VecToTensor are
# project-local modules; ShmemVecEnv and VecMonitor ship with OpenAI baselines.
from datetime import datetime

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from tqdm import tqdm
from baselines.common.vec_env import ShmemVecEnv, VecMonitor

from model import Policy
from runner import Runner
from wrappers import VecToTensor

# `envs` (a list of environment thunks) and the hyperparameters n_step,
# gamma, lr, alpha, epsilon, num_updates and device are defined earlier
# in the script.

# envs = SubprocVecEnv(envs)
envs = ShmemVecEnv(envs)
envs = VecToTensor(envs)

# Log episode statistics to a timestamped monitor file.
date = datetime.now().strftime('%m_%d_%H_%M')
mon_file_name = "./tmp/" + date
envs = VecMonitor(envs, mon_file_name)

# Two copies of the network: `train_policy` receives the gradient updates
# while the frozen `step_policy` generates rollouts.
train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy.load_state_dict(train_policy.state_dict())
step_policy.eval()

runner = Runner(envs, step_policy, n_step, gamma)
optimizer = optim.RMSprop(train_policy.parameters(), lr=lr, alpha=alpha,
                          eps=epsilon)

for i in tqdm(range(num_updates)):
    # Collect an n-step rollout; mb_rewards holds the bootstrapped returns.
    mb_obs, mb_rewards, mb_values, mb_actions = runner.run()

    action_logits, values = train_policy(mb_obs)

    # Advantage estimate: n-step returns minus the value baseline.
    mb_adv = mb_rewards - mb_values

    dist = Categorical(logits=action_logits)
    action_log_probs = dist.log_prob(mb_actions)
    pg_loss = torch.mean(-action_log_probs * mb_adv)
    vf_loss = F.mse_loss(values, mb_rewards)
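    # The listing stops here; a sketch of the rest of the A2C update follows.
    # vf_coef, ent_coef and max_grad_norm are assumed hyperparameters that
    # would be defined alongside lr and gamma above.
    entropy = dist.entropy().mean()
    loss = pg_loss + vf_coef * vf_loss - ent_coef * entropy

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(train_policy.parameters(), max_grad_norm)
    optimizer.step()

    # Sync the rollout policy with the freshly updated weights.
    step_policy.load_state_dict(train_policy.state_dict())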
import torch.optim as optim
import torch
import gym
from agent import Policy
from collections import deque
import numpy as np

env = gym.make('CartPole-v1')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

# Training hyperparameters.
n_episodes = 2000
max_t = 1000
gamma = 1.0
print_every = 100

scores_deque = deque(maxlen=100)
scores = []

for i_episode in range(1, n_episodes + 1):
    saved_log_probs = []
    rewards = []
    state = env.reset()

    # Roll out one full episode with the current policy.
    for t in range(max_t):
        action, log_prob = policy.act(state)
        saved_log_probs.append(log_prob)
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            break
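    # The original listing is truncated above; the remainder sketches the
    # standard REINFORCE update: accumulate the discounted return, then
    # ascend the return-weighted log-probabilities.
    scores_deque.append(sum(rewards))
    scores.append(sum(rewards))

    discounts = [gamma ** i for i in range(len(rewards) + 1)]
    R = sum([a * b for a, b in zip(discounts, rewards)])

    policy_loss = []
    for log_prob in saved_log_probs:
        # Negate because the optimizer minimises.
        policy_loss.append(-log_prob * R)
    policy_loss = torch.cat(policy_loss).sum()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    if i_episode % print_every == 0:
        print('Episode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)))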