def main(load_path, num_episode):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_env = 1
    env_id = 'Breakout-v0'

    envs = [make_env(env_id) for _ in range(n_env)]
    envs = DummyVecEnv(envs)
    envs = VecToTensor(envs)

    # Load the trained policy and switch to evaluation mode
    policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    policy.load_state_dict(torch.load(load_path, map_location=device))
    policy.eval()

    for i in tqdm(range(num_episode)):
        obs = envs.reset()
        total_rewards = 0
        while True:
            action_logits, values = policy(obs)
            actions = choose_action(action_logits)
            next_obs, rewards, dones, info = envs.step(actions)
            total_rewards += rewards
            obs = next_obs  # was missing: act on the latest observation, not the initial one
            envs.render()
            if dones:
                break
        print('-' * 20 + str(total_rewards.item()) + '-' * 20)

    envs.close()
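# A possible entry point, assuming this function lives in a standalone
# script; the flag names and the default checkpoint path are illustrative,
# not taken from the original code.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--load-path', type=str, default='a2c_breakout.pt')
    parser.add_argument('--episodes', type=int, default=10)
    args = parser.parse_args()
    main(args.load_path, args.episodes)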
def test(env_name, episodes, params, render):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    policy.load_state_dict(params)
    agent = Agent(policy)

    test_reward, test_len = 0, 0
    for ep in range(episodes):
        done = False
        observation = env.reset()
        while not done:
            # Similar to the training loop above -
            # get the action, act on the environment, save total reward
            # (evaluation=True makes the agent always return what it thinks to be
            # the best action - there is no exploration at this point)
            action, _ = agent.get_action(observation, evaluation=True)
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())
            if render:
                env.render()
            test_reward += reward
            test_len += 1
    print("Average test reward:", test_reward / episodes,
          "episode length:", test_len / episodes)
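# Hypothetical invocation of test(); the environment name and the parameter
# file below are placeholders, not values taken from the original script.
if __name__ == '__main__':
    saved_params = torch.load("LunarLanderContinuous-v2_params.mdl")
    test("LunarLanderContinuous-v2", episodes=10, params=saved_params,
         render=True)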
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    # TODO: For CartPole-v0 - maximum episode length
    env._max_episode_steps = 1000

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(args.position, agent, env,
                                 args.train_episodes, False,
                                 args.render_training)

        # Save the model, tagged with the current date and time
        tt = "{}-{}-{}".format(datetime.datetime.now().date(),
                               datetime.datetime.now().hour,
                               datetime.datetime.now().minute)
        model_file = "%s_params.mdl" % (args.env + tt + "vel")
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        # time and day of plot
        plt.savefig("train_history" + tt + "vel" + ".jpg")
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(args.position, agent, env, args.train_episodes,
             args.render_test)
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    env._max_episode_steps = args.episode_length

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(agent, env, args.train_episodes, False,
                                 args.render_training, x0=args.x0,
                                 args=args, policy=policy)

        # Save the model
        model_file = "%s_params.mdl" % args.env
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(agent, env, args.train_episodes, args.render_test, x0=args.x0)
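# A sketch of the argument parser this main() appears to expect. The
# attribute names are taken from the usage above; types and defaults are
# assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--train_episodes', type=int, default=5000)
    parser.add_argument('--episode_length', type=int, default=1000)
    parser.add_argument('--x0', type=float, default=None)
    parser.add_argument('--test', type=str, default=None,
                        help='Model file to load and test')
    parser.add_argument('--render_training', action='store_true')
    parser.add_argument('--render_test', action='store_true')
    args = parser.parse_args()
    main(args)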
class Trainer:
    def __init__(self):
        # Prepare environments, replay buffer, and networks
        self.envs = Envs()
        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)

        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        # Hard-copy the critic weights into the target network
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()

    def start(self):
        self.total_numsteps = 0
        if settings.mode == "train":
            self.add_random_steps()
            names = torch.FloatTensor(
                [i for i, _ in enumerate(settings.env_names)]).to(self.device)
            while self.total_numsteps < p.max_numsteps:
                self.run_test()
                leg_starts, states = self.envs.reset()
                for step in range(p._max_episode_steps):
                    self.total_numsteps += 1
                    actions = self.select_action(leg_starts, states, names)
                    next_states, rewards, dones = self.envs.step(actions)
                    self.memory.push(names, leg_starts, states, next_states,
                                     actions, rewards, dones)
                    states = self.envs.reset_dones(next_states, dones)
                    c1_loss, c2_loss, policy_loss = self.update_nets()
                    if (self.total_numsteps % 10) == 0:
                        self.logger.show_update(self.total_numsteps)
                torch.save(self.policy.state_dict(),
                           "policy_seed_{}".format(settings.seed))
        else:
            print("Seed: {}".format(settings.seed))
            self.run_test()

    def run_test(self):
        if settings.mode == "test":
            print("\nTesting current policy")
        leg_starts, states = self.envs.reset()
        done_filter = torch.FloatTensor(
            [1.0] * len(settings.env_names)).to(self.device)
        epsd_rewards = torch.FloatTensor(
            [0.0] * len(settings.env_names)).to(self.device)
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        for step in range(p._max_episode_steps):
            actions = self.select_action(leg_starts, states, names,
                                         evaluate=True)
            next_states, rewards, dones = self.envs.step(actions)
            # Only accumulate rewards for environments that are still running
            epsd_rewards += done_filter * rewards
            done_filter *= (dones != 1).float()
            states = next_states
        self.logger.add_rewards(len(names), epsd_rewards, self.total_numsteps)
        self.logger.save()

    def add_random_steps(self):
        print("Adding random steps")
        leg_starts, states = self.envs.reset()
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        # Fill the replay buffer with random transitions before training
        while len(self.memory) <= p.batch_size * 10:
            actions = self.envs.sample_actions()
            next_states, rewards, dones = self.envs.step(actions)
            self.memory.push(names, leg_starts, states, next_states, actions,
                             rewards, dones)
            states = self.envs.reset_dones(next_states, dones)

    def select_action(self, leg_starts, states, names, evaluate=False):
        with torch.no_grad():
            if not evaluate:
                # Stochastic action for training
                actions, _, _ = self.policy.sample(leg_starts, states, names)
            else:
                # Deterministic (mean) action for evaluation
                _, _, actions = self.policy.sample(leg_starts, states, names)
        return actions.cpu()

    def parameter_update(self, tau=p.tau):
        # Polyak-average the critic weights into the target network
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_nets(self):
        (names_batch, leg_starts_batch, state_batch, action_batch,
         reward_batch, next_state_batch, mask_batch) = self.memory.sample()
        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)

        # Soft Bellman backup for the twin critics
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                leg_starts_batch, next_state_batch, names_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                leg_starts_batch, next_state_batch, next_state_action,
                names_batch)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * min_qf_next_target

        qf1, qf2 = self.critic(leg_starts_batch, state_batch, action_batch,
                               names_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Policy update: maximize expected Q minus the entropy penalty
        pi, log_pi, _ = self.policy.sample(leg_starts_batch, state_batch,
                                           names_batch)
        qf1_pi, qf2_pi = self.critic(leg_starts_batch, state_batch, pi,
                                     names_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        self.parameter_update()
        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()
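# Hypothetical entry point for the Trainer above; `settings` and `p`
# (hyperparameters) are assumed to come from the project's config modules,
# so this is only a usage sketch.
if __name__ == "__main__":
    trainer = Trainer()
    trainer.start()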
def train(episodes, player, opponent):
    # epsilon, policy, env and the hyperparameters below are module-level
    # globals; epsilon is reassigned at the end of each episode, so it must
    # be declared global here
    global epsilon

    # Target network starts as a copy of the online policy
    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())
    target_dqn.to(player.train_device)  # keep it on the same device as the batches

    # Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    # Updates
    update_counter = 0

    # Memory initialisation:
    # take random actions to fill the replay memory
    memory = Memory(memory_size, batch_size)
    for i in range(memory_size):
        if i == 0:
            obs = env.reset()
            state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        action1 = random.randint(0, 3)
        action2 = random.randint(0, 3)
        next_obs, rewards, done, info = env.step((action1, action2))
        next_state, stacked_frames = stack_frame(stacked_frames, next_obs[0])
        memory.store((state, action1, rewards[0], next_state, done))
        state = next_state

    player.reset_score()
    opponent.reset_score()

    # Training
    for i in range(episodes):
        done = False
        obs = env.reset()
        state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        timesteps = 0
        reward_sum = 0
        while not done:
            action1 = player.get_action(state, epsilon)
            action2 = opponent.get_action()
            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames,
                                                     next_obs[0])
            memory.store((state, action1, rewards[0], next_state, done))
            reward_sum += rewards[0]
            obs = next_obs
            state = next_state
            env.render()

            # Updating policy: sample a minibatch from memory
            samples = memory.sample()
            batch_states = np.asarray([x[0] for x in samples])
            batch_actions = np.asarray([x[1] for x in samples])
            batch_rewards = np.asarray([x[2] for x in samples])
            batch_next_states = np.asarray([x[3] for x in samples])
            batch_done = np.asarray([x[4] for x in samples])

            # Target network Q-values for the next states
            batch = torch.from_numpy(batch_next_states.squeeze()).float().to(
                player.train_device)
            batch_t_q_values = target_dqn.forward(batch)

            # Q-learning targets: r for terminal transitions,
            # r + gamma * max_a' Q'(s', a') otherwise
            batch_t_q_max, _ = batch_t_q_values.max(dim=1)
            y = torch.empty(batch_size, 1, device=player.train_device)
            batch_rewards = torch.from_numpy(batch_rewards).float().to(
                player.train_device)
            for j in range(batch_size):
                if batch_done[j]:
                    y[j] = batch_rewards[j]
                else:
                    y[j] = batch_rewards[j] + batch_t_q_max[j].mul(gamma)
            y = y.detach()  # was a no-op: detach() is not in-place

            # Gradient descent on the TD error of the actions actually taken
            # (was comparing y against all action values at once)
            batch_q_values = policy.forward(
                torch.from_numpy(batch_states.squeeze()).float().to(
                    player.train_device))
            actions_taken = torch.from_numpy(batch_actions).long().to(
                player.train_device).unsqueeze(1)
            q_taken = batch_q_values.gather(1, actions_taken)
            loss = torch.mean((y - q_taken) ** 2)
            loss.backward()
            player.update_policy()

            # Periodically sync the target network
            update_counter += 1
            if update_counter % update_step == 0:
                target_dqn.load_state_dict(policy.state_dict())

            timesteps += 1

        epsilon = epsilon * decay
        print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".format(
            i, reward_sum, timesteps))
def test(episodes, player, opponent):
    # Evaluation loop. The original snippet begins mid-loop, so the function
    # head and the surrounding episode loop are reconstructed from the
    # training loop above; the greedy action call is an assumption.
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)
    for i in range(episodes):
        done = False
        obs = env.reset()
        state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        reward_sum = 0
        while not done:
            action1 = player.get_action(state, 0)  # greedy: no exploration
            action2 = opponent.get_action()
            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames,
                                                     next_obs[0])
            reward_sum += rewards[0]
            obs = next_obs
            state = next_state
            env.render()


# If no model was passed, train a policy from scratch.
# Otherwise load the policy from the file and go directly to testing.
if args.test is None:
    try:
        train(episodes, player, opponent)
    # Handle Ctrl+C - save model and go to tests
    except KeyboardInterrupt:
        print("Interrupted!")
    model_file = "%s_dqn.mdl" % args.env  # was "%dqn.mdl", a broken format string
    torch.save(policy.state_dict(), model_file)
    print("Model saved to", model_file)
else:
    state_dict = torch.load(args.test)
    policy.load_state_dict(state_dict)

print("Testing...")
test(100, player, opponent)
env.end()
epsilon = 1e-05
env_id = 'Breakout-v0'

envs = [make_env(env_id) for _ in range(n_env)]
# envs = DummyVecEnv(envs)
# envs = SubprocVecEnv(envs)
envs = ShmemVecEnv(envs)
envs = VecToTensor(envs)

# Log episode statistics to a time-stamped monitor file
date = datetime.now().strftime('%m_%d_%H_%M')
mon_file_name = "./tmp/" + date
envs = VecMonitor(envs, mon_file_name)

# Two copies of the network: one for acting, one being trained
train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy.load_state_dict(train_policy.state_dict())
step_policy.eval()

runner = Runner(envs, step_policy, n_step, gamma)
optimizer = optim.RMSprop(train_policy.parameters(), lr=lr, alpha=alpha,
                          eps=epsilon)

for i in tqdm(range(num_updates)):
    mb_obs, mb_rewards, mb_values, mb_actions = runner.run()
    action_logits, values = train_policy(mb_obs)
    # Advantage: n-step returns minus the value baseline
    mb_adv = mb_rewards - mb_values
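    # A minimal sketch of how the rest of the A2C update could look, given
    # the tensors above. The original snippet stops at the advantage
    # computation, so the loss terms and the coefficients (0.5, 0.01) below
    # are assumptions, not the author's code; mb_actions is assumed to be a
    # LongTensor of discrete action ids.
    dist = torch.distributions.Categorical(logits=action_logits)
    log_probs = dist.log_prob(mb_actions)
    policy_loss = -(log_probs * mb_adv.detach()).mean()  # policy gradient term
    value_loss = (values.squeeze(-1) - mb_rewards).pow(2).mean()  # critic regression
    entropy = dist.entropy().mean()  # exploration bonus
    loss = policy_loss + 0.5 * value_loss - 0.01 * entropy

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Keep the acting copy in sync with the freshly updated weights
    step_policy.load_state_dict(train_policy.state_dict())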