from abc import ABC, abstractmethod
import itertools
import os
import signal
import time

import gym
import imageio
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm import trange

# ReplayMemory, NormalizedActions, load_config and create_folder are assumed
# to be imported from the repo's own utility modules.


class AbstractAgent(ABC):

    def __init__(self, device, folder, config):
        self.folder = folder
        self.config = config
        self.device = device
        self.memory = ReplayMemory(self.config['MEMORY_CAPACITY'])

        self.eval_env = NormalizedActions(gym.make(**self.config['GAME']))

        # A non-empty action_space.shape means a continuous (Box) action
        # space; a discrete space exposes `.n` instead
        self.continuous = bool(self.eval_env.action_space.shape)
        self.state_size = self.eval_env.observation_space.shape[0]
        if self.continuous:
            self.action_size = self.eval_env.action_space.shape[0]
        else:
            self.action_size = self.eval_env.action_space.n

        # Rendering is only possible when an X display is available
        self.display_available = 'DISPLAY' in os.environ

    @abstractmethod
    def select_action(self, state, episode=None, evaluation=False):
        pass

    def get_batch(self):
        transitions = self.memory.sample(self.config['BATCH_SIZE'])
        batch = list(zip(*transitions))

        # Split the sampled transitions into per-field tensors
        states = torch.FloatTensor(batch[0]).to(self.device)
        if self.continuous:
            actions = torch.FloatTensor(batch[1]).to(self.device)
        else:
            actions = torch.LongTensor(batch[1]).to(self.device)
        rewards = torch.FloatTensor(batch[2]).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(batch[3]).to(self.device)
        done = torch.FloatTensor(batch[4]).unsqueeze(1).to(self.device)

        return states, actions, rewards, next_states, done

    @abstractmethod
    def optimize(self):
        pass

    def evaluate(self, n_ep=10, render=False, gif=False):
        rewards = []
        if gif:
            writer = imageio.get_writer(self.folder + '/results.gif',
                                        duration=0.005)
        render = render and self.display_available

        try:
            for i in range(n_ep):
                state = self.eval_env.reset()
                reward = 0
                done = False
                steps = 0
                while not done and steps < self.config['MAX_STEPS']:
                    action = self.select_action(state, evaluation=True)
                    state, r, done, _ = self.eval_env.step(action)
                    if render:
                        self.eval_env.render()
                    if i == 0 and gif:
                        writer.append_data(self.eval_env.render(mode='rgb_array'))
                    reward += r
                    steps += 1
                rewards.append(reward)

        except KeyboardInterrupt:
            if not render:
                raise

        finally:
            self.eval_env.close()
            if gif:
                print(f"Saved gif in {self.folder + '/results.gif'}")
                writer.close()

        score = sum(rewards) / len(rewards) if rewards else 0
        return score

    @abstractmethod
    def save(self):
        pass

    @abstractmethod
    def load(self, folder=None):
        pass
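# --- Illustrative example (not part of the repo) -----------------------------
# A minimal sketch of how a concrete agent might subclass AbstractAgent.
# The class name RandomAgent and its no-op optimize/save/load are assumptions
# made for illustration; the repo's real agents hold networks and optimizers
# and implement actual updates here.
class RandomAgent(AbstractAgent):

    def select_action(self, state, episode=None, evaluation=False):
        # Ignore the state and sample uniformly from the action space
        return self.eval_env.action_space.sample()

    def optimize(self):
        # A real agent would draw a batch with self.get_batch() and update
        # its networks here; this sketch performs no learning
        return None

    def save(self):
        pass

    def load(self, folder=None):
        pass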
class Plotter:

    def __init__(self, config, device, folder):
        self.device = device
        self.folder = folder
        self.config = config
        self.eval_env = NormalizedActions(gym.make(**config["GAME"]))
        # Needed by plot_soft_Q_2D (assumes a continuous Box action space);
        # it was never set in the original __init__
        self.action_size = self.eval_env.action_space.shape[0]
        self.nfig = 1
        self.nfig_actor = 1

    def plot_soft_actor_1D(self, soft_actor, pause=False, size=25):
        ss = torch.linspace(-1, 1, size).unsqueeze(1).to(self.device)
        mu, sigma = soft_actor.get_mu_sig(ss)
        # Detach and move to CPU so NumPy/matplotlib can consume the tensors
        mu, sigma = mu.squeeze().detach().cpu(), sigma.squeeze().detach().cpu()
        ss = ss.cpu().numpy()

        fig = plt.figure(figsize=(10, 10))

        ax = fig.add_subplot(211)
        ax.set_title(r"$\tanh(\mu)$")
        ax.plot(ss, np.tanh(mu))
        ax.set_xlabel('State')
        ax.set_ylabel(r'$\tanh(\mu)$')
        ax.set_ylim(-1.05, 1.05)

        ax = fig.add_subplot(212)
        ax.set_title(r"$\sigma$")
        ax.plot(ss, sigma)
        ax.set_xlabel('State')
        ax.set_ylabel(r'$\sigma$')
        ax.set_ylim(-0.05, 2.05)

        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Actor{self.nfig_actor:0>3}.jpg')
        plt.close()
        self.nfig_actor += 1

    def plot_actor_1D(self, actor, pause=False, size=25):
        ss = torch.linspace(-1, 1, size).unsqueeze(1).to(self.device)
        a = actor(ss).detach().cpu().numpy()
        ss = ss.cpu().numpy()

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)
        ax.set_title("Action as a function of the state")
        ax.plot(ss, a)
        ax.set_xlabel('State')
        ax.set_ylabel('Action')
        ax.set_ylim(-1.05, 1.05)

        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Actor{self.nfig_actor:0>3}.jpg')
        plt.close()
        self.nfig_actor += 1

    def plot_Q_1D(self, Qnet, pause=False, size=25):
        if not hasattr(self, 'xx'):
            x, y = np.linspace(-1, 1, size), np.linspace(-1, 1, size)
            self.xx, self.yy = np.meshgrid(x, y)
            self.s = torch.FloatTensor(x).unsqueeze(1).to(self.device)
            self.a = torch.FloatTensor(y).unsqueeze(1).to(self.device)

        # Evaluate Q(s, a) on the cached state-action grid
        Qsa = np.zeros((size, size))
        with torch.no_grad():
            for i in range(size):
                for j in range(size):
                    Qsa[j, i] = Qnet(self.s[i], self.a[j]).detach().cpu().numpy()

        self.in_plot = True
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.plot_surface(self.xx, self.yy, Qsa)
        ax.set_title('Q*-value in state-action space')
        ax.set_xlabel('Position')
        ax.set_ylabel('Action')
        ax.set_zlabel('Q')
        # ax.set_zlim(-0.05, 1.05)

        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Q1_{self.nfig:0>3}.jpg')
        plt.close()
        self.in_plot = False
        self.nfig += 1

    def plot_soft_Q_2D(self, Qnet, soft_actor, pause=False, size=25):
        # Roll out one episode with the current policy so its trajectory can
        # be overlaid on the Q-value surface
        state = self.eval_env.reset()
        states = [state]
        done = False
        steps = 0
        while not done and steps < self.config['MAX_STEPS']:
            state, r, done, _ = self.eval_env.step(
                soft_actor.select_action(state))
            states.append(state)
            if pause:
                self.eval_env.render()
            steps += 1
        self.eval_env.close()

        if not hasattr(self, 'xx'):
            x, y = np.linspace(-1, 1, size), np.linspace(-1, 1, size)
            self.xx, self.yy = np.meshgrid(x, y)
            self.s = torch.FloatTensor(list(itertools.product(x, y))).to(
                self.device)

        with torch.no_grad():
            a, _ = soft_actor(self.s)
            Qsa = Qnet(self.s, a)
        Qsa = Qsa.cpu().numpy().reshape(size, size, order='F')
        a = a.cpu().numpy().reshape(size, size, self.action_size, order='F')

        states = np.array(states)
        with torch.no_grad():
            s = torch.FloatTensor(states).to(self.device)
            aa, _ = soft_actor(s)
            Qsa_states = Qnet(s, aa)
        Qsa_states = Qsa_states.cpu().numpy().squeeze()

        self.in_plot = True
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        # ax.plot_surface(self.xx, self.yy, Qsa)
        ax.quiver(self.xx, self.yy, Qsa, a[:, :, 0], a[:, :, 1], 0,
                  length=0.05, normalize=True, arrow_length_ratio=0.35)
        ax.plot(states[:, 0], states[:, 1], Qsa_states, c='red')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel(r'Q(s, $\pi$(s))')
        # ax.set_zlim(0, 1)

        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Q{self.nfig:0>3}.jpg')
        plt.close()
        self.in_plot = False
        self.nfig += 1
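# --- Illustrative usage (not part of the repo) --------------------------------
# Hedged sketch of how the Plotter might be driven from a training loop on a
# 1D continuous task. The attribute names `agent.actor` and `agent.critic` are
# assumptions; the repo's agents may expose their networks differently.
def plot_checkpoint(plotter, agent, episode, config):
    if episode % config['FREQ_PLOT'] == 0:
        plotter.plot_actor_1D(agent.actor)   # current policy over the state axis
        plotter.plot_Q_1D(agent.critic)      # current Q-value surface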
def train(Agent, args):
    config = load_config(f'agents/{args.agent}/config.yaml')
    game = config['GAME']['id'].split('-')[0]
    folder = create_folder(args.agent, game, config)
    if args.load:
        config = load_config(f'{folder}/config.yaml')

    if args.gpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f"\033[91m\033[1mDevice : {device}\nFolder : {folder}\033[0m")

    # Create gym environment and agent
    env = NormalizedActions(gym.make(**config["GAME"]))
    model = Agent(device, folder, config)

    # Load model from a previous run
    if args.load:
        model.load(args.load)

    # Render an evaluation episode during training by pressing CTRL+Z
    def handler(sig, frame):
        model.evaluate(n_ep=1, render=True)
        # model.plot_Q(pause=True)

    signal.signal(signal.SIGTSTP, handler)

    nb_total_steps = 0
    nb_episodes = 0

    print("Starting training...")
    rewards = []
    eval_rewards = []
    lengths = []
    time_beginning = time.time()

    try:
        for episode in trange(config["MAX_EPISODES"]):

            done = False
            step = 0
            episode_reward = 0
            state = env.reset()

            while not done and step < config["MAX_STEPS"]:
                action = model.select_action(state, episode=episode)
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward

                # Save transition into memory
                model.memory.push(state, action, reward, next_state, done)
                state = next_state

                losses = model.optimize()

                step += 1
                nb_total_steps += 1

            rewards.append(episode_reward)
            lengths.append(step)

            if episode % config["FREQ_SAVE"] == 0:
                model.save()

            if episode % config["FREQ_EVAL"] == 0:
                eval_rewards.append(model.evaluate())
                plt.cla()
                plt.title(folder.rsplit('/', 1)[1])
                absc = range(0, len(eval_rewards) * config["FREQ_EVAL"],
                             config["FREQ_EVAL"])
                plt.plot(absc, eval_rewards)
                plt.savefig(f'{folder}/eval_rewards.png')

            if episode % config["FREQ_PLOT"] == 0:
                plt.cla()
                plt.title(folder.rsplit('/', 1)[1])
                plt.plot(rewards)
                plt.savefig(f'{folder}/rewards.png')

                plt.cla()
                plt.title(folder.rsplit('/', 1)[1])
                plt.plot(lengths)
                plt.savefig(f'{folder}/lengths.png')
                plt.close()

            nb_episodes += 1

    except KeyboardInterrupt:
        pass

    finally:
        env.close()
        model.save()

    time_execution = time.time() - time_beginning

    print('---------------------------------------------------\n'
          '---------------------STATS-------------------------\n'
          '---------------------------------------------------\n',
          nb_total_steps, ' steps and updates of the network done\n',
          nb_episodes, ' episodes done\n'
          'Execution time : ', round(time_execution, 2), ' seconds\n'
          '---------------------------------------------------\n'
          'Average nb of steps per second : ',
          round(nb_total_steps / time_execution, 3), 'steps/s\n'
          'Average duration of one episode : ',
          round(time_execution / max(1, nb_episodes), 3), 's\n'
          '---------------------------------------------------')
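# --- Illustrative entry point (not part of the repo) --------------------------
# Hedged sketch of a command-line driver for train(). The flags mirror the
# attributes train() reads from `args` (agent, gpu, load); the default agent
# name and the use of the RandomAgent sketch above are assumptions only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--agent', default='SAC',
                        help='name of the agent folder under agents/')
    parser.add_argument('--gpu', action='store_true',
                        help='run on CUDA when available')
    parser.add_argument('--load', default=None,
                        help='folder of a previous run to resume from')
    args = parser.parse_args()

    train(RandomAgent, args)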