Example #1
    def __init__(self, config, device, folder):
        self.device = device
        self.folder = folder
        self.config = config

        self.eval_env = NormalizedActions(gym.make(**config["GAME"]))

        self.nfig = 1
        self.nfig_actor = 1
Example #2
    def __init__(self, device, folder, config):

        self.folder = folder
        self.config = config
        self.device = device
        self.memory = ReplayMemory(self.config['MEMORY_CAPACITY'])
        self.eval_env = NormalizedActions(gym.make(**self.config['GAME']))
        self.continuous = bool(self.eval_env.action_space.shape)

        self.state_size = self.eval_env.observation_space.shape[0]
        if self.continuous:
            self.action_size = self.eval_env.action_space.shape[0]
        else:
            self.action_size = self.eval_env.action_space.n

        self.display_available = 'DISPLAY' in os.environ
Example #3
import itertools

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection on older Matplotlib)

# NormalizedActions is a project-specific gym wrapper defined elsewhere in the repo.


class Plotter:
    def __init__(self, config, device, folder):
        self.device = device
        self.folder = folder
        self.config = config

        self.eval_env = NormalizedActions(gym.make(**config["GAME"]))
        # Action dimension of the (continuous) action space, used by plot_soft_Q_2D
        self.action_size = self.eval_env.action_space.shape[0]

        self.nfig = 1
        self.nfig_actor = 1

    def plot_soft_actor_1D(self, soft_actor, pause=False, size=25):
        ss = torch.linspace(-1, 1, size).unsqueeze(1).to(self.device)
        mu, sigma = soft_actor.get_mu_sig(ss)
        # Move to CPU / NumPy before plotting
        mu = mu.squeeze().detach().cpu().numpy()
        sigma = sigma.squeeze().detach().cpu().numpy()
        ss = ss.cpu().numpy()

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(211)
        ax.set_title(f"$tanh(\mu)$")
        ax.plot(ss, np.tanh(mu))
        ax.set_xlabel('State')
        ax.set_ylabel('$tanh(\mu)$')
        ax.set_ylim(-1.05, 1.05)
        ax = fig.add_subplot(212)
        ax.set_title(f"$\sigma$")
        ax.plot(ss, sigma)
        ax.set_xlabel('State')
        ax.set_ylabel('$\sigma$')
        ax.set_ylim(-0.05, 2.05)
        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Actor{self.nfig_actor:0>3}.jpg')
        plt.close()

        self.nfig_actor += 1

    def plot_actor_1D(self, actor, pause=False, size=25):
        ss = torch.linspace(-1, 1, size).unsqueeze(1).to(self.device)
        a = actor(ss).detach().cpu().numpy()
        ss = ss.cpu().numpy()

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)
        ax.set_title(f"Action as a function of the state")
        ax.plot(ss, a)
        ax.set_xlabel('State')
        ax.set_ylabel('Action')
        ax.set_ylim(-1.05, 1.05)
        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Actor{self.nfig_actor:0>3}.jpg')
        plt.close()

        self.nfig_actor += 1

    def plot_Q_1D(self, Qnet, pause=False, size=25):
        if not hasattr(self, 'xx'):
            x, y = np.linspace(-1, 1, size), np.linspace(-1, 1, size)
            self.xx, self.yy = np.meshgrid(x, y)

            self.s = torch.FloatTensor(x).unsqueeze(1).to(self.device)
            self.a = torch.FloatTensor(y).unsqueeze(1).to(self.device)

        Qsa = np.zeros((size, size))
        with torch.no_grad():
            for i in range(size):
                for j in range(size):
                    Qsa[j, i] = Qnet(self.s[i], self.a[j]).item()

        self.in_plot = True
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.plot_surface(self.xx, self.yy, Qsa)
        ax.set_title('Q*-value in state-action space')
        ax.set_xlabel('Position')
        ax.set_ylabel('Action')
        ax.set_zlabel('Q')
        # ax.set_zlim(-0.05, 1.05)
        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Q1_{self.nfig:0>3}.jpg')
        plt.close()
        self.in_plot = False

        self.nfig += 1

    def plot_soft_Q_2D(self, Qnet, soft_actor, pause=False, size=25):
        state = self.eval_env.reset()
        states = [state]
        done = False
        steps = 0
        while not done and steps < self.config['MAX_STEPS']:
            state, r, done, _ = self.eval_env.step(
                soft_actor.select_action(state))
            states.append(state)
            if pause:
                self.eval_env.render()
            steps += 1
        self.eval_env.close()

        if not hasattr(self, 'xx'):
            x, y = np.linspace(-1, 1, size), np.linspace(-1, 1, size)
            self.xx, self.yy = np.meshgrid(x, y)
            self.s = torch.FloatTensor(list(itertools.product(x, y))).to(
                self.device)

        with torch.no_grad():
            a, _ = soft_actor(self.s)
            Qsa = Qnet(self.s, a)

        Qsa = Qsa.cpu().numpy().reshape(size, size, order='F')
        a = a.cpu().numpy().reshape(size, size, self.action_size, order='F')

        states = np.array(states)
        with torch.no_grad():
            s = torch.FloatTensor(states).to(self.device)
            aa, _ = soft_actor(s)
            Qsa_states = Qnet(s, aa)
        Qsa_states = Qsa_states.cpu().numpy().squeeze()

        self.in_plot = True
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        # ax.plot_surface(self.xx, self.yy, Qsa)
        ax.quiver(self.xx,
                  self.yy,
                  Qsa,
                  a[:, :, 0],
                  a[:, :, 1],
                  0,
                  length=0.05,
                  normalize=True,
                  arrow_length_ratio=0.35)
        ax.plot(states[:, 0], states[:, 1], Qsa_states, c='red')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel(r'Q(s, $\pi$(s))')
        # ax.set_zlim(0, 1)
        if pause:
            plt.show()
        else:
            plt.savefig(self.folder + f'/Q{self.nfig:0>3}.jpg')
        plt.close()
        self.in_plot = False

        self.nfig += 1
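
A minimal usage sketch for the Plotter above on a 1-D control task. The ToyActor/ToyCritic stand-in networks, the environment id and the output folder are illustrative assumptions, not code from the original repo; only the config keys GAME and MAX_STEPS come from the examples.

import os
import torch
import torch.nn as nn

class ToyActor(nn.Module):
    """Stand-in deterministic policy: 1-D state -> action in [-1, 1]."""
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(1, 1)

    def forward(self, state):
        return torch.tanh(self.net(state))

class ToyCritic(nn.Module):
    """Stand-in Q-network: (state, action) -> scalar value."""
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(2, 1)

    def forward(self, state, action):
        return self.net(torch.cat([state, action], dim=-1))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = {"GAME": {"id": "Pendulum-v0"}, "MAX_STEPS": 200}  # assumed config content
folder = 'results/run_001'                                  # assumed output folder
os.makedirs(folder, exist_ok=True)

plotter = Plotter(config, device, folder)
plotter.plot_actor_1D(ToyActor().to(device))   # writes Actor001.jpg into the folder
plotter.plot_Q_1D(ToyCritic().to(device))      # writes Q1_001.jpg into the folder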
Example #4
import os
from abc import ABC, abstractmethod

import gym
import imageio
import torch

# ReplayMemory and NormalizedActions are project-specific helpers defined elsewhere in the repo.


class AbstractAgent(ABC):

    def __init__(self, device, folder, config):

        self.folder = folder
        self.config = config
        self.device = device
        self.memory = ReplayMemory(self.config['MEMORY_CAPACITY'])
        self.eval_env = NormalizedActions(gym.make(**self.config['GAME']))
        self.continuous = bool(self.eval_env.action_space.shape)

        self.state_size = self.eval_env.observation_space.shape[0]
        if self.continuous:
            self.action_size = self.eval_env.action_space.shape[0]
        else:
            self.action_size = self.eval_env.action_space.n

        self.display_available = 'DISPLAY' in os.environ

    @abstractmethod
    def select_action(self, state, episode=None, evaluation=False):
        pass

    def get_batch(self):

        transitions = self.memory.sample(self.config['BATCH_SIZE'])
        batch = list(zip(*transitions))

        # Divide memory into different tensors
        states = torch.FloatTensor(batch[0]).to(self.device)
        if self.continuous:
            actions = torch.FloatTensor(batch[1]).to(self.device)
        else:
            actions = torch.LongTensor(batch[1]).to(self.device)
        rewards = torch.FloatTensor(batch[2]).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(batch[3]).to(self.device)
        done = torch.FloatTensor(batch[4]).unsqueeze(1).to(self.device)

        return states, actions, rewards, next_states, done

    @abstractmethod
    def optimize(self):
        pass

    def evaluate(self, n_ep=10, render=False, gif=False):
        rewards = []
        if gif:
            writer = imageio.get_writer(self.folder + '/results.gif', duration=0.005)
        render = render and self.display_available

        try:
            for i in range(n_ep):
                state = self.eval_env.reset()
                reward = 0
                done = False
                steps = 0
                while not done and steps < self.config['MAX_STEPS']:
                    action = self.select_action(state, evaluation=True)
                    state, r, done, _ = self.eval_env.step(action)
                    if render:
                        self.eval_env.render()
                    if i == 0 and gif:
                        writer.append_data(self.eval_env.render(mode='rgb_array'))
                    reward += r
                    steps += 1
                rewards.append(reward)

        except KeyboardInterrupt:
            if not render:
                raise

        finally:
            self.eval_env.close()
            if gif:
                print(f"Saved gif in {self.folder+'/results.gif'}")
                writer.close()

        score = sum(rewards)/len(rewards) if rewards else 0
        return score

    @abstractmethod
    def save(self):
        pass

    @abstractmethod
    def load(self, folder=None):
        pass
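
To make the contract of AbstractAgent concrete, here is a minimal hypothetical subclass: it acts at random and skips learning, which is enough to show which methods a real agent has to provide. The class name and its behaviour are illustrative assumptions, not code from the original repo.

import numpy as np

class RandomAgent(AbstractAgent):
    """Minimal concrete agent: random actions, no learning (illustrative only)."""

    def select_action(self, state, episode=None, evaluation=False):
        # Uniform action in [-1, 1] for continuous spaces, random index otherwise
        if self.continuous:
            return np.random.uniform(-1, 1, self.action_size)
        return np.random.randint(self.action_size)

    def optimize(self):
        # A real agent would sample transitions via self.get_batch()
        # and update its networks here, returning the losses.
        return None

    def save(self):
        pass

    def load(self, folder=None):
        pass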
Example #5
import signal
import time

import gym
import matplotlib.pyplot as plt
import torch
from tqdm import trange

# load_config, create_folder and NormalizedActions are project-specific helpers
# defined elsewhere in the repo.


def train(Agent, args):
    config = load_config(f'agents/{args.agent}/config.yaml')

    game = config['GAME']['id'].split('-')[0]
    folder = create_folder(args.agent, game, config)

    if args.load:
        config = load_config(f'{folder}/config.yaml')

    if args.gpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f"\033[91m\033[1mDevice : {device}\nFolder : {folder}\033[0m")

    # Create gym environment and agent
    env = NormalizedActions(gym.make(**config["GAME"]))
    model = Agent(device, folder, config)

    # Load model from a previous run
    if args.load:
        model.load(args.load)

    # Signal to render evaluation during training by pressing CTRL+Z
    def handler(sig, frame):
        model.evaluate(n_ep=1, render=True)
        # model.plot_Q(pause=True)

    signal.signal(signal.SIGTSTP, handler)

    nb_total_steps = 0
    nb_episodes = 0

    print("Starting training...")
    rewards = []
    eval_rewards = []
    lengths = []
    time_beginning = time.time()

    try:
        for episode in trange(config["MAX_EPISODES"]):

            done = False
            step = 0
            episode_reward = 0

            state = env.reset()

            while not done and step < config["MAX_STEPS"]:

                action = model.select_action(state, episode=episode)

                next_state, reward, done, _ = env.step(action)
                episode_reward += reward

                # Save transition into memory
                model.memory.push(state, action, reward, next_state, done)
                state = next_state

                losses = model.optimize()

                step += 1
                nb_total_steps += 1

            rewards.append(episode_reward)
            lengths.append(step)

            if episode % config["FREQ_SAVE"] == 0:
                model.save()

            if episode % config["FREQ_EVAL"] == 0:
                eval_rewards.append(model.evaluate())

                plt.cla()
                plt.title(folder.rsplit('/', 1)[1])
                absc = range(0, len(eval_rewards) * config["FREQ_EVAL"],
                             config["FREQ_EVAL"])
                plt.plot(absc, eval_rewards)
                plt.savefig(f'{folder}/eval_rewards.png')

            if episode % config["FREQ_PLOT"] == 0:

                plt.cla()
                plt.title(folder.rsplit('/', 1)[1])
                plt.plot(rewards)
                plt.savefig(f'{folder}/rewards.png')

                plt.cla()
                plt.title(folder.rsplit('/', 1)[1])
                plt.plot(lengths)
                plt.savefig(f'{folder}/lengths.png')

                plt.close()

            nb_episodes += 1

    except KeyboardInterrupt:
        pass

    finally:
        env.close()
        model.save()

    time_execution = time.time() - time_beginning

    print(
        '---------------------------------------------------\n'
        '---------------------STATS-------------------------\n'
        '---------------------------------------------------\n'
        f'{nb_total_steps} steps and updates of the network done\n'
        f'{nb_episodes} episodes done\n'
        f'Execution time : {round(time_execution, 2)} seconds\n'
        '---------------------------------------------------\n'
        f'Average nb of steps per second : '
        f'{round(nb_total_steps / time_execution, 3)} steps/s\n'
        f'Average duration of one episode : '
        f'{round(time_execution / max(1, nb_episodes), 3)} s\n'
        '---------------------------------------------------')
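
train() only consumes args.agent, args.gpu and args.load, so a command-line entry point along these lines would drive it. The flag names and the RandomAgent placeholder are assumptions for illustration; the original project presumably maps the --agent argument to the corresponding concrete agent class.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train an RL agent')
    parser.add_argument('--agent', required=True,
                        help='agent name, expects agents/<name>/config.yaml')
    parser.add_argument('--gpu', action='store_true',
                        help='use CUDA if available')
    parser.add_argument('--load', default=None,
                        help='folder of a previous run to resume from')
    args = parser.parse_args()

    # Any concrete AbstractAgent subclass works here, e.g. the RandomAgent sketch above.
    train(RandomAgent, args)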