Example #1
class TrainDQN:
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.
        Args:
            env: gym.Env where our agent resides
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            max_episode_len: Maximum length of an individual episode
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_fraction: Fraction of max_steps update the target network
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
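        # Op that copies the online Q network's weights into the target network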
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()

        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        tf.summary.scalar('Loss', self.q_network.loss)
        tf.summary.scalar('Mean Estimated Value', tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # exponential epsilon decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                #         print("Episode Length:", ep_len)
                #         print(f"Episode {ep} Reward:{total_reward}")
                #         print(f"Random Action Percent: {rand_actions/ep_len}")
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()

                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq:])

                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")

                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag='Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)

                    # Model saving inspired by Open AI Baseline implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                        print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        # save_path = f"{self.save_path}_model"
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.
        Args:
            observation: observation from the environment
        Returns:
            integer index of the selected action
        """
        pred = self.sess.run([self.q_network.output_pred],
                             feed_dict={self.q_network.input_ph: np.reshape(observation, (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of self.batch_size."""
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1

            # Copy the Q network's parameters into the target network
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')

            # Sample random minibatch of transitions from the replay buffer
            sample = self.buffer.sample(self.batch_size)
            states, action, reward, next_states, done = sample

            # Calculate discounted predictions for the subsequent states using target network
            next_state_pred = self.gamma * self.sess.run(self.target_network.output_pred,
                                                         feed_dict={
                                                             self.target_network.input_ph: next_states}, )

            # Adjust the targets for non-terminal states
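            # Bellman target: y = r + gamma * max_a' Q_target(s', a') for non-terminal s'; y = r for terminal s'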
            reward = reward.reshape(len(reward), 1)
            targets = reward
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                targets[loc] = np.add(
                    targets[loc],
                    max_q[loc].reshape(max_q[loc].shape[0], 1),
                    casting='unsafe')

            # Train the Q network on the minibatch
            _, loss = self.sess.run([self.q_network.opt, self.q_network.loss],
                                    feed_dict={self.q_network.input_ph: states,
                                               self.q_network.target_ph: targets.flatten(),
                                               self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.
        Args:
            path: Location to save the rewards plot. If None, image will be displayed with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
            plt.close('all')
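
The ReplayBuffer class used by TrainDQN is not shown in this example. A minimal sketch of a buffer compatible with the calls made above (ReplayBuffer(capacity=...), add(transition), size(), and sample(batch_size) returning stacked arrays of states, actions, rewards, next states, and done flags) might look like the following; the internal layout is an assumption, not the original implementation.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def add(self, transition):
        # transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def size(self):
        return len(self.memory)

    def sample(self, batch_size):
        # Uniformly sample a minibatch and stack each field into a numpy array
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

The QNetworkBuilder class, which is expected to expose input_ph, target_ph, action_indices_ph, output_pred, loss, opt, and saver, is likewise assumed by this example and not reproduced here.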
Example #2
class NeuralNetworkAgent(Agent):
    def __init__(self,
                 api,
                 network_class,
                 sess,
                 save_path,
                 history_size=15,
                 restore_path=None,
                 verbose=False,
                 train=False,
                 test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # currently 7500 w/ 1000

        # Network
        self.network = network_class(sess,
                                     save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test

        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        if piece_id != 19 and game_state == 1:
            # Train
            if (self.train and self.replay_buffer.size() > 250
                    and not self.test):
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True

                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010

        if not self.placed_move:  # and (random_move >= 0 or self.restart_game > 0):
            # os.system('clear')
            print('--------------')
            is_random = False
            move = None
            if np.random.random() < self.epsilon or not self.training_begun:
                move = np.random.choice(self.possible_moves)
                is_random = True
            else:
                tensor = np.dstack([self.grid] + self.prev_states)
                pred = self.network.predict(tensor)[0]
                move = self.possible_moves[pred]

            if self.restart_game > 0:
                self.api.writeGamepad(0, 3, True)
                self.restart_game -= 1
                move = -2
            else:
                if move >= 0:
                    self.api.writeGamepad(0, move, True)
            self.placed_move = True
            self.show_board = True

            if self.last_move != -2 and piece_id != 19:
                print('Random:', is_random)
                S = self.grid.copy()
                self._update_board(self.api.peekCPU(0x0042))
                board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                n_empty = self._count_empty(self.grid)
                n_holes = self._count_holes(self.grid)
                height = self._count_height(board)
                levelness = self._determine_levelness(board)
                A = self.last_move
                # R  = self._count_total() + self._get_score() - n_empty
                #R = (-50 * height) + (-20 * n_holes) + (self._get_score())
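                # Hand-crafted shaping: reward a low stack, otherwise penalize height, holes, and unevenness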
                if height <= 2:
                    R = 1000
                else:
                    R = -200 * height
                R += -20 * n_holes + 10 * levelness  # 10 * self._get_score()
                SP = self.grid.copy()

                self.prev_states.insert(0, S)

                print(np.dstack(self.prev_states).shape)

                self.replay_buffer.add(
                    np.dstack(self.prev_states), self.possible_moves.index(A),
                    R, np.dstack([SP] + self.prev_states[:self.history_size]))

                self.prev_states = self.prev_states[:self.history_size]

                print(self.epsilon)
                self._print_transition(S, A, board, R)

            self.last_move = move
        else:
            self.placed_move = False

    def _frame_render_finished(self):
        """
        Renders the board and the current piece
        TODO: do this lazily, so we aren't calling read too often O_o
        """

        # To make things easier, we're going to modify the next piece drop
        # Always drop a certain type of block (currently square).
        self.api.writeCPU(0x00bf, 0x0a)

        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return

        # Probably a line clear... Skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """
        Can be used to control the piece being dropped
        """
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        return 'NeuralNetworkAgent'
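
Both examples anneal the exploration rate epsilon toward a small floor, but with different schedules: TrainDQN uses an exponential schedule over environment timesteps, eps = min_eps + (max_eps - min_eps) * exp(eps_decay_rate * t), while NeuralNetworkAgent multiplies epsilon by a constant decay factor after each training step and clips it at 0.01. The small standalone sketch below evaluates both schedules with the default parameters from the examples; the helper names are hypothetical.

import numpy as np


def exponential_eps(t, max_eps=1.0, min_eps=0.1, decay_rate=-0.0001):
    # Schedule from Example #1: smooth exponential decay toward min_eps over timesteps t
    return min_eps + (max_eps - min_eps) * np.exp(decay_rate * t)


def multiplicative_eps(n_train_steps, start=1.0, decay=0.999, floor=0.01):
    # Schedule from Example #2: epsilon shrinks by a constant factor per training step
    return max(start * decay ** n_train_steps, floor)


for step in (0, 1000, 10000, 100000):
    print(step, exponential_eps(step), multiplicative_eps(step))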